From b20830b2ed4577a4b48ee0cc01ec2a963380328e Mon Sep 17 00:00:00 2001
From: kee hyun an
Date: Mon, 4 Nov 2024 20:25:03 +0900
Subject: [PATCH 1/7] feat: Runtime output buffer optimization

---
 core/runtime/TRTEngine.cpp                     |  4 +
 core/runtime/TRTEngine.h                       |  4 +
 core/runtime/execute_engine.cpp                | 64 ++++++++++----
 core/runtime/register_jit_hooks.cpp            |  1 +
 .../runtime/_PythonTorchTensorRTModule.py      | 88 +++++++++++--------
 .../dynamo/runtime/_TorchTensorRTModule.py     |  1 +
 6 files changed, 108 insertions(+), 54 deletions(-)

diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp
index 5a5c1ad83d..d24640eae9 100644
--- a/core/runtime/TRTEngine.cpp
+++ b/core/runtime/TRTEngine.cpp
@@ -319,6 +319,10 @@ int64_t TRTEngine::get_automatic_device_memory_budget() {
   return cuda_engine->getWeightStreamingAutomaticBudget();
 }
 
+void TRTEngine::set_pre_allocated_outputs(bool enable) {
+  use_pre_allocated_outputs = enable;
+}
+
 std::string TRTEngine::to_str() const {
   // clang-format off
   std::stringstream ss;
diff --git a/core/runtime/TRTEngine.h b/core/runtime/TRTEngine.h
index 88fb7ab275..950b6edf15 100644
--- a/core/runtime/TRTEngine.h
+++ b/core/runtime/TRTEngine.h
@@ -88,6 +88,7 @@ struct TRTEngine : torch::CustomClassHolder {
   int64_t get_streamable_device_memory_budget();
   int64_t get_automatic_device_memory_budget();
   std::vector infer_outputs(std::vector> input_shapes);
+  void set_pre_allocated_outputs(bool enable);
   friend std::ostream& operator<<(std::ostream& os, const TRTEngine& engine);
 
   static const char BINDING_DELIM = '%';
@@ -102,6 +103,9 @@ struct TRTEngine : torch::CustomClassHolder {
   std::vector input_buffers = {};
   std::vector output_buffers = {};
   std::string shape_key;
+  bool cudagraphs_enabled = false;
+  bool use_pre_allocated_outputs = true;
+  std::vector pre_allocated_outputs;
 
   // TODO: Implement a call method
   // c10::List Run(c10::List inputs);
diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp
index a7908468f4..f7ba509494 100644
--- a/core/runtime/execute_engine.cpp
+++ b/core/runtime/execute_engine.cpp
@@ -5,6 +5,7 @@
 #include "torch/csrc/jit/runtime/custom_operator.h"
 #include "torch/torch.h"
 
+#include
 #include "core/runtime/TRTEngineProfiler.h"
 #include "core/runtime/runtime.h"
 #include "core/util/prelude.h"
@@ -60,9 +61,8 @@ RTDevice select_rt_device(const RTDevice& engine_device, const RTDevice& curr_de
   return new_target_device_opt.value();
 }
 
-bool _cudagraphs_validate_shapes(std::vector inputs, c10::intrusive_ptr compiled_engine) {
-  // Validate whether the current input shapes to the engine
-  // invalidate the existing cudagraphs object
+bool _validate_shapes(std::vector inputs, c10::intrusive_ptr compiled_engine) {
+  // Validate whether the current input shapes to the engine have changed
 
   // Populate the shape key for the inputs
   // x: (3, 4), y: (4, 5) --> Key: (3,4)(4,5)
@@ -83,15 +83,32 @@
   auto new_shape_key = new_shape_key_ss.str();
 
-  // Compare the shape key to the original key and invalidate shapes if they do not match
+  // Compare the shape key to the original key
   if (new_shape_key != compiled_engine->shape_key) {
-    LOG_DEBUG("Resetting Cudagraph on New Shape Key " << new_shape_key);
+    LOG_DEBUG("Input shape changed " << compiled_engine->shape_key << " -> " << new_shape_key);
     compiled_engine->shape_key = new_shape_key;
-    compiled_engine->cudagraph.reset();
-    return false;
+    return true;
  }
 
-  return true;
+  return false;
+}
+
+std::vector create_output_tensors(c10::intrusive_ptr
compiled_engine) { + std::vector outputs(compiled_engine->num_io.second); + for (auto output_indices : compiled_engine->out_binding_map) { + // out_binding_map stores TRT_IDX: PYT_IDX + auto pyt_idx = output_indices.second; + + std::string name = compiled_engine->out_binding_names[pyt_idx]; + auto out_shape = compiled_engine->exec_ctx->getTensorShape(name.c_str()); + LOG_DEBUG("Output Name: " << name << " Shape: " << out_shape); + + auto dims = core::util::toVec(out_shape); + auto type = util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str())); + outputs[pyt_idx] = std::move(at::empty(dims, {at::kCUDA}).to(type).contiguous()); + } + + return outputs; } std::vector execute_engine(std::vector inputs, c10::intrusive_ptr compiled_engine) { @@ -114,10 +131,15 @@ std::vector execute_engine(std::vector inputs, c10::intr compiled_engine->cudagraph.enable_debug_mode(); } + bool shape_changed = _validate_shapes(inputs, compiled_engine); + // Whether cudagraphs needs to record the graph on this pass - bool need_cudagraphs_record = (CUDAGRAPHS_MODE && (!_cudagraphs_validate_shapes(inputs, compiled_engine))); + // Cudagraphs record is required if cudagraphs_enabled is switched to True regardless of shape change + bool need_cudagraphs_record = + (((!compiled_engine->cudagraphs_enabled) && CUDAGRAPHS_MODE) || (CUDAGRAPHS_MODE && shape_changed)); + compiled_engine->cudagraphs_enabled = CUDAGRAPHS_MODE; - if (!CUDAGRAPHS_MODE) { + if (!CUDAGRAPHS_MODE || shape_changed) { compiled_engine->cudagraph.reset(); } @@ -178,6 +200,7 @@ std::vector execute_engine(std::vector inputs, c10::intr { // Input Setup std::unique_ptr input_profiler_guard; + RECORD_FUNCTION("process input", std::vector()); if (compiled_engine->profile_execution) { input_profiler_guard = std::make_unique(compiled_engine->input_profile_path); @@ -259,23 +282,20 @@ std::vector execute_engine(std::vector inputs, c10::intr { // Output Setup std::unique_ptr output_profiler_guard; + RECORD_FUNCTION("process output", std::vector()); if (compiled_engine->profile_execution) { output_profiler_guard = std::make_unique(compiled_engine->output_profile_path); } + if ((false == compiled_engine->use_pre_allocated_outputs) || shape_changed) { + outputs = create_output_tensors(compiled_engine); + } else { + outputs = compiled_engine->pre_allocated_outputs; + } for (auto output_indices : compiled_engine->out_binding_map) { - // out_binding_map stores TRT_IDX: PYT_IDX auto pyt_idx = output_indices.second; - std::string name = compiled_engine->out_binding_names[pyt_idx]; - auto out_shape = compiled_engine->exec_ctx->getTensorShape(name.c_str()); - LOG_DEBUG("Output Name: " << name << " Shape: " << out_shape); - - auto dims = core::util::toVec(out_shape); - auto type = util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str())); - outputs[pyt_idx] = std::move(at::empty(dims, {at::kCUDA}).to(type).contiguous()); - if (need_cudagraphs_record) { // If we are recording the cuda graph then we need to update the persistent output buffer compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone()); @@ -311,6 +331,7 @@ std::vector execute_engine(std::vector inputs, c10::intr std::unique_lock lock(compiled_engine->mu); { // Engine Execution (execute on engine stream) + RECORD_FUNCTION("Trt runtime", std::vector()); c10::cuda::CUDAStreamGuard stream_guard(compiled_engine->engine_stream); std::unique_ptr enqueue_profiler_guard; @@ -345,6 +366,11 @@ std::vector 
execute_engine(std::vector inputs, c10::intr } } // End engine exeuction (resets to caller stream) + // Create output buffer for next execution of graph or trt context. + if (compiled_engine->use_pre_allocated_outputs) { + compiled_engine->pre_allocated_outputs = create_output_tensors(compiled_engine); + } + // Block caller stream until engine execution is complete at::cuda::CUDAEvent trt_exec_complete; trt_exec_complete.record(compiled_engine->engine_stream); diff --git a/core/runtime/register_jit_hooks.cpp b/core/runtime/register_jit_hooks.cpp index 042bf085c8..eade7cdcfa 100644 --- a/core/runtime/register_jit_hooks.cpp +++ b/core/runtime/register_jit_hooks.cpp @@ -88,6 +88,7 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion = .def("dump_engine_layer_info", &TRTEngine::dump_engine_layer_info) .def("get_engine_layer_info", &TRTEngine::get_engine_layer_info) .def("infer_outputs", &TRTEngine::infer_outputs) + .def("set_pre_allocated_outputs", &TRTEngine::set_pre_allocated_outputs) .def_property( "device_memory_budget", &TRTEngine::get_device_memory_budget, diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py index e31d73f337..17a38c716d 100644 --- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py @@ -107,6 +107,9 @@ def __init__( self.engine = None self.weight_name_map = weight_name_map self.target_platform = Platform.current_platform() + self.cudagraphs_enabled = False + self.pre_allocated_outputs: List[torch.Tensor] = [] + self.use_pre_allocated_outputs = False if self.serialized_engine is not None and not self.settings.lazy_engine_init: self.setup_engine() @@ -171,7 +174,7 @@ def setup_engine(self) -> None: self.engine.get_tensor_shape(input_name) for input_name in self.input_names ] self.output_dtypes = [ - dtype._from(self.engine.get_tensor_dtype(output_name)) + dtype._from(self.engine.get_tensor_dtype(output_name)).to(torch.dtype) for output_name in self.output_names ] self.output_shapes = [ @@ -232,6 +235,19 @@ def __del__(self) -> None: if self.cudagraph: self.cudagraph.reset() + def create_output_tensors(self) -> List[torch.Tensor]: + # create output tensors + outputs: List[torch.Tensor] = [] + + for o, _ in enumerate(self.output_names): + output = torch.empty( + size=self.output_shapes[o], + dtype=self.output_dtypes[o], + device=torch.cuda.current_device(), + ) + outputs.append(output) + return outputs + def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, ...]: # Ensure inputs are available in all scopes and cast symbolic integers to Tensors contiguous_inputs: List[torch.Tensor] = [ @@ -247,11 +263,17 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . 
self._check_initialized() cudagraphs_enabled = torch_tensorrt.runtime.get_cudagraphs_mode() - need_cudagraphs_record = ( - cudagraphs_enabled and not self.cudagraphs_validate_shapes(inputs) - ) + shape_changed = self.validate_input_shapes(inputs) + # Cudagraphs record is required if cudagraphs_enabled is toggled to True regardless of shape change + if not self.cudagraphs_enabled and cudagraphs_enabled: + need_cudagraphs_record = True + else: + need_cudagraphs_record = cudagraphs_enabled and shape_changed + self.cudagraphs_enabled = cudagraphs_enabled if need_cudagraphs_record: + if self.cudagraph: + self.cudagraph.reset() self._input_buffers = [None] * len(self.input_names) self._output_buffers = [None] * len(self.output_names) @@ -259,7 +281,7 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . self.cudagraph.reset() self.cudagraph = None - # If in safe mode, check at each iteration for for whether a switch is required + # If in safe mode, check at each iteration for whether a switch is required if ( torch_tensorrt.runtime._multi_device_safe_mode._PY_RT_MULTI_DEVICE_SAFE_MODE ): @@ -350,14 +372,14 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . self.context.set_tensor_address( input_name, contiguous_inputs[i].data_ptr() ) - - # Check if input shapes can be inferred. - uninferred_input_names = self.context.infer_shapes() - if uninferred_input_names: - logger.warning( - f"The shapes of the inputs: {uninferred_input_names} cannot be inferred and could lead to undefined behavior. \ - This could happen if the input tensor addresses/shapes haven't been configured correctly" - ) + if shape_changed: + # Check if input shapes can be inferred. + uninferred_input_names = self.context.infer_shapes() + if uninferred_input_names: + logger.warning( + f"The shapes of the inputs: {uninferred_input_names} cannot be inferred and could lead to undefined behavior. \ + This could happen if the input tensor addresses/shapes haven't been configured correctly" + ) with ( torch.autograd.profiler.record_function( @@ -366,24 +388,20 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . if self.profiling_enabled else nullcontext() ): - # create output tensors - outputs: List[torch.Tensor] = [] - - for o, output_name in enumerate(self.output_names): - shape = tuple(self.context.get_tensor_shape(output_name)) - - if DYNAMIC_DIM in shape: + if not self.use_pre_allocated_outputs or shape_changed: + self.output_shapes = [ + tuple(self.context.get_tensor_shape(output_name)) + for output_name in self.output_names + ] + if DYNAMIC_DIM in self.output_shapes: raise ValueError( "Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes which is not currently supported." ) + outputs = self.create_output_tensors() + else: + outputs = self.pre_allocated_outputs - output = torch.empty( - size=shape, - dtype=self.output_dtypes[o].to(torch.dtype), - device=torch.cuda.current_device(), - ) - - outputs.append(output) + for o, output_name in enumerate(self.output_names): if need_cudagraphs_record: self._output_buffers[o] = outputs[o].clone() @@ -444,6 +462,9 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . 
self._caller_stream.wait_stream(self._engine_stream) + if self.use_pre_allocated_outputs: + self.pre_allocated_outputs = self.create_output_tensors() + if cudagraphs_enabled: for idx, o in enumerate(outputs): o.copy_(self._output_buffers[idx]) @@ -485,10 +506,9 @@ def get_layer_info(self) -> str: ) return engine_json - def cudagraphs_validate_shapes(self, inputs: Sequence[torch.Tensor]) -> bool: + def validate_input_shapes(self, inputs: Sequence[torch.Tensor]) -> bool: """ - Validates the input shapes of the forward function - versus the version currently active for the + Validates the input shapes of the forward function has changed """ # Representation of input shapes to a given model # Shapes are concatenated as so: @@ -498,10 +518,8 @@ def cudagraphs_validate_shapes(self, inputs: Sequence[torch.Tensor]) -> bool: # If the new shape key differs from the existing one, # invalidate the old shape key and remove the CUDAGraph if new_shape_key != self.shape_key: - logger.debug(f"Resetting Cudagraph on new shape key {new_shape_key}") + logger.debug(f"Input shape changed {self.shape_key} -> {new_shape_key}") self.shape_key = new_shape_key - if self.cudagraph: - self.cudagraph.reset() - return False + return True - return True + return False diff --git a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py index 1bebe20fda..99f863f1da 100644 --- a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py @@ -207,6 +207,7 @@ def setup_engine(self) -> None: if self.engine is not None: return self.engine = torch.classes.tensorrt.Engine(self._pack_engine_info()) + self.engine.set_pre_allocated_outputs(True) def encode_metadata(self, metadata: Any) -> str: metadata = copy.deepcopy(metadata) From 998c0c61d1e8aa9dcdc9b1939612c843d9e122ef Mon Sep 17 00:00:00 2001 From: kee hyun an Date: Tue, 5 Nov 2024 09:31:38 +0900 Subject: [PATCH 2/7] chore: setting for test --- core/runtime/TRTEngine.cpp | 4 ---- core/runtime/execute_engine.cpp | 4 ---- core/runtime/register_jit_hooks.cpp | 2 +- .../dynamo/runtime/_PythonTorchTensorRTModule.py | 5 ++++- py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py | 5 ++++- 5 files changed, 9 insertions(+), 11 deletions(-) diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp index d24640eae9..5a5c1ad83d 100644 --- a/core/runtime/TRTEngine.cpp +++ b/core/runtime/TRTEngine.cpp @@ -319,10 +319,6 @@ int64_t TRTEngine::get_automatic_device_memory_budget() { return cuda_engine->getWeightStreamingAutomaticBudget(); } -void TRTEngine::set_pre_allocated_outputs(bool enable) { - use_pre_allocated_outputs = enable; -} - std::string TRTEngine::to_str() const { // clang-format off std::stringstream ss; diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp index f7ba509494..682d56ab3d 100644 --- a/core/runtime/execute_engine.cpp +++ b/core/runtime/execute_engine.cpp @@ -5,7 +5,6 @@ #include "torch/csrc/jit/runtime/custom_operator.h" #include "torch/torch.h" -#include #include "core/runtime/TRTEngineProfiler.h" #include "core/runtime/runtime.h" #include "core/util/prelude.h" @@ -200,7 +199,6 @@ std::vector execute_engine(std::vector inputs, c10::intr { // Input Setup std::unique_ptr input_profiler_guard; - RECORD_FUNCTION("process input", std::vector()); if (compiled_engine->profile_execution) { input_profiler_guard = std::make_unique(compiled_engine->input_profile_path); @@ -282,7 +280,6 @@ std::vector 
execute_engine(std::vector inputs, c10::intr { // Output Setup std::unique_ptr output_profiler_guard; - RECORD_FUNCTION("process output", std::vector()); if (compiled_engine->profile_execution) { output_profiler_guard = std::make_unique(compiled_engine->output_profile_path); @@ -331,7 +328,6 @@ std::vector execute_engine(std::vector inputs, c10::intr std::unique_lock lock(compiled_engine->mu); { // Engine Execution (execute on engine stream) - RECORD_FUNCTION("Trt runtime", std::vector()); c10::cuda::CUDAStreamGuard stream_guard(compiled_engine->engine_stream); std::unique_ptr enqueue_profiler_guard; diff --git a/core/runtime/register_jit_hooks.cpp b/core/runtime/register_jit_hooks.cpp index eade7cdcfa..e5edcf9729 100644 --- a/core/runtime/register_jit_hooks.cpp +++ b/core/runtime/register_jit_hooks.cpp @@ -88,7 +88,7 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion = .def("dump_engine_layer_info", &TRTEngine::dump_engine_layer_info) .def("get_engine_layer_info", &TRTEngine::get_engine_layer_info) .def("infer_outputs", &TRTEngine::infer_outputs) - .def("set_pre_allocated_outputs", &TRTEngine::set_pre_allocated_outputs) + .def_readwrite("use_pre_allocated_outputs", &TRTEngine::use_pre_allocated_outputs) .def_property( "device_memory_budget", &TRTEngine::get_device_memory_budget, diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py index 17a38c716d..afb67d1165 100644 --- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py @@ -109,7 +109,7 @@ def __init__( self.target_platform = Platform.current_platform() self.cudagraphs_enabled = False self.pre_allocated_outputs: List[torch.Tensor] = [] - self.use_pre_allocated_outputs = False + self.use_pre_allocated_outputs = True if self.serialized_engine is not None and not self.settings.lazy_engine_init: self.setup_engine() @@ -248,6 +248,9 @@ def create_output_tensors(self) -> List[torch.Tensor]: outputs.append(output) return outputs + def set_output_opt(self, enable: bool) -> None: + self.use_pre_allocated_outputs = enable + def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, ...]: # Ensure inputs are available in all scopes and cast symbolic integers to Tensors contiguous_inputs: List[torch.Tensor] = [ diff --git a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py index 99f863f1da..b3ec3258f0 100644 --- a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py @@ -207,7 +207,7 @@ def setup_engine(self) -> None: if self.engine is not None: return self.engine = torch.classes.tensorrt.Engine(self._pack_engine_info()) - self.engine.set_pre_allocated_outputs(True) + self.set_output_opt(True) def encode_metadata(self, metadata: Any) -> str: metadata = copy.deepcopy(metadata) @@ -272,6 +272,9 @@ def set_extra_state(self, state: SerializedTorchTensorRTModuleFmt) -> None: self.input_binding_names = state[2] self.output_binding_names = state[3] + def set_output_opt(self, enable: bool) -> None: + self.engine.use_pre_allocated_outputs = enable + def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]: """Implementation of the forward pass for a TensorRT engine From 210ae8b20c810c69409ee37393acbdea61989bb6 Mon Sep 17 00:00:00 2001 From: kee hyun an Date: Thu, 14 Nov 2024 18:25:46 +0900 Subject: [PATCH 3/7] chore: 
Initialize shape key as non-empty string to validate no input tensor --- core/runtime/TRTEngine.h | 2 +- .../dynamo/runtime/_PythonTorchTensorRTModule.py | 2 +- py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/core/runtime/TRTEngine.h b/core/runtime/TRTEngine.h index 950b6edf15..947ce7934d 100644 --- a/core/runtime/TRTEngine.h +++ b/core/runtime/TRTEngine.h @@ -102,7 +102,7 @@ struct TRTEngine : torch::CustomClassHolder { at::cuda::CUDAStream caller_stream = c10::cuda::getDefaultCUDAStream(); std::vector input_buffers = {}; std::vector output_buffers = {}; - std::string shape_key; + std::string shape_key = "None"; bool cudagraphs_enabled = false; bool use_pre_allocated_outputs = true; std::vector pre_allocated_outputs; diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py index afb67d1165..c856ff3be6 100644 --- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py @@ -248,7 +248,7 @@ def create_output_tensors(self) -> List[torch.Tensor]: outputs.append(output) return outputs - def set_output_opt(self, enable: bool) -> None: + def set_pre_allocated_outputs(self, enable: bool) -> None: self.use_pre_allocated_outputs = enable def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, ...]: diff --git a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py index b3ec3258f0..1c9ef0a9c5 100644 --- a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py @@ -207,7 +207,7 @@ def setup_engine(self) -> None: if self.engine is not None: return self.engine = torch.classes.tensorrt.Engine(self._pack_engine_info()) - self.set_output_opt(True) + self.set_pre_allocated_outputs(False) def encode_metadata(self, metadata: Any) -> str: metadata = copy.deepcopy(metadata) @@ -272,7 +272,7 @@ def set_extra_state(self, state: SerializedTorchTensorRTModuleFmt) -> None: self.input_binding_names = state[2] self.output_binding_names = state[3] - def set_output_opt(self, enable: bool) -> None: + def set_pre_allocated_outputs(self, enable: bool) -> None: self.engine.use_pre_allocated_outputs = enable def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]: From 4a5f0d1280f2ebcfdce6d51fc737936869307465 Mon Sep 17 00:00:00 2001 From: kee hyun an Date: Mon, 18 Nov 2024 21:19:14 +0900 Subject: [PATCH 4/7] chore: rebase and rename variable --- core/runtime/TRTEngine.h | 2 +- core/runtime/execute_engine.cpp | 4 ++-- .../dynamo/runtime/_PythonTorchTensorRTModule.py | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/core/runtime/TRTEngine.h b/core/runtime/TRTEngine.h index 947ce7934d..f0e427c7da 100644 --- a/core/runtime/TRTEngine.h +++ b/core/runtime/TRTEngine.h @@ -103,7 +103,7 @@ struct TRTEngine : torch::CustomClassHolder { std::vector input_buffers = {}; std::vector output_buffers = {}; std::string shape_key = "None"; - bool cudagraphs_enabled = false; + bool prev_cudagraphs_enabled = false; bool use_pre_allocated_outputs = true; std::vector pre_allocated_outputs; diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp index 682d56ab3d..c6f9dfb473 100644 --- a/core/runtime/execute_engine.cpp +++ b/core/runtime/execute_engine.cpp @@ -135,8 +135,8 @@ std::vector 
execute_engine(std::vector inputs, c10::intr // Whether cudagraphs needs to record the graph on this pass // Cudagraphs record is required if cudagraphs_enabled is switched to True regardless of shape change bool need_cudagraphs_record = - (((!compiled_engine->cudagraphs_enabled) && CUDAGRAPHS_MODE) || (CUDAGRAPHS_MODE && shape_changed)); - compiled_engine->cudagraphs_enabled = CUDAGRAPHS_MODE; + (((!compiled_engine->prev_cudagraphs_enabled) && CUDAGRAPHS_MODE) || (CUDAGRAPHS_MODE && shape_changed)); + compiled_engine->prev_cudagraphs_enabled = CUDAGRAPHS_MODE; if (!CUDAGRAPHS_MODE || shape_changed) { compiled_engine->cudagraph.reset(); diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py index c856ff3be6..776b919a19 100644 --- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py @@ -107,7 +107,7 @@ def __init__( self.engine = None self.weight_name_map = weight_name_map self.target_platform = Platform.current_platform() - self.cudagraphs_enabled = False + self.prev_cudagraphs_enabled = False self.pre_allocated_outputs: List[torch.Tensor] = [] self.use_pre_allocated_outputs = True @@ -268,11 +268,11 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . cudagraphs_enabled = torch_tensorrt.runtime.get_cudagraphs_mode() shape_changed = self.validate_input_shapes(inputs) # Cudagraphs record is required if cudagraphs_enabled is toggled to True regardless of shape change - if not self.cudagraphs_enabled and cudagraphs_enabled: + if not self.prev_cudagraphs_enabled and cudagraphs_enabled: need_cudagraphs_record = True else: need_cudagraphs_record = cudagraphs_enabled and shape_changed - self.cudagraphs_enabled = cudagraphs_enabled + self.prev_cudagraphs_enabled = cudagraphs_enabled if need_cudagraphs_record: if self.cudagraph: From f480353184d92f8784c85258b2b8878b8625fe42 Mon Sep 17 00:00:00 2001 From: kee hyun an Date: Tue, 19 Nov 2024 10:03:52 +0900 Subject: [PATCH 5/7] chore: Functionalize inputs setup --- core/runtime/execute_engine.cpp | 139 +++++++++--------- .../runtime/_PythonTorchTensorRTModule.py | 104 +++++++------ .../dynamo/runtime/test_002_cudagraphs_cpp.py | 3 + .../dynamo/runtime/test_002_cudagraphs_py.py | 4 + 4 files changed, 135 insertions(+), 115 deletions(-) diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp index c6f9dfb473..8c7ee4352d 100644 --- a/core/runtime/execute_engine.cpp +++ b/core/runtime/execute_engine.cpp @@ -91,7 +91,77 @@ bool _validate_shapes(std::vector inputs, c10::intrusive_ptr inputs, + c10::intrusive_ptr compiled_engine, + bool need_cudagraphs_record) { + // this is a buffer to store shape tensor input addresses throughout the runtime scope + std::list> inputShapeTensorValues; + std::list formatted_inputs(compiled_engine->num_io.first); + + for (size_t i = 0; i < inputs.size(); i++) { + std::string name = compiled_engine->in_binding_names[i]; + + TORCHTRT_CHECK( + inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device()); + + auto expected_type = + util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str())); + TORCHTRT_CHECK( + inputs[i].dtype() == expected_type, + "Expected input tensors to have type " << expected_type << ", found type " << inputs[i].dtype()); + + auto dims = core::util::toDims(inputs[i].sizes()); + auto shape = 
core::util::toVec(dims); + LOG_DEBUG("Input Name: " << name << " Shape: " << dims); + + if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) { + // Shape tensor inputs are casted to int64 explicitly. + // Refer to + // https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435 + auto input_cpu = inputs[i].clone().contiguous().cpu().to(torch::kInt64); + std::vector inputs_cpu_vec( + input_cpu.data_ptr(), input_cpu.data_ptr() + input_cpu.numel()); + inputShapeTensorValues.emplace_back(inputs_cpu_vec); + TORCHTRT_CHECK( + compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()), + "Error while setting the tensor address for shape inputs"); + + if (CUDAGRAPHS_MODE) { + // @peri044 I dont know if this makes sense since they are supposed to be GPU buffers + compiled_engine->input_buffers[i] = input_cpu; + } + TORCHTRT_CHECK( + compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()), + "Error while setting the tensor address for shape inputs"); + } else { + at::Tensor contig_input = inputs[i].view(shape).contiguous(); + formatted_inputs.emplace_back(std::move(contig_input)); + + if (need_cudagraphs_record) { + // Create a new persistent input buffer + compiled_engine->input_buffers[i] = std::move(formatted_inputs.back().clone()); + } + + TORCHTRT_CHECK( + compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape"); + + if (CUDAGRAPHS_MODE) { + // If using CUDAGraphs copy formatted input to the corresponding persistent input buffer + compiled_engine->input_buffers[i].copy_(formatted_inputs.back(), true); + TORCHTRT_CHECK( + compiled_engine->exec_ctx->setTensorAddress(name.c_str(), compiled_engine->input_buffers[i].data_ptr()), + "Error while setting the input tensor address for inputs"); + } else { + // Otherwise use the formatted buffer directly + TORCHTRT_CHECK( + compiled_engine->exec_ctx->setTensorAddress(name.c_str(), formatted_inputs.back().data_ptr()), + "Error while setting the input tensor address for inputs"); + } + } + } +} std::vector create_output_tensors(c10::intrusive_ptr compiled_engine) { std::vector outputs(compiled_engine->num_io.second); for (auto output_indices : compiled_engine->out_binding_map) { @@ -142,11 +212,7 @@ std::vector execute_engine(std::vector inputs, c10::intr compiled_engine->cudagraph.reset(); } - // this is a buffer to store shape tensor input addresses throughout the runtime scope - std::list> inputShapeTensorValues; - // Intialize inputs and outputs to be available throughout the succeeding scopes - std::list formatted_inputs(compiled_engine->num_io.first); std::vector outputs(compiled_engine->num_io.second); if (MULTI_DEVICE_SAFE_MODE) { @@ -204,68 +270,7 @@ std::vector execute_engine(std::vector inputs, c10::intr std::make_unique(compiled_engine->input_profile_path); } - for (size_t i = 0; i < inputs.size(); i++) { - std::string name = compiled_engine->in_binding_names[i]; - - TORCHTRT_CHECK( - inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device()); - - auto expected_type = - util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str())); - TORCHTRT_CHECK( - inputs[i].dtype() == expected_type, - "Expected input tensors to have type " << expected_type << ", found type " << inputs[i].dtype()); - - auto dims = core::util::toDims(inputs[i].sizes()); - auto shape = 
core::util::toVec(dims); - LOG_DEBUG("Input Name: " << name << " Shape: " << dims); - - if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) { - // Shape tensor inputs are casted to int64 explicitly. - // Refer to - // https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435 - auto input_cpu = inputs[i].clone().contiguous().cpu().to(torch::kInt64); - std::vector inputs_cpu_vec( - input_cpu.data_ptr(), input_cpu.data_ptr() + input_cpu.numel()); - inputShapeTensorValues.emplace_back(inputs_cpu_vec); - TORCHTRT_CHECK( - compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()), - "Error while setting the tensor address for shape inputs"); - - if (CUDAGRAPHS_MODE) { - // @peri044 I dont know if this makes sense since they are supposed to be GPU buffers - compiled_engine->input_buffers[i] = input_cpu; - } - TORCHTRT_CHECK( - compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()), - "Error while setting the tensor address for shape inputs"); - - } else { - at::Tensor contig_input = inputs[i].view(shape).contiguous(); - formatted_inputs.emplace_back(std::move(contig_input)); - - if (need_cudagraphs_record) { - // Create a new persistent input buffer - compiled_engine->input_buffers[i] = std::move(formatted_inputs.back().clone()); - } - - TORCHTRT_CHECK( - compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape"); - - if (CUDAGRAPHS_MODE) { - // If using CUDAGraphs copy formatted input to the corresponding persistent input buffer - compiled_engine->input_buffers[i].copy_(formatted_inputs.back(), true); - TORCHTRT_CHECK( - compiled_engine->exec_ctx->setTensorAddress(name.c_str(), compiled_engine->input_buffers[i].data_ptr()), - "Error while setting the input tensor address for inputs"); - } else { - // Otherwise use the formatted buffer directly - TORCHTRT_CHECK( - compiled_engine->exec_ctx->setTensorAddress(name.c_str(), formatted_inputs.back().data_ptr()), - "Error while setting the input tensor address for inputs"); - } - } - } + setup_input_tensors(inputs, compiled_engine, need_cudagraphs_record); // Check if input shapes can be inferred. 
int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()}; @@ -284,7 +289,7 @@ std::vector execute_engine(std::vector inputs, c10::intr output_profiler_guard = std::make_unique(compiled_engine->output_profile_path); } - if ((false == compiled_engine->use_pre_allocated_outputs) || shape_changed) { + if (!compiled_engine->use_pre_allocated_outputs || shape_changed) { outputs = create_output_tensors(compiled_engine); } else { outputs = compiled_engine->pre_allocated_outputs; diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py index 776b919a19..903160ca2b 100644 --- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py @@ -109,7 +109,7 @@ def __init__( self.target_platform = Platform.current_platform() self.prev_cudagraphs_enabled = False self.pre_allocated_outputs: List[torch.Tensor] = [] - self.use_pre_allocated_outputs = True + self.use_pre_allocated_outputs = False if self.serialized_engine is not None and not self.settings.lazy_engine_init: self.setup_engine() @@ -235,6 +235,57 @@ def __del__(self) -> None: if self.cudagraph: self.cudagraph.reset() + def setup_input_tensors( + self, + contiguous_inputs: List[torch.Tensor], + cudagraphs_enabled: bool, + need_cudagraphs_record: bool, + ) -> None: + for i, input_name in enumerate(self.input_names): + if not contiguous_inputs[i].is_cuda: + logger.warning( + f"Detected input {input_name} of engine {self.engine.name} is not on a cuda device. " + "This tensor is being moved by the runtime but for performance considerations, " + "ensure your inputs are all on GPU and open an issue here " + "(https://github.com/pytorch/TensorRT/issues) if this warning persists." + ) + contiguous_inputs = ( + contiguous_inputs[:i] + + [contiguous_inputs[i].cuda()] + + contiguous_inputs[i + 1 :] + ) + + assert ( + contiguous_inputs[i].dtype == self.input_dtypes[i] + ), f"Dtype mismatch for {i}th input({input_name}). Expect {self.input_dtypes[i]}, got {contiguous_inputs[i].dtype}." + + if need_cudagraphs_record: + # If cudagraphs is enabled, this memory is reserved for future cudagraph runs + # Clone is required to avoid re-using user-provided GPU memory + self._input_buffers[i] = contiguous_inputs[i].clone() + + # For shape tensors, we use CPU pointers and for data tensors, we use GPU pointers + # as per TensorRT requirements + if self.engine.is_shape_inference_io(input_name): + # Shape tensor inputs are casted to int64 explicitly + # Currently Torch CPU pointers are not working; numpy pointers are used instead + # to refer to underlying memory + inputs_cpu = contiguous_inputs[i].cpu().to(torch.int64).numpy().copy() + self.context.set_tensor_address(input_name, inputs_cpu.ctypes.data) + else: + self.context.set_input_shape( + input_name, tuple(contiguous_inputs[i].shape) + ) + if cudagraphs_enabled: + self._input_buffers[i].copy_(contiguous_inputs[i]) + self.context.set_tensor_address( + input_name, self._input_buffers[i].data_ptr() + ) + else: + self.context.set_tensor_address( + input_name, contiguous_inputs[i].data_ptr() + ) + def create_output_tensors(self) -> List[torch.Tensor]: # create output tensors outputs: List[torch.Tensor] = [] @@ -272,6 +323,7 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . 
need_cudagraphs_record = True else: need_cudagraphs_record = cudagraphs_enabled and shape_changed + self.prev_cudagraphs_enabled = cudagraphs_enabled if need_cudagraphs_record: @@ -327,54 +379,10 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . self.input_names ), f"Wrong number of inputs, expect {len(self.input_names)} get {len(contiguous_inputs)}." - for i, input_name in enumerate(self.input_names): - if not contiguous_inputs[i].is_cuda: - logger.warning( - f"Detected input {input_name} of engine {self.engine.name} is not on a cuda device. " - "This tensor is being moved by the runtime but for performance considerations, " - "ensure your inputs are all on GPU and open an issue here " - "(https://github.com/pytorch/TensorRT/issues) if this warning persists." - ) - contiguous_inputs = ( - contiguous_inputs[:i] - + [contiguous_inputs[i].cuda()] - + contiguous_inputs[i + 1 :] - ) - - assert ( - contiguous_inputs[i].dtype == self.input_dtypes[i] - ), f"Dtype mismatch for {i}th input({input_name}). Expect {self.input_dtypes[i]}, got {contiguous_inputs[i].dtype}." + self.setup_input_tensors( + contiguous_inputs, cudagraphs_enabled, need_cudagraphs_record + ) - if need_cudagraphs_record: - # If cudagraphs is enabled, this memory is reserved for future cudagraph runs - # Clone is required to avoid re-using user-provided GPU memory - self._input_buffers[i] = contiguous_inputs[i].clone() - - # For shape tensors, we use CPU pointers and for data tensors, we use GPU pointers - # as per TensorRT requirements - if self.engine.is_shape_inference_io(input_name): - # Shape tensor inputs are casted to int64 explicitly - # Currently Torch CPU pointers are not working; numpy pointers are used instead - # to refer to underlying memory - inputs_cpu = ( - contiguous_inputs[i].cpu().to(torch.int64).numpy().copy() - ) - self.context.set_tensor_address( - input_name, inputs_cpu.ctypes.data - ) - else: - self.context.set_input_shape( - input_name, tuple(contiguous_inputs[i].shape) - ) - if cudagraphs_enabled: - self._input_buffers[i].copy_(contiguous_inputs[i]) - self.context.set_tensor_address( - input_name, self._input_buffers[i].data_ptr() - ) - else: - self.context.set_tensor_address( - input_name, contiguous_inputs[i].data_ptr() - ) if shape_changed: # Check if input shapes can be inferred. 
uninferred_input_names = self.context.infer_shapes() diff --git a/tests/py/dynamo/runtime/test_002_cudagraphs_cpp.py b/tests/py/dynamo/runtime/test_002_cudagraphs_cpp.py index a017eaabca..8649ca8e84 100644 --- a/tests/py/dynamo/runtime/test_002_cudagraphs_cpp.py +++ b/tests/py/dynamo/runtime/test_002_cudagraphs_cpp.py @@ -17,6 +17,9 @@ "Torch-TensorRT runtime is not available", ) class TestCudagraphsCPP(TestCase): + def tearDown(self): + # Reset to default cuda graph mode after each test + torch_tensorrt.runtime.set_cudagraphs_mode(False) def test_cudagraphs_on(self): torch_tensorrt.runtime.set_cudagraphs_mode(True) diff --git a/tests/py/dynamo/runtime/test_002_cudagraphs_py.py b/tests/py/dynamo/runtime/test_002_cudagraphs_py.py index 4f962083a8..4bdcfbbef4 100644 --- a/tests/py/dynamo/runtime/test_002_cudagraphs_py.py +++ b/tests/py/dynamo/runtime/test_002_cudagraphs_py.py @@ -13,6 +13,10 @@ class TestCudagraphsPython(TestCase): + def tearDown(self): + # Reset to default cuda graph mode after each test + torch_tensorrt.runtime.set_cudagraphs_mode(False) + def test_cudagraphs_on(self): torch_tensorrt.runtime.set_cudagraphs_mode(True) self.assertTrue(torch_tensorrt.runtime.get_cudagraphs_mode()) From 23131c34e529c4563d63c1563b4adb62c30ec309 Mon Sep 17 00:00:00 2001 From: kee hyun an Date: Mon, 25 Nov 2024 22:52:24 +0900 Subject: [PATCH 6/7] chore: Runtime api for pre-allocated outputs --- core/runtime/TRTEngine.cpp | 3 + core/runtime/TRTEngine.h | 35 ++++- core/runtime/execute_engine.cpp | 13 +- .../runtime/_PythonTorchTensorRTModule.py | 56 ++++++-- .../dynamo/runtime/_TorchTensorRTModule.py | 1 - py/torch_tensorrt/runtime/__init__.py | 1 + .../runtime/_pre_allocated_outputs.py | 41 ++++++ .../runtime/test_pre_allocated_outputs.py | 130 ++++++++++++++++++ 8 files changed, 259 insertions(+), 21 deletions(-) create mode 100644 py/torch_tensorrt/runtime/_pre_allocated_outputs.py create mode 100644 tests/py/dynamo/runtime/test_pre_allocated_outputs.py diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp index 5a5c1ad83d..b103695503 100644 --- a/core/runtime/TRTEngine.cpp +++ b/core/runtime/TRTEngine.cpp @@ -99,6 +99,9 @@ TRTEngine::TRTEngine( exec_ctx = make_trt(cuda_engine->createExecutionContext()); TORCHTRT_CHECK((exec_ctx.get() != nullptr), "Unable to create TensorRT execution context"); + runtime_states.prev_cudagraphs_enabled = CUDAGRAPHS_MODE; + runtime_states.prev_pre_allocated_outputs_enabled = false; + if (_in_binding_names.size() == 0 && _out_binding_names.size() == 0) { uint64_t inputs = 0; uint64_t outputs = 0; diff --git a/core/runtime/TRTEngine.h b/core/runtime/TRTEngine.h index f0e427c7da..c9e1571c16 100644 --- a/core/runtime/TRTEngine.h +++ b/core/runtime/TRTEngine.h @@ -30,6 +30,37 @@ using FlattenedState = std::tuple< std::tuple, // serialized metadata std::tuple>; // Platform +struct RuntimeStates { + bool need_cudagraphs_record; + bool can_use_pre_allocated_outputs; +}; + +struct TorchTRTRuntimeStates { + // Previous runtime states + bool prev_cudagraphs_enabled, prev_pre_allocated_outputs_enabled; + + // Evaluates whether certain conditions are met to enable CUDA Graph recording or to reuse pre-allocated outputs + // based on the current and previous states, as well as input shape has changed + RuntimeStates validate_states(bool cudagraphs_enabled, bool pre_allocated_outputs_enabled, bool shape_changed) { + bool need_cudagraphs_record = false; + bool can_use_pre_allocated_outputs = false; + + // Cudagraphs record is required if cudagraphs_enabled is 
switched to True regardless of shape change + if (cudagraphs_enabled && (!prev_cudagraphs_enabled || shape_changed)) { + need_cudagraphs_record = true; + } + // Pre-allocated output can be used when previous and current state are true without shape change + if (prev_pre_allocated_outputs_enabled && pre_allocated_outputs_enabled && !shape_changed) { + can_use_pre_allocated_outputs = true; + } + prev_cudagraphs_enabled = cudagraphs_enabled; + prev_pre_allocated_outputs_enabled = pre_allocated_outputs_enabled; + + RuntimeStates values = {need_cudagraphs_record, can_use_pre_allocated_outputs}; + return values; + } +}; + struct TRTEngine : torch::CustomClassHolder { // Each engine needs it's own runtime object std::shared_ptr rt; @@ -89,6 +120,7 @@ struct TRTEngine : torch::CustomClassHolder { int64_t get_automatic_device_memory_budget(); std::vector infer_outputs(std::vector> input_shapes); void set_pre_allocated_outputs(bool enable); + TorchTRTRuntimeStates runtime_states; friend std::ostream& operator<<(std::ostream& os, const TRTEngine& engine); static const char BINDING_DELIM = '%'; @@ -103,8 +135,7 @@ struct TRTEngine : torch::CustomClassHolder { std::vector input_buffers = {}; std::vector output_buffers = {}; std::string shape_key = "None"; - bool prev_cudagraphs_enabled = false; - bool use_pre_allocated_outputs = true; + bool use_pre_allocated_outputs = false; std::vector pre_allocated_outputs; // TODO: Implement a call method diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp index 8c7ee4352d..2881c21129 100644 --- a/core/runtime/execute_engine.cpp +++ b/core/runtime/execute_engine.cpp @@ -203,10 +203,9 @@ std::vector execute_engine(std::vector inputs, c10::intr bool shape_changed = _validate_shapes(inputs, compiled_engine); // Whether cudagraphs needs to record the graph on this pass - // Cudagraphs record is required if cudagraphs_enabled is switched to True regardless of shape change - bool need_cudagraphs_record = - (((!compiled_engine->prev_cudagraphs_enabled) && CUDAGRAPHS_MODE) || (CUDAGRAPHS_MODE && shape_changed)); - compiled_engine->prev_cudagraphs_enabled = CUDAGRAPHS_MODE; + RuntimeStates states = compiled_engine->runtime_states.validate_states( + CUDAGRAPHS_MODE, compiled_engine->use_pre_allocated_outputs, shape_changed); + bool need_cudagraphs_record = states.need_cudagraphs_record; if (!CUDAGRAPHS_MODE || shape_changed) { compiled_engine->cudagraph.reset(); @@ -289,10 +288,10 @@ std::vector execute_engine(std::vector inputs, c10::intr output_profiler_guard = std::make_unique(compiled_engine->output_profile_path); } - if (!compiled_engine->use_pre_allocated_outputs || shape_changed) { - outputs = create_output_tensors(compiled_engine); - } else { + if (states.can_use_pre_allocated_outputs) { outputs = compiled_engine->pre_allocated_outputs; + } else { + outputs = create_output_tensors(compiled_engine); } for (auto output_indices : compiled_engine->out_binding_map) { diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py index 903160ca2b..0af8897fab 100644 --- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py @@ -23,6 +23,40 @@ logger = logging.getLogger(__name__) +class TorchTRTRuntimeStates: + def __init__(self, cudagraphs_enabled: bool, pre_allocated_outputs_enabled: bool): + self.prev_cudagraphs_enabled = cudagraphs_enabled + self.prev_pre_allocated_outputs_enabled = 
pre_allocated_outputs_enabled + + def validate_states( + self, + cudagraphs_enabled: bool, + pre_allocated_outputs_enabled: bool, + shape_changed: bool, + ) -> Tuple[bool, bool]: + # Evaluates whether certain conditions are met to enable CUDA Graph recording or to reuse pre-allocated outputs + # based on the current and previous states, as well as input shape has changed + need_cudagraphs_record = False + can_use_pre_allocated_outputs = False + + # Cudagraphs record is required if cudagraphs_enabled is switched to True regardless of shape change + if cudagraphs_enabled and (not self.prev_cudagraphs_enabled or shape_changed): + need_cudagraphs_record = True + + # Pre-allocated output can be used when previous and current state are true without shape change + if ( + self.prev_pre_allocated_outputs_enabled + and pre_allocated_outputs_enabled + and (not shape_changed) + ): + can_use_pre_allocated_outputs = True + + self.prev_cudagraphs_enabled = cudagraphs_enabled + self.prev_pre_allocated_outputs_enabled = pre_allocated_outputs_enabled + + return need_cudagraphs_record, can_use_pre_allocated_outputs + + class PythonTorchTensorRTModule(Module): # type: ignore[misc] """PythonTorchTensorRTModule is a PyTorch module which encompasses an arbitrary TensorRT Engine. @@ -107,7 +141,9 @@ def __init__( self.engine = None self.weight_name_map = weight_name_map self.target_platform = Platform.current_platform() - self.prev_cudagraphs_enabled = False + self.runtime_states = TorchTRTRuntimeStates( + torch_tensorrt.runtime.get_cudagraphs_mode(), False + ) self.pre_allocated_outputs: List[torch.Tensor] = [] self.use_pre_allocated_outputs = False @@ -318,13 +354,11 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . cudagraphs_enabled = torch_tensorrt.runtime.get_cudagraphs_mode() shape_changed = self.validate_input_shapes(inputs) - # Cudagraphs record is required if cudagraphs_enabled is toggled to True regardless of shape change - if not self.prev_cudagraphs_enabled and cudagraphs_enabled: - need_cudagraphs_record = True - else: - need_cudagraphs_record = cudagraphs_enabled and shape_changed - - self.prev_cudagraphs_enabled = cudagraphs_enabled + need_cudagraphs_record, can_use_pre_allocated_outputs = ( + self.runtime_states.validate_states( + cudagraphs_enabled, self.use_pre_allocated_outputs, shape_changed + ) + ) if need_cudagraphs_record: if self.cudagraph: @@ -399,7 +433,9 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . if self.profiling_enabled else nullcontext() ): - if not self.use_pre_allocated_outputs or shape_changed: + if can_use_pre_allocated_outputs: + outputs = self.pre_allocated_outputs + else: self.output_shapes = [ tuple(self.context.get_tensor_shape(output_name)) for output_name in self.output_names @@ -409,8 +445,6 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . "Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes which is not currently supported." 
) outputs = self.create_output_tensors() - else: - outputs = self.pre_allocated_outputs for o, output_name in enumerate(self.output_names): diff --git a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py index 1c9ef0a9c5..15b4d16ee3 100644 --- a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py @@ -207,7 +207,6 @@ def setup_engine(self) -> None: if self.engine is not None: return self.engine = torch.classes.tensorrt.Engine(self._pack_engine_info()) - self.set_pre_allocated_outputs(False) def encode_metadata(self, metadata: Any) -> str: metadata = copy.deepcopy(metadata) diff --git a/py/torch_tensorrt/runtime/__init__.py b/py/torch_tensorrt/runtime/__init__.py index 77b4401222..9960460b60 100644 --- a/py/torch_tensorrt/runtime/__init__.py +++ b/py/torch_tensorrt/runtime/__init__.py @@ -8,4 +8,5 @@ set_cudagraphs_mode, ) from torch_tensorrt.runtime._multi_device_safe_mode import set_multi_device_safe_mode +from torch_tensorrt.runtime._pre_allocated_outputs import enable_pre_allocated_outputs from torch_tensorrt.runtime._weight_streaming import weight_streaming diff --git a/py/torch_tensorrt/runtime/_pre_allocated_outputs.py b/py/torch_tensorrt/runtime/_pre_allocated_outputs.py new file mode 100644 index 0000000000..c392c38838 --- /dev/null +++ b/py/torch_tensorrt/runtime/_pre_allocated_outputs.py @@ -0,0 +1,41 @@ +import logging +from typing import Any + +import torch +from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule, TorchTensorRTModule + +logger = logging.getLogger(__name__) + + +class _PreAllocatedOutputContextManager(object): + """ + Helper class used to enable pre-allocated output feature in runtime module + """ + + def __init__(self, module: torch.fx.GraphModule) -> None: + rt_mods = [] + for name, rt_mod in module.named_children(): + if "_run_on_acc" in name and isinstance( + rt_mod, (PythonTorchTensorRTModule, TorchTensorRTModule) + ): + rt_mods.append(rt_mod) + self.rt_mods = rt_mods + + def set_pre_allocated_output(self, enable: bool) -> None: + for mod in self.rt_mods: + mod.set_pre_allocated_outputs(enable) + + def __enter__(self) -> "_PreAllocatedOutputContextManager": + # Enable pre-allocated output + self.set_pre_allocated_output(True) + return self + + def __exit__(self, *args: Any) -> None: + # Disable pre-allocated output + self.set_pre_allocated_output(False) + + +def enable_pre_allocated_outputs( + module: torch.fx.GraphModule, +) -> _PreAllocatedOutputContextManager: + return _PreAllocatedOutputContextManager(module) diff --git a/tests/py/dynamo/runtime/test_pre_allocated_outputs.py b/tests/py/dynamo/runtime/test_pre_allocated_outputs.py new file mode 100644 index 0000000000..b8c7b61fb3 --- /dev/null +++ b/tests/py/dynamo/runtime/test_pre_allocated_outputs.py @@ -0,0 +1,130 @@ +import torch +import torch_tensorrt as torchtrt +from parameterized import parameterized +from torch.testing._internal.common_utils import TestCase, run_tests + +INPUT_SIZE = (3, 16, 16) +TRIALS = 5 + + +class TestPreAllocatedOutputs(TestCase): + @parameterized.expand( + [ + ("python_runtime", True), + ("cpp_runtime", False), + ] + ) + def test_pre_allocated_outputs_default(self, _, use_python_runtime): + class SampleModel(torch.nn.Module): + def forward(self, x): + return torch.softmax((x + 2) * 7, dim=0) + + model = SampleModel().eval().cuda() + inputs = [torch.randn(*INPUT_SIZE).cuda() for _ in range(TRIALS)] + fx_graph = 
torch.fx.symbolic_trace(model) + + # Validate that the results between Torch and Torch-TRT are similar + optimized_model = torchtrt.compile( + fx_graph, + "torch_compile", + inputs[0], + min_block_size=1, + pass_through_build_failures=True, + use_python_runtime=use_python_runtime, + ) + + ref_out_list = [] + trt_out_list = [] + with torchtrt.runtime.enable_pre_allocated_outputs(optimized_model): + for i in inputs: + ref_out_list.append(fx_graph(i).detach().cpu()) + trt_out_list.append(optimized_model(i).detach().cpu()) + + for torch_model_results, optimized_model_results in zip( + ref_out_list, trt_out_list + ): + torch.testing.assert_close( + torch_model_results, + optimized_model_results, + rtol=5e-03, + atol=5e-03, + equal_nan=True, + check_dtype=True, + ) + + torch._dynamo.reset() + + @parameterized.expand( + [ + ("python_runtime", True), + ("cpp_runtime", False), + ] + ) + def test_pre_allocated_outputs_dynamic(self, _, use_python_runtime): + class SampleModel(torch.nn.Module): + def forward(self, x): + return torch.relu((x + 2) * 0.5) + + inputs = torchtrt.Input( + min_shape=(1, 3, 128, 224), + opt_shape=(8, 3, 192, 224), + max_shape=(16, 3, 224, 224), + dtype=torch.float, + name="x", + ) + fx_graph = torch.fx.symbolic_trace(SampleModel()) + + optimized_model = torchtrt.compile( + fx_graph, + "dynamo", + inputs, + min_block_size=1, + pass_through_build_failures=True, + torch_executed_ops={"torch.ops.aten.mul.Tensor"}, + use_python_runtime=use_python_runtime, + ) + + input_list = [] + ref_out_list = [] + trt_out_list = [] + # Alternating cuda_graphs enable and input shapes at every five iterations. + for i in [1, 3, 8, 11, 16]: + for j in [128, 128, 222, 222, 224]: + input_list.append(torch.randn((i, 3, j, 224)).cuda()) + + pre_allocated_output_ctx = torchtrt.runtime.enable_pre_allocated_outputs( + optimized_model + ) + pre_allocated_output = False + for enable_cuda_graphs in [False, True]: + for i in range(len(input_list)): + # Toggles cuda graph at all index in TRIALS + if i % TRIALS == i // TRIALS: + cuda_graphs = enable_cuda_graphs + else: + cuda_graphs = not enable_cuda_graphs + if i % 3 == 0: + pre_allocated_output = not pre_allocated_output + + torchtrt.runtime.set_cudagraphs_mode(cuda_graphs) + pre_allocated_output_ctx.set_pre_allocated_output(pre_allocated_output) + + ref_out_list.append(fx_graph(input_list[i])) + trt_out_list.append(optimized_model(input_list[i])) + + for torch_model_results, optimized_model_results in zip( + ref_out_list, trt_out_list + ): + torch.testing.assert_close( + torch_model_results, + optimized_model_results, + rtol=5e-03, + atol=5e-03, + equal_nan=True, + check_dtype=True, + ) + torch._dynamo.reset() + + +if __name__ == "__main__": + run_tests() From 7f5804862fcfa1b6c2d71ca5343153dba7c02aea Mon Sep 17 00:00:00 2001 From: kee hyun an Date: Thu, 28 Nov 2024 18:02:26 +0900 Subject: [PATCH 7/7] chore: update doc --- docsrc/index.rst | 2 + .../dynamo/pre_allocated_output_example.py | 111 ++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 examples/dynamo/pre_allocated_output_example.py diff --git a/docsrc/index.rst b/docsrc/index.rst index 5d88c8ecae..dcb481b0a2 100644 --- a/docsrc/index.rst +++ b/docsrc/index.rst @@ -67,6 +67,7 @@ Tutorials * :ref:`custom_kernel_plugins` * :ref:`mutable_torchtrt_module_example` * :ref:`weight_streaming_example` +* :ref:`pre_allocated_output_example` .. 
toctree:: :caption: Tutorials @@ -84,6 +85,7 @@ Tutorials tutorials/_rendered_examples/dynamo/custom_kernel_plugins tutorials/_rendered_examples/dynamo/mutable_torchtrt_module_example tutorials/_rendered_examples/dynamo/weight_streaming_example + tutorials/_rendered_examples/dynamo/pre_allocated_output_example Dynamo Frontend ---------------- diff --git a/examples/dynamo/pre_allocated_output_example.py b/examples/dynamo/pre_allocated_output_example.py new file mode 100644 index 0000000000..3fa30a9641 --- /dev/null +++ b/examples/dynamo/pre_allocated_output_example.py @@ -0,0 +1,111 @@ +""" +.. _pre_allocated_output_example: + +Pre-allocated output buffer +====================================================== + +The TensorRT runtime module acts as a wrapper around a PyTorch model (or subgraph) that has been compiled and optimized into a TensorRT engine. + +When the compiled module is executed, input and output tensors are set to TensorRT context for processing. +If output buffer allocation is moved after the execution of the TensorRT context and used it for next inference, GPU tasks and memory allocation tasks can operate concurrently. This overlap allows for more efficient use of GPU resources, potentially improving the performance of inference. + +This optimization is particularly effective in below cases + +1. Small inference time + - The allocation of output buffers typically requires minimal CPU cycles, as the caching mechanism efficiently handles memory reuse. The time taken for this allocation is relatively constant compared to the overall inference time, leading to noticeable performance improvements, especially in scenarios involving small inference workloads. This is because the reduced allocation time contributes to faster execution when the computational workload is not large enough to overshadow these savings. +2. Multiple graph breaks + - If the module contains operations that are not supported by TensorRT, the unsupported parts are handled by PyTorch and this fallback results in a graph break. The cumulative effect of optimized buffer allocations across multiple subgraphs can enhance overall inference performance. + - While optimizing output buffers can mitigate some of this overhead, reducing or removing graph breaks should be prioritized as it enables more comprehensive optimizations +3. Static input or infrequent input shape change + - If shape is changed, pre-allocated buffer cannot be used for next inference and there will new allocation before executing the TensorRT context. This feature is not suitable for use cases with frequent input shape changes +""" + +# %% +# Imports and Model Definition +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +import timeit + +import numpy as np +import torch +import torch_tensorrt +from transformers import BertModel + +# %% +# Define function to measure inference performance +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + +def test_module_perf(model, *input): + timings = [] + + # Warm-up phase to ensure consistent and accurate performance measurements. 
+ with torch.no_grad(): + for _ in range(3): + model(*input) + torch.cuda.synchronize() + + # Timing phase to measure inference performance + with torch.no_grad(): + for i in range(10): + start_time = timeit.default_timer() + model(*input) + torch.cuda.synchronize() + end_time = timeit.default_timer() + timings.append(end_time - start_time) + times = np.array(timings) + time_med = np.median(times) + + # Return the median time as a representative performance metric + return time_med + + +# %% +# Load model and compile +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +# Load bert model +model = ( + BertModel.from_pretrained("bert-base-uncased", torchscript=True) + .eval() + .half() + .to("cuda") +) +# Define sample inputs +inputs = [ + torch.randint(0, 5, (1, 128), dtype=torch.int32).to("cuda"), + torch.randint(0, 5, (1, 128), dtype=torch.int32).to("cuda"), +] +# Next, we compile the model using torch_tensorrt.compile +optimized_model = torch_tensorrt.compile( + model, + ir="dynamo", + enabled_precisions={torch.half}, + inputs=inputs, +) + +# %% +# Enable/Disable pre-allocated output buffer feature using runtime api +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +# We can enable the pre-allocated output buffer with a context manager +with torch_tensorrt.runtime.enable_pre_allocated_outputs(optimized_model): + out_trt = optimized_model(*inputs) + +# Alternatively, we can enable the feature using a context object +pre_allocated_output_ctx = torch_tensorrt.runtime.enable_pre_allocated_outputs( + optimized_model +) +pre_allocated_output_ctx.set_pre_allocated_output(True) +time_opt = test_module_perf(optimized_model, *inputs) + +# Disable the pre-allocated output buffer feature and perform inference normally +pre_allocated_output_ctx.set_pre_allocated_output(False) +out_trt = optimized_model(*inputs) +time_normal = test_module_perf(optimized_model, *inputs) + +time_opt_ms = time_opt * 1000 +time_normal_ms = time_normal * 1000 + +print(f"normal trt model time: {time_normal_ms:.3f} ms") +print(f"pre-allocated output buffer model time: {time_opt_ms:.3f} ms")
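
# %%
# Combining with CUDA Graph mode
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

# A minimal sketch, assuming the `optimized_model` and `inputs` defined above.
# Pre-allocated output buffers can be used together with CUDA Graph mode: the runtime
# re-records the graph when CUDA Graph mode is newly enabled or the input shape changes,
# and reuses the pre-allocated outputs only while the input shape stays the same.
torch_tensorrt.runtime.set_cudagraphs_mode(True)
with torch_tensorrt.runtime.enable_pre_allocated_outputs(optimized_model):
    out_trt = optimized_model(*inputs)
torch_tensorrt.runtime.set_cudagraphs_mode(False)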