chore: Runtime api for pre-allocated outputs
keehyuna committed Nov 25, 2024
1 parent f480353 commit 23131c3
Showing 8 changed files with 259 additions and 21 deletions.
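In short, this commit adds a runtime API for reusing a compiled module's output tensors across invocations instead of allocating new ones on every call. A minimal usage sketch (`SampleModel` and the input are illustrative placeholders; the context manager is the API added in this commit, and the compile call mirrors the tests below):

```python
import torch
import torch_tensorrt


class SampleModel(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x + 1)


model = SampleModel().eval().cuda()
x = torch.randn(3, 16, 16).cuda()
trt_module = torch_tensorrt.compile(model, "torch_compile", x, min_block_size=1)

# Inside the context the runtime reuses pre-allocated output tensors;
# on exit the feature is switched off again.
with torch_tensorrt.runtime.enable_pre_allocated_outputs(trt_module):
    out = trt_module(x)
```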
3 changes: 3 additions & 0 deletions core/runtime/TRTEngine.cpp
@@ -99,6 +99,9 @@ TRTEngine::TRTEngine(
exec_ctx = make_trt(cuda_engine->createExecutionContext());
TORCHTRT_CHECK((exec_ctx.get() != nullptr), "Unable to create TensorRT execution context");

runtime_states.prev_cudagraphs_enabled = CUDAGRAPHS_MODE;
runtime_states.prev_pre_allocated_outputs_enabled = false;

if (_in_binding_names.size() == 0 && _out_binding_names.size() == 0) {
uint64_t inputs = 0;
uint64_t outputs = 0;
35 changes: 33 additions & 2 deletions core/runtime/TRTEngine.h
@@ -30,6 +30,37 @@ using FlattenedState = std::tuple<
std::tuple<std::string, std::string>, // serialized metadata
std::tuple<std::string, std::string>>; // Platform

struct RuntimeStates {
bool need_cudagraphs_record;
bool can_use_pre_allocated_outputs;
};

struct TorchTRTRuntimeStates {
// Previous runtime states
bool prev_cudagraphs_enabled, prev_pre_allocated_outputs_enabled;

// Evaluates whether the conditions are met to record CUDA graphs or to reuse pre-allocated outputs,
// based on the current and previous states and on whether the input shape has changed
RuntimeStates validate_states(bool cudagraphs_enabled, bool pre_allocated_outputs_enabled, bool shape_changed) {
bool need_cudagraphs_record = false;
bool can_use_pre_allocated_outputs = false;

// A CUDA graph record is required when cudagraphs is switched on, or when the shape changes while it is enabled
if (cudagraphs_enabled && (!prev_cudagraphs_enabled || shape_changed)) {
need_cudagraphs_record = true;
}
// Pre-allocated outputs can be reused only when the feature was enabled on both the previous and current calls and the shape is unchanged
if (prev_pre_allocated_outputs_enabled && pre_allocated_outputs_enabled && !shape_changed) {
can_use_pre_allocated_outputs = true;
}
prev_cudagraphs_enabled = cudagraphs_enabled;
prev_pre_allocated_outputs_enabled = pre_allocated_outputs_enabled;

RuntimeStates values = {need_cudagraphs_record, can_use_pre_allocated_outputs};
return values;
}
};

struct TRTEngine : torch::CustomClassHolder {
// Each engine needs its own runtime object
std::shared_ptr<nvinfer1::IRuntime> rt;
@@ -89,6 +120,7 @@ struct TRTEngine : torch::CustomClassHolder {
int64_t get_automatic_device_memory_budget();
std::vector<at::Tensor> infer_outputs(std::vector<std::vector<int64_t>> input_shapes);
void set_pre_allocated_outputs(bool enable);
TorchTRTRuntimeStates runtime_states;
friend std::ostream& operator<<(std::ostream& os, const TRTEngine& engine);
static const char BINDING_DELIM = '%';

@@ -103,8 +135,7 @@
std::vector<at::Tensor> input_buffers = {};
std::vector<at::Tensor> output_buffers = {};
std::string shape_key = "None";
- bool prev_cudagraphs_enabled = false;
- bool use_pre_allocated_outputs = true;
+ bool use_pre_allocated_outputs = false;
std::vector<at::Tensor> pre_allocated_outputs;

// TODO: Implement a call method
Expand Down
13 changes: 6 additions & 7 deletions core/runtime/execute_engine.cpp
@@ -203,10 +203,9 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
bool shape_changed = _validate_shapes(inputs, compiled_engine);

// Whether cudagraphs needs to record the graph on this pass
- // Cudagraphs record is required if cudagraphs_enabled is switched to True regardless of shape change
- bool need_cudagraphs_record =
-     (((!compiled_engine->prev_cudagraphs_enabled) && CUDAGRAPHS_MODE) || (CUDAGRAPHS_MODE && shape_changed));
- compiled_engine->prev_cudagraphs_enabled = CUDAGRAPHS_MODE;
+ RuntimeStates states = compiled_engine->runtime_states.validate_states(
+     CUDAGRAPHS_MODE, compiled_engine->use_pre_allocated_outputs, shape_changed);
+ bool need_cudagraphs_record = states.need_cudagraphs_record;

if (!CUDAGRAPHS_MODE || shape_changed) {
compiled_engine->cudagraph.reset();
@@ -289,10 +288,10 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
output_profiler_guard =
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
}
- if (!compiled_engine->use_pre_allocated_outputs || shape_changed) {
-   outputs = create_output_tensors(compiled_engine);
- } else {
+ if (states.can_use_pre_allocated_outputs) {
    outputs = compiled_engine->pre_allocated_outputs;
+ } else {
+   outputs = create_output_tensors(compiled_engine);
}

for (auto output_indices : compiled_engine->out_binding_map) {
56 changes: 45 additions & 11 deletions py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
@@ -23,6 +23,40 @@
logger = logging.getLogger(__name__)


class TorchTRTRuntimeStates:
def __init__(self, cudagraphs_enabled: bool, pre_allocated_outputs_enabled: bool):
self.prev_cudagraphs_enabled = cudagraphs_enabled
self.prev_pre_allocated_outputs_enabled = pre_allocated_outputs_enabled

def validate_states(
self,
cudagraphs_enabled: bool,
pre_allocated_outputs_enabled: bool,
shape_changed: bool,
) -> Tuple[bool, bool]:
# Evaluates whether the conditions are met to record CUDA graphs or to reuse pre-allocated outputs,
# based on the current and previous states and on whether the input shape has changed
need_cudagraphs_record = False
can_use_pre_allocated_outputs = False

# A CUDA graph record is required when cudagraphs is switched on, or when the shape changes while it is enabled
if cudagraphs_enabled and (not self.prev_cudagraphs_enabled or shape_changed):
need_cudagraphs_record = True

# Pre-allocated outputs can be reused only when the feature was enabled on both the previous and current calls and the shape is unchanged
if (
self.prev_pre_allocated_outputs_enabled
and pre_allocated_outputs_enabled
and (not shape_changed)
):
can_use_pre_allocated_outputs = True

self.prev_cudagraphs_enabled = cudagraphs_enabled
self.prev_pre_allocated_outputs_enabled = pre_allocated_outputs_enabled

return need_cudagraphs_record, can_use_pre_allocated_outputs
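
A standalone sketch of how the state machine above behaves across successive calls (it exercises only the class defined in this diff; the assertions restate the rules in the comments):

```python
states = TorchTRTRuntimeStates(
    cudagraphs_enabled=False, pre_allocated_outputs_enabled=False
)

# Switching cudagraphs on forces a graph record even without a shape change;
# output reuse is denied because pre-allocation was off on the previous call.
record, reuse = states.validate_states(True, True, shape_changed=False)
assert record and not reuse

# Steady state: both features stayed on and shapes are unchanged.
record, reuse = states.validate_states(True, True, shape_changed=False)
assert not record and reuse

# A shape change invalidates both the recorded graph and the cached outputs.
record, reuse = states.validate_states(True, True, shape_changed=True)
assert record and not reuse
```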


class PythonTorchTensorRTModule(Module): # type: ignore[misc]
"""PythonTorchTensorRTModule is a PyTorch module which encompasses an arbitrary TensorRT Engine.
@@ -107,7 +141,9 @@ def __init__(
self.engine = None
self.weight_name_map = weight_name_map
self.target_platform = Platform.current_platform()
- self.prev_cudagraphs_enabled = False
+ self.runtime_states = TorchTRTRuntimeStates(
+     torch_tensorrt.runtime.get_cudagraphs_mode(), False
+ )
self.pre_allocated_outputs: List[torch.Tensor] = []
self.use_pre_allocated_outputs = False

@@ -318,13 +354,11 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .

cudagraphs_enabled = torch_tensorrt.runtime.get_cudagraphs_mode()
shape_changed = self.validate_input_shapes(inputs)
- # Cudagraphs record is required if cudagraphs_enabled is toggled to True regardless of shape change
- if not self.prev_cudagraphs_enabled and cudagraphs_enabled:
-     need_cudagraphs_record = True
- else:
-     need_cudagraphs_record = cudagraphs_enabled and shape_changed
-
- self.prev_cudagraphs_enabled = cudagraphs_enabled
+ need_cudagraphs_record, can_use_pre_allocated_outputs = (
+     self.runtime_states.validate_states(
+         cudagraphs_enabled, self.use_pre_allocated_outputs, shape_changed
+     )
+ )

if need_cudagraphs_record:
if self.cudagraph:
@@ -399,7 +433,9 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
if self.profiling_enabled
else nullcontext()
):
- if not self.use_pre_allocated_outputs or shape_changed:
+ if can_use_pre_allocated_outputs:
+     outputs = self.pre_allocated_outputs
+ else:
self.output_shapes = [
tuple(self.context.get_tensor_shape(output_name))
for output_name in self.output_names
@@ -409,8 +445,6 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
"Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes which is not currently supported."
)
outputs = self.create_output_tensors()
- else:
-     outputs = self.pre_allocated_outputs

for o, output_name in enumerate(self.output_names):

1 change: 0 additions & 1 deletion py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py
@@ -207,7 +207,6 @@ def setup_engine(self) -> None:
if self.engine is not None:
return
self.engine = torch.classes.tensorrt.Engine(self._pack_engine_info())
- self.set_pre_allocated_outputs(False)

def encode_metadata(self, metadata: Any) -> str:
metadata = copy.deepcopy(metadata)
1 change: 1 addition & 0 deletions py/torch_tensorrt/runtime/__init__.py
@@ -8,4 +8,5 @@
set_cudagraphs_mode,
)
from torch_tensorrt.runtime._multi_device_safe_mode import set_multi_device_safe_mode
from torch_tensorrt.runtime._pre_allocated_outputs import enable_pre_allocated_outputs
from torch_tensorrt.runtime._weight_streaming import weight_streaming
41 changes: 41 additions & 0 deletions py/torch_tensorrt/runtime/_pre_allocated_outputs.py
@@ -0,0 +1,41 @@
import logging
from typing import Any

import torch
from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule, TorchTensorRTModule

logger = logging.getLogger(__name__)


class _PreAllocatedOutputContextManager(object):
"""
Helper class used to enable pre-allocated output feature in runtime module
"""

def __init__(self, module: torch.fx.GraphModule) -> None:
rt_mods = []
for name, rt_mod in module.named_children():
if "_run_on_acc" in name and isinstance(
rt_mod, (PythonTorchTensorRTModule, TorchTensorRTModule)
):
rt_mods.append(rt_mod)
self.rt_mods = rt_mods

def set_pre_allocated_output(self, enable: bool) -> None:
for mod in self.rt_mods:
mod.set_pre_allocated_outputs(enable)

def __enter__(self) -> "_PreAllocatedOutputContextManager":
# Enable pre-allocated output
self.set_pre_allocated_output(True)
return self

def __exit__(self, *args: Any) -> None:
# Disable pre-allocated output
self.set_pre_allocated_output(False)


def enable_pre_allocated_outputs(
module: torch.fx.GraphModule,
) -> _PreAllocatedOutputContextManager:
return _PreAllocatedOutputContextManager(module)
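
Besides the `with` form, the returned manager can toggle the feature explicitly, which is how the dynamic-shape test below drives it. A short sketch (assuming `optimized_model` is any compiled module):

```python
import torch_tensorrt

ctx = torch_tensorrt.runtime.enable_pre_allocated_outputs(optimized_model)
ctx.set_pre_allocated_output(True)   # reuse output buffers from here on
# ... run inference ...
ctx.set_pre_allocated_output(False)  # back to per-call allocation
```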
130 changes: 130 additions & 0 deletions tests/py/dynamo/runtime/test_pre_allocated_outputs.py
@@ -0,0 +1,130 @@
import torch
import torch_tensorrt as torchtrt
from parameterized import parameterized
from torch.testing._internal.common_utils import TestCase, run_tests

INPUT_SIZE = (3, 16, 16)
TRIALS = 5


class TestPreAllocatedOutputs(TestCase):
@parameterized.expand(
[
("python_runtime", True),
("cpp_runtime", False),
]
)
def test_pre_allocated_outputs_default(self, _, use_python_runtime):
class SampleModel(torch.nn.Module):
def forward(self, x):
return torch.softmax((x + 2) * 7, dim=0)

model = SampleModel().eval().cuda()
inputs = [torch.randn(*INPUT_SIZE).cuda() for _ in range(TRIALS)]
fx_graph = torch.fx.symbolic_trace(model)

# Validate that the results between Torch and Torch-TRT are similar
optimized_model = torchtrt.compile(
fx_graph,
"torch_compile",
inputs[0],
min_block_size=1,
pass_through_build_failures=True,
use_python_runtime=use_python_runtime,
)

ref_out_list = []
trt_out_list = []
with torchtrt.runtime.enable_pre_allocated_outputs(optimized_model):
for i in inputs:
ref_out_list.append(fx_graph(i).detach().cpu())
trt_out_list.append(optimized_model(i).detach().cpu())

for torch_model_results, optimized_model_results in zip(
ref_out_list, trt_out_list
):
torch.testing.assert_close(
torch_model_results,
optimized_model_results,
rtol=5e-03,
atol=5e-03,
equal_nan=True,
check_dtype=True,
)

torch._dynamo.reset()

@parameterized.expand(
[
("python_runtime", True),
("cpp_runtime", False),
]
)
def test_pre_allocated_outputs_dynamic(self, _, use_python_runtime):
class SampleModel(torch.nn.Module):
def forward(self, x):
return torch.relu((x + 2) * 0.5)

inputs = torchtrt.Input(
min_shape=(1, 3, 128, 224),
opt_shape=(8, 3, 192, 224),
max_shape=(16, 3, 224, 224),
dtype=torch.float,
name="x",
)
fx_graph = torch.fx.symbolic_trace(SampleModel())

optimized_model = torchtrt.compile(
fx_graph,
"dynamo",
inputs,
min_block_size=1,
pass_through_build_failures=True,
torch_executed_ops={"torch.ops.aten.mul.Tensor"},
use_python_runtime=use_python_runtime,
)

input_list = []
ref_out_list = []
trt_out_list = []
# Vary batch and spatial sizes across iterations; cudagraphs mode is toggled in the loop below.
for i in [1, 3, 8, 11, 16]:
for j in [128, 128, 222, 222, 224]:
input_list.append(torch.randn((i, 3, j, 224)).cuda())

pre_allocated_output_ctx = torchtrt.runtime.enable_pre_allocated_outputs(
optimized_model
)
pre_allocated_output = False
for enable_cuda_graphs in [False, True]:
for i in range(len(input_list)):
# Flip the cudagraphs mode at a different index within each group of TRIALS iterations
if i % TRIALS == i // TRIALS:
cuda_graphs = enable_cuda_graphs
else:
cuda_graphs = not enable_cuda_graphs
if i % 3 == 0:
pre_allocated_output = not pre_allocated_output

torchtrt.runtime.set_cudagraphs_mode(cuda_graphs)
pre_allocated_output_ctx.set_pre_allocated_output(pre_allocated_output)

ref_out_list.append(fx_graph(input_list[i]))
trt_out_list.append(optimized_model(input_list[i]))

for torch_model_results, optimized_model_results in zip(
ref_out_list, trt_out_list
):
torch.testing.assert_close(
torch_model_results,
optimized_model_results,
rtol=5e-03,
atol=5e-03,
equal_nan=True,
check_dtype=True,
)
torch._dynamo.reset()


if __name__ == "__main__":
run_tests()
