feat: Runtime output buffer optimization #3276

Open
wants to merge 7 commits into base: main
Changes from 6 commits
3 changes: 3 additions & 0 deletions core/runtime/TRTEngine.cpp
@@ -99,6 +99,9 @@ TRTEngine::TRTEngine(
exec_ctx = make_trt(cuda_engine->createExecutionContext());
TORCHTRT_CHECK((exec_ctx.get() != nullptr), "Unable to create TensorRT execution context");

runtime_states.prev_cudagraphs_enabled = CUDAGRAPHS_MODE;
runtime_states.prev_pre_allocated_outputs_enabled = false;

if (_in_binding_names.size() == 0 && _out_binding_names.size() == 0) {
uint64_t inputs = 0;
uint64_t outputs = 0;
37 changes: 36 additions & 1 deletion core/runtime/TRTEngine.h
@@ -30,6 +30,37 @@ using FlattenedState = std::tuple<
std::tuple<std::string, std::string>, // serialized metadata
std::tuple<std::string, std::string>>; // Platform

struct RuntimeStates {
bool need_cudagraphs_record;
bool can_use_pre_allocated_outputs;
};
Collaborator (author) comment: If the weight-streaming budget is changed in CUDA graph mode, a new capture is required; a weight-streaming state will be added.


struct TorchTRTRuntimeStates {
// Previous runtime states
bool prev_cudagraphs_enabled, prev_pre_allocated_outputs_enabled;

// Evaluates whether the conditions are met to enable CUDA Graph recording or to reuse pre-allocated outputs,
// based on the current and previous states, as well as whether the input shape has changed
RuntimeStates validate_states(bool cudagraphs_enabled, bool pre_allocated_outputs_enabled, bool shape_changed) {
bool need_cudagraphs_record = false;
bool can_use_pre_allocated_outputs = false;

// A cudagraphs record is required if cudagraphs was just switched on, or if the input shape has changed
if (cudagraphs_enabled && (!prev_cudagraphs_enabled || shape_changed)) {
need_cudagraphs_record = true;
}
// Pre-allocated outputs can be used when the previous and current states are both true and the shape has not changed
if (prev_pre_allocated_outputs_enabled && pre_allocated_outputs_enabled && !shape_changed) {
can_use_pre_allocated_outputs = true;
}
prev_cudagraphs_enabled = cudagraphs_enabled;
prev_pre_allocated_outputs_enabled = pre_allocated_outputs_enabled;

RuntimeStates values = {need_cudagraphs_record, can_use_pre_allocated_outputs};
return values;
}
};
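A minimal sketch of how these transitions play out over successive inference calls (only the struct above is used; the expected values in the comments follow directly from the logic in validate_states):

```cpp
TorchTRTRuntimeStates states = {/*prev_cudagraphs_enabled=*/false,
                                /*prev_pre_allocated_outputs_enabled=*/false};

// Cudagraphs was just switched on: a capture pass is required.
RuntimeStates s1 = states.validate_states(true, true, /*shape_changed=*/false);
// s1.need_cudagraphs_record == true, s1.can_use_pre_allocated_outputs == false

// Steady state: no re-record needed, pre-allocated outputs become reusable.
RuntimeStates s2 = states.validate_states(true, true, /*shape_changed=*/false);
// s2.need_cudagraphs_record == false, s2.can_use_pre_allocated_outputs == true

// A shape change forces a re-record and invalidates the pre-allocated outputs.
RuntimeStates s3 = states.validate_states(true, true, /*shape_changed=*/true);
// s3.need_cudagraphs_record == true, s3.can_use_pre_allocated_outputs == false
```

The same pattern would extend naturally to the weight-streaming state mentioned in the comment above: a changed budget would be treated like a shape change and force a new capture.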

struct TRTEngine : torch::CustomClassHolder {
// Each engine needs its own runtime object
std::shared_ptr<nvinfer1::IRuntime> rt;
@@ -88,6 +119,8 @@ struct TRTEngine : torch::CustomClassHolder {
int64_t get_streamable_device_memory_budget();
int64_t get_automatic_device_memory_budget();
std::vector<at::Tensor> infer_outputs(std::vector<std::vector<int64_t>> input_shapes);
void set_pre_allocated_outputs(bool enable);
TorchTRTRuntimeStates runtime_states;
friend std::ostream& operator<<(std::ostream& os, const TRTEngine& engine);
static const char BINDING_DELIM = '%';

@@ -101,7 +134,9 @@
at::cuda::CUDAStream caller_stream = c10::cuda::getDefaultCUDAStream();
std::vector<at::Tensor> input_buffers = {};
std::vector<at::Tensor> output_buffers = {};
std::string shape_key;
std::string shape_key = "None";
bool use_pre_allocated_outputs = false;
std::vector<at::Tensor> pre_allocated_outputs;

// TODO: Implement a call method
// c10::List<at::Tensor> Run(c10::List<at::Tensor> inputs);
196 changes: 111 additions & 85 deletions core/runtime/execute_engine.cpp
@@ -60,9 +60,8 @@ RTDevice select_rt_device(const RTDevice& engine_device, const RTDevice& curr_de
return new_target_device_opt.value();
}

bool _cudagraphs_validate_shapes(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine) {
// Validate whether the current input shapes to the engine
// invalidate the existing cudagraphs object
bool _validate_shapes(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine) {
// Validate whether the current input shapes to the engine have changed

// Populate the shape key for the inputs
// x: (3, 4), y: (4, 5) --> Key: (3,4)(4,5)
@@ -83,15 +82,102 @@ bool _cudagraphs_validate_shapes(std::vector<at::Tensor> inputs, c10::intrusive_

auto new_shape_key = new_shape_key_ss.str();

// Compare the shape key to the original key and invalidate shapes if they do not match
// Compare the shape key to the original key
if (new_shape_key != compiled_engine->shape_key) {
LOG_DEBUG("Resetting Cudagraph on New Shape Key " << new_shape_key);
LOG_DEBUG("Input shape changed " << compiled_engine->shape_key << " -> " << new_shape_key);
compiled_engine->shape_key = new_shape_key;
compiled_engine->cudagraph.reset();
return false;
return true;
}

return true;
return false;
}
void setup_input_tensors(
std::vector<at::Tensor> inputs,
c10::intrusive_ptr<TRTEngine> compiled_engine,
bool need_cudagraphs_record,
std::list<std::vector<int64_t>>& inputShapeTensorValues) {
// inputShapeTensorValues is owned by the caller (execute_engine) so that the shape-tensor
// addresses registered via setTensorAddress below remain valid until the engine is enqueued;
// a function-local list would dangle as soon as this helper returns.
std::list<at::Tensor> formatted_inputs(compiled_engine->num_io.first);

for (size_t i = 0; i < inputs.size(); i++) {
std::string name = compiled_engine->in_binding_names[i];

TORCHTRT_CHECK(
inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device());

auto expected_type =
util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
TORCHTRT_CHECK(
inputs[i].dtype() == expected_type,
"Expected input tensors to have type " << expected_type << ", found type " << inputs[i].dtype());

auto dims = core::util::toDims(inputs[i].sizes());
auto shape = core::util::toVec(dims);
LOG_DEBUG("Input Name: " << name << " Shape: " << dims);

if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) {
// Shape tensor inputs are casted to int64 explicitly.
// Refer to
// https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435
auto input_cpu = inputs[i].clone().contiguous().cpu().to(torch::kInt64);
std::vector<int64_t> inputs_cpu_vec(
input_cpu.data_ptr<int64_t>(), input_cpu.data_ptr<int64_t>() + input_cpu.numel());
inputShapeTensorValues.emplace_back(inputs_cpu_vec);
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
"Error while setting the tensor address for shape inputs");

if (CUDAGRAPHS_MODE) {
// @peri044 I don't know if this makes sense since they are supposed to be GPU buffers
compiled_engine->input_buffers[i] = input_cpu;
}

} else {
at::Tensor contig_input = inputs[i].view(shape).contiguous();
formatted_inputs.emplace_back(std::move(contig_input));

if (need_cudagraphs_record) {
// Create a new persistent input buffer
compiled_engine->input_buffers[i] = std::move(formatted_inputs.back().clone());
}

TORCHTRT_CHECK(
compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape");

if (CUDAGRAPHS_MODE) {
// If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
compiled_engine->input_buffers[i].copy_(formatted_inputs.back(), true);
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), compiled_engine->input_buffers[i].data_ptr()),
"Error while setting the input tensor address for inputs");
} else {
// Otherwise use the formatted buffer directly
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), formatted_inputs.back().data_ptr()),
"Error while setting the input tensor address for inputs");
}
}
}
}
std::vector<at::Tensor> create_output_tensors(c10::intrusive_ptr<TRTEngine> compiled_engine) {
Collaborator comment: Can we functionalize input allocation/creation in execute_engine similar to this? (I posted a similar comment in your wrapper module PR.)
std::vector<at::Tensor> outputs(compiled_engine->num_io.second);
for (auto output_indices : compiled_engine->out_binding_map) {
// out_binding_map stores TRT_IDX: PYT_IDX
auto pyt_idx = output_indices.second;

std::string name = compiled_engine->out_binding_names[pyt_idx];
auto out_shape = compiled_engine->exec_ctx->getTensorShape(name.c_str());
LOG_DEBUG("Output Name: " << name << " Shape: " << out_shape);

auto dims = core::util::toVec(out_shape);
auto type = util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
outputs[pyt_idx] = std::move(at::empty(dims, {at::kCUDA}).to(type).contiguous());
}

return outputs;
}
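For orientation, the two helpers compose inside execute_engine (see the diff below) roughly as follows. create_output_tensors queries getTensorShape, which only yields concrete dimensions once every input shape has been set, so input setup must run first. A sketch, assuming the reference-parameter signature above:

```cpp
// Owned at this scope so the shape-tensor addresses stay valid until enqueue.
std::list<std::vector<int64_t>> inputShapeTensorValues;
setup_input_tensors(inputs, compiled_engine, need_cudagraphs_record, inputShapeTensorValues);

std::vector<at::Tensor> outputs = states.can_use_pre_allocated_outputs
    ? compiled_engine->pre_allocated_outputs // reuse buffers made on the previous run
    : create_output_tensors(compiled_engine); // query shapes and allocate fresh
```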

std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine) {
@@ -114,18 +200,18 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
compiled_engine->cudagraph.enable_debug_mode();
}

bool shape_changed = _validate_shapes(inputs, compiled_engine);

// Whether cudagraphs needs to record the graph on this pass
bool need_cudagraphs_record = (CUDAGRAPHS_MODE && (!_cudagraphs_validate_shapes(inputs, compiled_engine)));
RuntimeStates states = compiled_engine->runtime_states.validate_states(
CUDAGRAPHS_MODE, compiled_engine->use_pre_allocated_outputs, shape_changed);
bool need_cudagraphs_record = states.need_cudagraphs_record;

if (!CUDAGRAPHS_MODE) {
if (!CUDAGRAPHS_MODE || shape_changed) {
compiled_engine->cudagraph.reset();
}

// this is a buffer to store shape tensor input addresses throughout the runtime scope
std::list<std::vector<int64_t>> inputShapeTensorValues;

// Initialize inputs and outputs to be available throughout the succeeding scopes
std::list<at::Tensor> formatted_inputs(compiled_engine->num_io.first);
std::vector<at::Tensor> outputs(compiled_engine->num_io.second);

if (MULTI_DEVICE_SAFE_MODE) {
@@ -183,68 +269,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
}

for (size_t i = 0; i < inputs.size(); i++) {
std::string name = compiled_engine->in_binding_names[i];

TORCHTRT_CHECK(
inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device());

auto expected_type =
util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
TORCHTRT_CHECK(
inputs[i].dtype() == expected_type,
"Expected input tensors to have type " << expected_type << ", found type " << inputs[i].dtype());

auto dims = core::util::toDims(inputs[i].sizes());
auto shape = core::util::toVec(dims);
LOG_DEBUG("Input Name: " << name << " Shape: " << dims);

if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) {
// Shape tensor inputs are casted to int64 explicitly.
// Refer to
// https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435
auto input_cpu = inputs[i].clone().contiguous().cpu().to(torch::kInt64);
std::vector<int64_t> inputs_cpu_vec(
input_cpu.data_ptr<int64_t>(), input_cpu.data_ptr<int64_t>() + input_cpu.numel());
inputShapeTensorValues.emplace_back(inputs_cpu_vec);
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
"Error while setting the tensor address for shape inputs");

if (CUDAGRAPHS_MODE) {
// @peri044 I dont know if this makes sense since they are supposed to be GPU buffers
compiled_engine->input_buffers[i] = input_cpu;
}
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
"Error while setting the tensor address for shape inputs");

} else {
at::Tensor contig_input = inputs[i].view(shape).contiguous();
formatted_inputs.emplace_back(std::move(contig_input));

if (need_cudagraphs_record) {
// Create a new persistent input buffer
compiled_engine->input_buffers[i] = std::move(formatted_inputs.back().clone());
}

TORCHTRT_CHECK(
compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape");

if (CUDAGRAPHS_MODE) {
// If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
compiled_engine->input_buffers[i].copy_(formatted_inputs.back(), true);
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), compiled_engine->input_buffers[i].data_ptr()),
"Error while setting the input tensor address for inputs");
} else {
// Otherwise use the formatted buffer directly
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), formatted_inputs.back().data_ptr()),
"Error while setting the input tensor address for inputs");
}
}
}
std::list<std::vector<int64_t>> inputShapeTensorValues; // shape-tensor addresses must stay valid until enqueue
setup_input_tensors(inputs, compiled_engine, need_cudagraphs_record, inputShapeTensorValues);

// Check if input shapes can be inferred.
int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()};
Expand All @@ -263,19 +288,15 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
output_profiler_guard =
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
}
if (states.can_use_pre_allocated_outputs) {
outputs = compiled_engine->pre_allocated_outputs;
} else {
outputs = create_output_tensors(compiled_engine);
}

for (auto output_indices : compiled_engine->out_binding_map) {
// out_binding_map stores TRT_IDX: PYT_IDX
auto pyt_idx = output_indices.second;

std::string name = compiled_engine->out_binding_names[pyt_idx];
auto out_shape = compiled_engine->exec_ctx->getTensorShape(name.c_str());
LOG_DEBUG("Output Name: " << name << " Shape: " << out_shape);

auto dims = core::util::toVec(out_shape);
auto type = util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
outputs[pyt_idx] = std::move(at::empty(dims, {at::kCUDA}).to(type).contiguous());

if (need_cudagraphs_record) {
// If we are recording the cuda graph then we need to update the persistent output buffer
compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
@@ -345,6 +366,11 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
}
} // End engine execution (resets to caller stream)

// Create output buffers for the next execution of the graph or TRT context.
// Allocating them here overlaps the allocation with the still-running engine execution.
if (compiled_engine->use_pre_allocated_outputs) {
compiled_engine->pre_allocated_outputs = create_output_tensors(compiled_engine);
}

// Block caller stream until engine execution is complete
at::cuda::CUDAEvent trt_exec_complete;
trt_exec_complete.record(compiled_engine->engine_stream);
1 change: 1 addition & 0 deletions core/runtime/register_jit_hooks.cpp
@@ -88,6 +88,7 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion =
.def("dump_engine_layer_info", &TRTEngine::dump_engine_layer_info)
.def("get_engine_layer_info", &TRTEngine::get_engine_layer_info)
.def("infer_outputs", &TRTEngine::infer_outputs)
.def_readwrite("use_pre_allocated_outputs", &TRTEngine::use_pre_allocated_outputs)
.def_property(
"device_memory_budget",
&TRTEngine::get_device_memory_budget,
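Putting the pieces together, a minimal C++ usage sketch (a sketch only: the namespace and the engine construction are assumptions, not shown in this PR; use_pre_allocated_outputs and execute_engine are the names from the diffs above):

```cpp
// Assumes compiled_engine is a c10::intrusive_ptr<TRTEngine> built elsewhere, and that
// execute_engine lives in torch_tensorrt::core::runtime (assumed namespace).
compiled_engine->use_pre_allocated_outputs = true;

// First call allocates outputs normally and stashes buffers for the next run.
auto warmup = torch_tensorrt::core::runtime::execute_engine(inputs, compiled_engine);
// Subsequent same-shape calls reuse pre_allocated_outputs instead of allocating.
auto outputs = torch_tensorrt::core::runtime::execute_engine(inputs, compiled_engine);
```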