From 2ea181aaa74c291f37879eb97e18577f89156cfe Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Wed, 12 Jun 2024 17:24:51 -0700 Subject: [PATCH 01/55] chore: add gpt2 example --- examples/dynamo/torch_compile_gpt2.py | 70 +++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 examples/dynamo/torch_compile_gpt2.py diff --git a/examples/dynamo/torch_compile_gpt2.py b/examples/dynamo/torch_compile_gpt2.py new file mode 100644 index 0000000000..ddbb2573e6 --- /dev/null +++ b/examples/dynamo/torch_compile_gpt2.py @@ -0,0 +1,70 @@ +import torch +import torch_tensorrt +from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteriaList +from transformers.generation.stopping_criteria import ( + EosTokenCriteria, + MaxLengthCriteria, +) + +# Define tokenizer and model +torch_device = "cuda" if torch.cuda.is_available() else "cpu" +tokenizer = AutoTokenizer.from_pretrained("gpt2") +model = ( + AutoModelForCausalLM.from_pretrained( + "gpt2", pad_token_id=tokenizer.eos_token_id, use_cache=False + ) + .eval() + .to(torch_device) +) + +# Input prompt +model_inputs = tokenizer("I enjoy walking with my cute dog", return_tensors="pt").to( + torch_device +) +input_ids = model_inputs["input_ids"] +max_tokens = 40 + +# Pyt model outputs +greedy_output = model.generate(**model_inputs, max_new_tokens=max_tokens) +print( + "Pytorch model generated text: ", + tokenizer.decode(greedy_output[0], skip_special_tokens=True), +) + +# Compile Torch-TRT model +torch._dynamo.mark_dynamic(input_ids, 1, min=7, max=1023) +model.forward = torch.compile( + model.forward, + backend="tensorrt", + dynamic=None, + options={ + "enabled_precisions": {torch.float}, + "torch_executed_ops": {"torch.ops.aten.slice.Tensor"}, + "debug": True, + "disable_tf32": True, + }, +) + +# Auto-regressive generation loop for greedy search +stopping_criteria = StoppingCriteriaList( + [ + MaxLengthCriteria(max_length=max_tokens), + EosTokenCriteria(eos_token_id=tokenizer.eos_token_id), + ] +) +token_id = 0 +while token_id < 20: + trt_outputs = model(input_ids) + logits = trt_outputs.logits + next_token_logits = logits[:, -1, :] + next_tokens = torch.argmax(next_token_logits, dim=-1) + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + if stopping_criteria(input_ids, logits).item(): + break + token_id += 1 + +# Decode the sentence +print( + "TensorRT model generated text: ", + tokenizer.decode(input_ids[0], skip_special_tokens=True), +) From 37b65a5900ba7ec4bf0f0193b3e27ea9018c8d2f Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 13 Jun 2024 00:44:17 +0000 Subject: [PATCH 02/55] chore: add llama2 example --- examples/dynamo/torch_export_llama2.py | 48 ++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 examples/dynamo/torch_export_llama2.py diff --git a/examples/dynamo/torch_export_llama2.py b/examples/dynamo/torch_export_llama2.py new file mode 100644 index 0000000000..cc4c6beda5 --- /dev/null +++ b/examples/dynamo/torch_export_llama2.py @@ -0,0 +1,48 @@ +import torch +import torch_tensorrt +from transformers import AutoModelForCausalLM, AutoTokenizer + + +def export_llama2(model, inputs): + """ + Exports the llama2 model into an ExportedProgram + """ + with torch.no_grad(): + seq_len = torch.export.Dim("seq_len", min=2, max=1024) + ep = torch.export.export( + model, (inputs,), dynamic_shapes=({1: seq_len},), strict=False + ) + + return ep + + +# Define the Llama2 model from hugging face +# kv_cache is not supported in Torch-TRT currently. 
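Both generation examples in this series hinge on marking the sequence dimension of input_ids as dynamic: torch._dynamo.mark_dynamic for the torch.compile path and torch.export.Dim for the export-based path shown above. For reference, a minimal sketch of the export flavor on a toy embedding module; the module, shapes, and bounds here are illustrative and not part of the patch:

import torch

class ToyEmbedding(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.emb = torch.nn.Embedding(100, 16)

    def forward(self, input_ids):
        return self.emb(input_ids)

toy = ToyEmbedding().eval()
example_ids = torch.randint(0, 100, (1, 8))
# Dimension 1 (the sequence length) may vary between 2 and 1024 at runtime;
# dimension 0 (the batch) stays static.
seq_len = torch.export.Dim("seq_len", min=2, max=1024)
ep = torch.export.export(toy, (example_ids,), dynamic_shapes=({1: seq_len},))
print(ep)

The printed ExportedProgram shows the symbolic sequence dimension in the placeholder's shape, which is what the Torch-TensorRT partitioning code later turns into a min/opt/max shape range.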
+# attn_implementation=sdpa has tracing issues +llama_path = "meta-llama/Llama-2-7b-hf" +model = ( + AutoModelForCausalLM.from_pretrained( + llama_path, use_cache=False, attn_implementation="eager" + ) + .eval() + .cuda() +) +tokenizer = AutoTokenizer.from_pretrained(llama_path) + +base_prompt = "How many hours are in a day?" +base_inputs = tokenizer(base_prompt, return_tensors="pt").to("cuda:0") +input_ids = base_inputs.input_ids +pyt_out = model(input_ids) + +llama2_ep = export_llama2(model, input_ids) +trt_model = torch_tensorrt.dynamo.compile( + llama2_ep, + inputs=[input_ids], + enabled_precisions={torch.float16}, + min_block_size=5, + truncate_double=True, + torch_executed_ops={"torch.ops.aten.slice.Tensor"}, + debug=True, +) + +trt_out = model(input_ids) From 619393947088d535a28af3fd5a95041a2a83ea5b Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Fri, 14 Jun 2024 15:12:44 -0700 Subject: [PATCH 03/55] chore: updates --- examples/dynamo/torch_export_gpt2.py | 104 +++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 examples/dynamo/torch_export_gpt2.py diff --git a/examples/dynamo/torch_export_gpt2.py b/examples/dynamo/torch_export_gpt2.py new file mode 100644 index 0000000000..f9391f4d87 --- /dev/null +++ b/examples/dynamo/torch_export_gpt2.py @@ -0,0 +1,104 @@ +import torch +import torch_tensorrt +from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteriaList +from transformers.generation.stopping_criteria import ( + EosTokenCriteria, + MaxLengthCriteria, +) + + +def export_gpt2(model, inputs): + """ + Exports the llama2 model into an ExportedProgram + """ + with torch.no_grad(): + # max=1024 has contraint violation error. https://github.com/pytorch/pytorch/issues/125604 + seq_len = torch.export.Dim("seq_len", min=1, max=1024) + try: + print("Trying to export the model using torch.export.export()..") + # strict=False only enables aotautograd tracing and excludes dynamo. + ep = torch.export.export( + model, (inputs,), dynamic_shapes=({1: seq_len},), strict=False + ) + except: + print( + "Trying torch.export._trace._export to trace the graph since torch.export.export() failed" + ) + # This API is used to express the constraint violation guards as asserts in the graph. 
+ ep = torch.export._trace._export( + model, + (inputs,), + dynamic_shapes=({1: seq_len},), + strict=False, + _allow_complex_guards_as_runtime_asserts=True, + ) + + return ep + + +# Define tokenizer and model +torch_device = "cuda" if torch.cuda.is_available() else "cpu" +tokenizer = AutoTokenizer.from_pretrained("gpt2") +model = ( + AutoModelForCausalLM.from_pretrained( + "gpt2", + pad_token_id=tokenizer.eos_token_id, + use_cache=False, + ) + .eval() + .to(torch_device) +) + +# Input prompt +model_inputs = tokenizer("I enjoy walking with my cute dog", return_tensors="pt").to( + torch_device +) +input_ids = model_inputs["input_ids"] +max_tokens = 40 + +# Pyt model outputs +# greedy_output = model.generate(**model_inputs, max_new_tokens=max_tokens) +# print( +# "Pytorch model generated text: ", +# tokenizer.decode(greedy_output[0], skip_special_tokens=True), +# ) +pyt_outputs = model(input_ids) + +# Compile Torch-TRT model +gpt2_ep = export_gpt2(model, input_ids) +trt_model = torch_tensorrt.dynamo.compile( + gpt2_ep, + inputs=[input_ids], + enabled_precisions={torch.float32}, + # min_block_size=5, + truncate_double=True, + torch_executed_ops={"torch.ops.aten.slice.Tensor"}, + disable_tf32=True, + debug=True, +) + +trt_outputs = trt_model(input_ids) + +# Auto-regressive generation loop for greedy search +stopping_criteria = StoppingCriteriaList( + [ + MaxLengthCriteria(max_length=max_tokens), + EosTokenCriteria(eos_token_id=tokenizer.eos_token_id), + ] +) +token_id = 0 +while token_id < 20: + trt_outputs = model(input_ids) + logits = trt_outputs.logits + next_token_logits = logits[:, -1, :] + next_tokens = torch.argmax(next_token_logits, dim=-1) + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + if stopping_criteria(input_ids, logits).item(): + break + token_id += 1 + +# Decode the sentence +print( + "TensorRT model generated text: ", + tokenizer.decode(input_ids[0], skip_special_tokens=True), +) From 9af8e3935ec620e25bd34fb9464583102c593676 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 25 Jun 2024 11:30:56 -0700 Subject: [PATCH 04/55] chore: remove aten.full decomposition --- py/torch_tensorrt/dynamo/lowering/_decomposition_groups.py | 1 - 1 file changed, 1 deletion(-) diff --git a/py/torch_tensorrt/dynamo/lowering/_decomposition_groups.py b/py/torch_tensorrt/dynamo/lowering/_decomposition_groups.py index 1fca374dcb..177d4962d5 100644 --- a/py/torch_tensorrt/dynamo/lowering/_decomposition_groups.py +++ b/py/torch_tensorrt/dynamo/lowering/_decomposition_groups.py @@ -166,7 +166,6 @@ aten.clamp_min, aten.clamp_max, aten.linalg_vector_norm, - aten.full, aten.repeat, } torch_disabled_decompositions: Set[Union[OpOverload, OpOverloadPacket]] = { From 50d4096581a2ec513cca9dedc5dfb96ce2f62942 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 25 Jun 2024 16:26:44 -0700 Subject: [PATCH 05/55] chore: fix expand DS support --- .../dynamo/conversion/impl/slice/ops.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py index 8927367135..301b5cb0e4 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py @@ -14,6 +14,7 @@ get_trt_tensor, ) from torch_tensorrt.dynamo.conversion.impl.cat import cat +from torch_tensorrt.dynamo.conversion.impl.shape import shape as get_shape from torch_tensorrt.dynamo.conversion.impl.slice.base import slice from 
torch_tensorrt.fx.converters.converter_utils import ( has_dynamic_shape, @@ -90,10 +91,14 @@ def expand( # After the above padding, the shape and tensor rank must be equal assert len(input_t.shape) == shape_rank - # -1 denotes taking the shape from the original input tensor - shape = tuple( - [input_t.shape[i] if shape[i] == -1 else shape[i] for i in range(shape_rank)] - ) + shape_t = [] + for i in range(shape_rank): + if shape[i] == -1: + shape_t.append( + get_shape(ctx, target, source_ir, name + f"_shape_dim{i}", input_t, i) + ) + else: + shape_t.append(shape[i]) # Establish the desired output shape, strides, and starting indices input_tensor_shape = tuple(input_t.shape) @@ -102,7 +107,7 @@ def expand( [int(i == o) for i, o in zip(input_tensor_shape, shape)] ) # stride == 1 if dimensions match, 0 otherwise - shape_ = shape + shape_ = shape_t # Handle dynamic shapes case where shape has dynamic dimension if any(isinstance(ele, TRTTensor) for ele in shape): shape_ = cat( @@ -110,7 +115,7 @@ def expand( target, source_ir, name + "_shape_concat", - shape, + shape_t, 0, cast_dtype=trt.int32, ) From 59febf588b8b903f9ccb85dbb93d59fc1df2ba3e Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Wed, 26 Jun 2024 08:36:57 -0700 Subject: [PATCH 06/55] chore: minor fix --- py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py index 301b5cb0e4..e02878d0e7 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py @@ -109,7 +109,7 @@ def expand( shape_ = shape_t # Handle dynamic shapes case where shape has dynamic dimension - if any(isinstance(ele, TRTTensor) for ele in shape): + if any(isinstance(ele, TRTTensor) for ele in shape_t): shape_ = cat( ctx, target, From c3e4382c4ab4409dc0e69cc740576da43359bf65 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Wed, 26 Jun 2024 12:40:07 -0700 Subject: [PATCH 07/55] chore: updates --- .../dynamo/conversion/impl/slice/ops.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py index e02878d0e7..c1bd80f78a 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py @@ -103,9 +103,16 @@ def expand( # Establish the desired output shape, strides, and starting indices input_tensor_shape = tuple(input_t.shape) start = tuple([0] * shape_rank) - stride = tuple( - [int(i == o) for i, o in zip(input_tensor_shape, shape)] - ) # stride == 1 if dimensions match, 0 otherwise + + # TODO: Revisit stride calculation. stride[dim]=0 implies that dimension is being broadcasted. 
+ # stride should be 1 for all non-broadcasted dims + stride = [] + for i, o in zip(input_tensor_shape, shape_t): + # If the shape has ITensor, we treat it as a reshape dim instead of a broadcasted dim + if isinstance(i, int) and isinstance(o, int): + stride.append(int(i == o)) + else: + stride.append(1) shape_ = shape_t # Handle dynamic shapes case where shape has dynamic dimension From 0673db4ffd9fa4e9c062516763835b66223593b8 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Wed, 26 Jun 2024 13:53:08 -0700 Subject: [PATCH 08/55] chore: add testcase --- .../dynamo/conversion/impl/slice/ops.py | 4 ++- .../py/dynamo/conversion/test_expand_aten.py | 28 +++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py index c1bd80f78a..7a6a3f03f0 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py @@ -16,6 +16,7 @@ from torch_tensorrt.dynamo.conversion.impl.cat import cat from torch_tensorrt.dynamo.conversion.impl.shape import shape as get_shape from torch_tensorrt.dynamo.conversion.impl.slice.base import slice +from torch_tensorrt.dynamo.utils import DYNAMIC_DIM from torch_tensorrt.fx.converters.converter_utils import ( has_dynamic_shape, prepend_ones, @@ -109,7 +110,8 @@ def expand( stride = [] for i, o in zip(input_tensor_shape, shape_t): # If the shape has ITensor, we treat it as a reshape dim instead of a broadcasted dim - if isinstance(i, int) and isinstance(o, int): + # shape_t cannot have -1. If the input at this dimension has a shape of -1, set the stride to 1. This indicates that the input is dynamic and does not imply broadcasting at that specific dimension. 
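The stride rule described in the comments above mirrors how expand behaves in eager PyTorch: a broadcasted dimension is materialized with stride 0, while dimensions that already match keep a real stride. A quick self-contained illustration, independent of the converter code:

import torch

x = torch.arange(6).reshape(2, 1, 3)
y = x.expand(2, 4, 3)   # dim 1 is broadcast from 1 to 4
print(x.stride())       # (3, 3, 1)
print(y.stride())       # (3, 0, 1): the broadcast dimension advances by zero elements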
+ if isinstance(i, int) and isinstance(o, int) and i != DYNAMIC_DIM: stride.append(int(i == o)) else: stride.append(1) diff --git a/tests/py/dynamo/conversion/test_expand_aten.py b/tests/py/dynamo/conversion/test_expand_aten.py index 0d35700139..34ba53b800 100644 --- a/tests/py/dynamo/conversion/test_expand_aten.py +++ b/tests/py/dynamo/conversion/test_expand_aten.py @@ -2,6 +2,7 @@ import torch.nn as nn from parameterized import parameterized from torch.testing._internal.common_utils import run_tests +from torch_tensorrt import Input from .harness import DispatchTestCase @@ -27,6 +28,33 @@ def forward(self, x): inputs, ) + @parameterized.expand( + [ + ("2d_dim", (2, 1), (4, 1), (6, 1), (-1, 3)), + ("3d_dim", (2, 1, 1), (4, 1, 1), (6, 1, 1), (-1, 3, 4)), + ("4d_dim", (1, 1, 1, 1), (3, 1, 1, 1), (5, 1, 1, 1), (-1, 2, 3, 6)), + ("keep_dim", (2, 1, 5, 5), (4, 1, 5, 5), (6, 1, 5, 5), (-1, 3, -1, -1)), + ("different_ranks", (1, 2, 1), (1, 2, 1), (2, 2, 1), (2, -1, -1, -1)), + ] + ) + def test_expand_dynamic(self, _, min_shape, opt_shape, max_shape, expanded_shape): + class ExpandDynamic(nn.Module): + def forward(self, x): + return torch.ops.aten.expand.default(x, expanded_shape) + + input_specs = [ + Input( + dtype=torch.float32, + min_shape=min_shape, + opt_shape=opt_shape, + max_shape=max_shape, + ), + ] + self.run_test_with_dynamic_shape( + ExpandDynamic(), + input_specs, + ) + if __name__ == "__main__": run_tests() From 4464fd54dad0324b153782843e200f436fc33ae0 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Wed, 26 Jun 2024 14:14:19 -0700 Subject: [PATCH 09/55] chore: updates --- examples/dynamo/torch_export_gpt2.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/dynamo/torch_export_gpt2.py b/examples/dynamo/torch_export_gpt2.py index f9391f4d87..43af33750c 100644 --- a/examples/dynamo/torch_export_gpt2.py +++ b/examples/dynamo/torch_export_gpt2.py @@ -57,11 +57,7 @@ def export_gpt2(model, inputs): max_tokens = 40 # Pyt model outputs -# greedy_output = model.generate(**model_inputs, max_new_tokens=max_tokens) -# print( -# "Pytorch model generated text: ", -# tokenizer.decode(greedy_output[0], skip_special_tokens=True), -# ) +greedy_output = model.generate(**model_inputs, max_new_tokens=max_tokens) pyt_outputs = model(input_ids) # Compile Torch-TRT model @@ -70,24 +66,23 @@ def export_gpt2(model, inputs): gpt2_ep, inputs=[input_ids], enabled_precisions={torch.float32}, - # min_block_size=5, truncate_double=True, torch_executed_ops={"torch.ops.aten.slice.Tensor"}, disable_tf32=True, - debug=True, ) trt_outputs = trt_model(input_ids) # Auto-regressive generation loop for greedy search +max_length = len(input_ids) + max_tokens stopping_criteria = StoppingCriteriaList( [ - MaxLengthCriteria(max_length=max_tokens), + MaxLengthCriteria(max_length=max_length), EosTokenCriteria(eos_token_id=tokenizer.eos_token_id), ] ) token_id = 0 -while token_id < 20: +while token_id < max_tokens: trt_outputs = model(input_ids) logits = trt_outputs.logits next_token_logits = logits[:, -1, :] @@ -98,6 +93,11 @@ def export_gpt2(model, inputs): token_id += 1 # Decode the sentence +print( + "Pytorch model generated text: ", + tokenizer.decode(greedy_output[0], skip_special_tokens=True), +) +print("=============================") print( "TensorRT model generated text: ", tokenizer.decode(input_ids[0], skip_special_tokens=True), From 63b13cfc48fcc450843b2d292f15d65be4851e11 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Fri, 28 Jun 2024 09:56:13 -0700 Subject: [PATCH 
10/55] chore: updates --- examples/dynamo/torch_export_llama2.py | 14 +++++--- .../dynamo/partitioning/common.py | 32 +++++++++++++++---- 2 files changed, 35 insertions(+), 11 deletions(-) diff --git a/examples/dynamo/torch_export_llama2.py b/examples/dynamo/torch_export_llama2.py index cc4c6beda5..fdc814c9fa 100644 --- a/examples/dynamo/torch_export_llama2.py +++ b/examples/dynamo/torch_export_llama2.py @@ -8,7 +8,7 @@ def export_llama2(model, inputs): Exports the llama2 model into an ExportedProgram """ with torch.no_grad(): - seq_len = torch.export.Dim("seq_len", min=2, max=1024) + seq_len = torch.export.Dim("seq_len", min=1, max=64) ep = torch.export.export( model, (inputs,), dynamic_shapes=({1: seq_len},), strict=False ) @@ -38,11 +38,17 @@ def export_llama2(model, inputs): trt_model = torch_tensorrt.dynamo.compile( llama2_ep, inputs=[input_ids], - enabled_precisions={torch.float16}, - min_block_size=5, + enabled_precisions={torch.float32}, + min_block_size=1, truncate_double=True, torch_executed_ops={"torch.ops.aten.slice.Tensor"}, debug=True, + disable_tf32=True, ) -trt_out = model(input_ids) +trt_out = trt_model(input_ids) +# breakpoint() +# print("Mean diff: ", torch.mean(torch.abs(pyt_out.logits-trt_out.logits))) +print("Mean diff: ", torch.mean(torch.abs(pyt_out - trt_out))) +breakpoint() +print("done") diff --git a/py/torch_tensorrt/dynamo/partitioning/common.py b/py/torch_tensorrt/dynamo/partitioning/common.py index fdc55126ee..40900106b0 100644 --- a/py/torch_tensorrt/dynamo/partitioning/common.py +++ b/py/torch_tensorrt/dynamo/partitioning/common.py @@ -18,7 +18,10 @@ def contains_sym_int(tensor: torch.Tensor) -> bool: def construct_dynamic_input( - input_shape: torch.Size, input_dtype: torch.dtype, is_shape_tensor: bool = False + input_shape: torch.Size, + input_dtype: torch.dtype, + name: str = "", + is_shape_tensor: bool = False, ) -> Input: """ Constructs a torch_tensorrt.Input based on a symbolic input @@ -63,22 +66,28 @@ def construct_dynamic_input( opt_shape=opt_shape, max_shape=max_shape, dtype=input_dtype, + name=name, is_shape_tensor=is_shape_tensor, ) def get_input( - input_shape: torch.Size, dtype: torch.dtype, is_shape_tensor: bool = False + input_shape: torch.Size, + dtype: torch.dtype, + name: str = "", + is_shape_tensor: bool = False, ) -> Input: """ Based on type of dimensions in the input_shape, construct regular or dynamic shaped inputs """ if contains_sym_int(input_shape): return construct_dynamic_input( - input_shape, dtype, is_shape_tensor=is_shape_tensor + input_shape, dtype, name=name, is_shape_tensor=is_shape_tensor ) else: - return Input(shape=input_shape, dtype=dtype, is_shape_tensor=is_shape_tensor) + return Input( + shape=input_shape, dtype=dtype, name=name, is_shape_tensor=is_shape_tensor + ) def construct_submodule_inputs(module: torch.fx.GraphModule) -> Sequence[Input]: @@ -101,11 +110,18 @@ def construct_submodule_inputs(module: torch.fx.GraphModule) -> Sequence[Input]: input_meta = input.meta["val"] if isinstance(input_meta, (FakeTensor, torch.Tensor)): input_shape = input_meta.size() - torchtrt_inputs.append(get_input(input_shape, input_meta.dtype)) + torchtrt_inputs.append( + get_input(input_shape, input_meta.dtype, name=input.name) + ) elif isinstance(input_meta, torch.SymInt): # Assuming sym_integers | shape inputs always have torch.int64 dtype torchtrt_inputs.append( - get_input([input_meta], torch.int64, is_shape_tensor=True) + get_input( + [input_meta], + torch.int64, + name=input.name, + is_shape_tensor=True, + ) ) else: raise 
ValueError( @@ -115,7 +131,9 @@ def construct_submodule_inputs(module: torch.fx.GraphModule) -> Sequence[Input]: elif "tensor_meta" in input.meta: input_meta = input.meta["tensor_meta"] input_shape = input_meta.shape - torchtrt_inputs.append(get_input(input_shape, input_meta.dtype)) + torchtrt_inputs.append( + get_input(input_shape, input_meta.dtype, name=input.name) + ) else: raise AssertionError( f"Input {input.name} does not contain val and tensor_meta fields in the metadata. Please ensure you have exported the graph correctly" From e97a94fdb64d46889043e4058e7c1742d861b68c Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Wed, 10 Jul 2024 15:29:39 -0700 Subject: [PATCH 11/55] chore: updates --- examples/dynamo/torch_export_llama2.py | 34 +++- py/torch_tensorrt/dynamo/_compiler.py | 20 +++ .../dynamo/conversion/aten_ops_converters.py | 3 + .../dynamo/conversion/converter_utils.py | 7 +- .../dynamo/conversion/impl/cast.py | 10 +- .../dynamo/conversion/impl/elementwise/ops.py | 9 +- .../dynamo/conversion/impl/select.py | 7 +- .../dynamo/conversion/impl/slice/ops.py | 170 +++++++++++++++++- 8 files changed, 224 insertions(+), 36 deletions(-) diff --git a/examples/dynamo/torch_export_llama2.py b/examples/dynamo/torch_export_llama2.py index fdc814c9fa..b8b266e3a2 100644 --- a/examples/dynamo/torch_export_llama2.py +++ b/examples/dynamo/torch_export_llama2.py @@ -8,10 +8,26 @@ def export_llama2(model, inputs): Exports the llama2 model into an ExportedProgram """ with torch.no_grad(): - seq_len = torch.export.Dim("seq_len", min=1, max=64) - ep = torch.export.export( - model, (inputs,), dynamic_shapes=({1: seq_len},), strict=False - ) + # max=1024 has contraint violation error. https://github.com/pytorch/pytorch/issues/125604 + seq_len = torch.export.Dim("seq_len", min=1, max=1024) + try: + print("Trying to export the model using torch.export.export()..") + # strict=False only enables aotautograd tracing and excludes dynamo. + ep = torch.export.export( + model, (inputs,), dynamic_shapes=({1: seq_len},), strict=False + ) + except: + print( + "Trying torch.export._trace._export to trace the graph since torch.export.export() failed" + ) + # This API is used to express the constraint violation guards as asserts in the graph. 
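For reference, the partitioning change above ends up building torch_tensorrt.Input objects that carry both the dynamic shape range and the name of the originating graph placeholder. Constructed by hand, such an input would look roughly like the following; the shapes and the name are illustrative:

import torch
from torch_tensorrt import Input

input_ids_spec = Input(
    min_shape=(1, 1),
    opt_shape=(1, 64),
    max_shape=(1, 1024),
    dtype=torch.int64,
    name="input_ids",
)
print(input_ids_spec)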
+ ep = torch.export._trace._export( + model, + (inputs,), + dynamic_shapes=({1: seq_len},), + strict=False, + _allow_complex_guards_as_runtime_asserts=True, + ) return ep @@ -22,7 +38,7 @@ def export_llama2(model, inputs): llama_path = "meta-llama/Llama-2-7b-hf" model = ( AutoModelForCausalLM.from_pretrained( - llama_path, use_cache=False, attn_implementation="eager" + llama_path, use_cache=False, attn_implementation="sdpa" ) .eval() .cuda() @@ -41,14 +57,14 @@ def export_llama2(model, inputs): enabled_precisions={torch.float32}, min_block_size=1, truncate_double=True, - torch_executed_ops={"torch.ops.aten.slice.Tensor"}, + # torch_executed_ops={"torch.ops.aten.slice.Tensor"}, debug=True, disable_tf32=True, ) trt_out = trt_model(input_ids) -# breakpoint() +breakpoint() # print("Mean diff: ", torch.mean(torch.abs(pyt_out.logits-trt_out.logits))) print("Mean diff: ", torch.mean(torch.abs(pyt_out - trt_out))) -breakpoint() -print("done") +# breakpoint() +# print("done") diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index add9687758..bf3eba7cfc 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -276,6 +276,21 @@ def compile_module( "for the most thorough analysis" ) + def _update_sdpa_meta(gm: torch.fx.GraphModule) -> bool: + for node in gm.graph.nodes: + if node.target == torch._C._nn.scaled_dot_product_attention: + # We are using the "value" vector metadata + node.meta["val"] = node.args[2].meta["val"] + + return True + + # Update SDPA op meta since it is empty. + # TODO: Remove this once Pytorch fixes this. + if _update_sdpa_meta(gm): + logger.debug( + "Found torch._C._nn.scaled_dot_product_attention operator in subgraph. Updating its metadata manually as PyTorch doesn't provide it." 
+ ) + # If the number of supported operations is 0 or less than the block size, skip the subgraph # TODO: Add condition to second expression below when require_full_compilation is added if num_supported_ops == 0 or ( @@ -354,6 +369,11 @@ def contains_metadata(gm: torch.fx.GraphModule) -> bool: # Criteria for a module to be convertible to TRT if settings.use_fast_partitioner and "_run_on_acc" not in name: dryrun_tracker.to_run_in_torch.extend(parse_non_trt_nodes(submodule)) + logger.debug( + "Submodule in PyTorch: %s\n %s", + str(name), + str(submodule.graph), + ) continue subgraph_data = PerSubgraphData() diff --git a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py index f524531d22..017282b9d3 100644 --- a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py @@ -315,6 +315,7 @@ def aten_ops_embedding_bag( ) +@dynamo_tensorrt_converter(operator.mod, supports_dynamic_shapes=True) @dynamo_tensorrt_converter(torch.ops.aten.fmod.Scalar) @dynamo_tensorrt_converter(torch.ops.aten.fmod.Tensor) def aten_ops_fmod( @@ -1991,6 +1992,7 @@ def aten_ops_div( ) +@dynamo_tensorrt_converter(operator.pow, supports_dynamic_shapes=True) @dynamo_tensorrt_converter( torch.ops.aten.pow.Tensor_Tensor, supports_dynamic_shapes=True ) @@ -2251,6 +2253,7 @@ def aten_ops_bitwise_not( ) +@dynamo_tensorrt_converter(operator.eq, supports_dynamic_shapes=True) @dynamo_tensorrt_converter(torch.ops.aten.eq.Tensor) @dynamo_tensorrt_converter(torch.ops.aten.eq.Scalar) @enforce_tensor_types( diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index 4bff27fd26..5c4c41870f 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -160,10 +160,9 @@ def cast_trt_tensor( target_str = ConverterRegistry.qualified_name_or_str(target) target_name = f"{source_ir}_ops{('.' + target_str) if target_str else ''}" - identity_layer = ctx.net.add_identity(input_val) - identity_layer.set_output_type(0, trt_dtype) - identity_layer.name = f"Cast ITensor {input_val.name} from {input_val.dtype} to {trt_dtype} - [{target_name}]-[{name}]" - return identity_layer.get_output(0) + cast_layer = ctx.net.add_cast(input_val, trt_dtype) + cast_layer.name = f"Cast ITensor {input_val.name} from {input_val.dtype} to {trt_dtype} - [{target_name}]-[{name}]" + return cast_layer.get_output(0) else: return input_val diff --git a/py/torch_tensorrt/dynamo/conversion/impl/cast.py b/py/torch_tensorrt/dynamo/conversion/impl/cast.py index b6d024eb08..6f49547a3d 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/cast.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/cast.py @@ -2,6 +2,7 @@ from typing import Optional, Union import numpy as np +import tensorrt as trt import torch from torch.fx.node import Target from torch_tensorrt import _enums @@ -11,8 +12,6 @@ from torch_tensorrt.dynamo.conversion.converter_utils import cast_trt_tensor from torch_tensorrt.fx.types import TRTDataType, TRTTensor -import tensorrt as trt - LOGGER: logging.Logger = logging.getLogger(__name__) @@ -38,10 +37,9 @@ def to_copy( target_str = ConverterRegistry.qualified_name_or_str(target) target_name = f"{source_ir}_ops{('.' 
+ target_str) if target_str else ''}" - identity_layer = ctx.net.add_identity(input) - identity_layer.set_output_type(0, trt_dtype) - identity_layer.name = f"Forced Cast ITensor {input.name} from {input.dtype} to {trt_dtype} - [{target_name}]-[{name}]" - return identity_layer.get_output(0) + cast_layer = ctx.net.add_cast(input, trt_dtype) + cast_layer.name = f"Forced Cast ITensor {input.name} from {input.dtype} to {trt_dtype} - [{target_name}]-[{name}]" + return cast_layer.get_output(0) else: casted_tensor = cast_trt_tensor(ctx, input, dtype, name, target, source_ir) return casted_tensor diff --git a/py/torch_tensorrt/dynamo/conversion/impl/elementwise/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/elementwise/ops.py index d130980082..b9aa69fc93 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/elementwise/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/elementwise/ops.py @@ -513,12 +513,13 @@ def pow( lhs_val: Union[TRTTensor, int, float], rhs_val: Union[TRTTensor, int, float], ) -> TRTTensor: - if isinstance(lhs_val, TRTTensor) and isinstance(rhs_val, TRTTensor): - lhs_val, rhs_val = cast_int_int_div_trt_tensor(ctx, lhs_val, rhs_val, name) - - return convert_binary_elementwise( + # POW operation supports only float32 and int8 inputs + lhs_val = get_trt_tensor(ctx, lhs_val, name + "_lhs_val", trt.float32) + rhs_val = get_trt_tensor(ctx, rhs_val, name + "_rhs_val", trt.float32) + out = convert_binary_elementwise( ctx, target, source_ir, name, trt.ElementWiseOperation.POW, lhs_val, rhs_val ) + return out def floor_divide( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/select.py b/py/torch_tensorrt/dynamo/conversion/impl/select.py index 6d9a86f89b..dc5eb6996b 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/select.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/select.py @@ -114,10 +114,9 @@ def index( tensor_indices.append(ind) if not tensor_indices: - identity_layer = ctx.net.add_identity(input) - identity_layer.set_output_type(0, trt.int32) - set_layer_name(identity_layer, target, name + "_index_identity", source_ir) - return identity_layer.get_output(0) + cast_layer = ctx.net.add_cast(input, trt.int32) + set_layer_name(cast_layer, target, name + "_index_casted", source_ir) + return cast_layer.get_output(0) elif len(tensor_indices) == 1: indices_tensor = get_trt_tensor( ctx, tensor_indices[0], name + "_parameter_to_fp32_tensor" diff --git a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py index 7a6a3f03f0..776e2bec8e 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py @@ -1,4 +1,5 @@ import math +import sys from typing import Optional, Sequence import numpy as np @@ -14,6 +15,11 @@ get_trt_tensor, ) from torch_tensorrt.dynamo.conversion.impl.cat import cat +from torch_tensorrt.dynamo.conversion.impl.elementwise import floor_divide +from torch_tensorrt.dynamo.conversion.impl.elementwise.ops import ( + convert_binary_elementwise, +) +from torch_tensorrt.dynamo.conversion.impl.shape import get_shape_with_dynamic_shape from torch_tensorrt.dynamo.conversion.impl.shape import shape as get_shape from torch_tensorrt.dynamo.conversion.impl.slice.base import slice from torch_tensorrt.dynamo.utils import DYNAMIC_DIM @@ -36,29 +42,175 @@ def slice_op( # TODO: This should be slice not whatever is in base stop: Optional[int], step: int, ) -> TRTTensor: + # check if dim is same as dynamic shape dimension + # this is required when stop is 
ITensor + dynamic_input_dim_equal = False + for i in range(len(input.shape)): + if input.shape[i] == DYNAMIC_DIM and i == dim: + dynamic_input_dim_equal = True + # Special case for start being None if start is None: start = 0 # Special case for stop being None + stop_dynamic_None = False if stop is None: - stop = input.shape[dim] + stop_dynamic_None = True if input.shape[dim] == -1 else False + stop = 0 if input.shape[dim] == -1 else input.shape[dim] dim = get_positive_dim(dim, len(input.shape)) - start = get_positive_dim(start, input.shape[dim]) - stop = get_positive_dim(stop, input.shape[dim]) - if has_dynamic_shape(input.shape): - # Check whether slice target dim is dynamic shape dim - assert input.shape[dim] != -1, "Can't slice on dynamic shape dimension!" + # Assign the initial start tensor + start_slice = [] + # the add_slice will take care of dynamic input shape cases here + if isinstance(start, int): + start_slice = [0] * len(input.shape) + start_slice[dim] = start + else: + for i in range(len(input.shape)): + start_slice.append(0) if i != dim else start_slice.append(start) + + # Assign the initial stop tensor + stop_slice = [] + if isinstance(stop, int) and dynamic_input_dim_equal: + stop_slice = input.shape + stop_slice[dim] = stop + else: + # required for cases where stop is ITensor and dim != dynamic dim of input + # not required for cases where stop is negative and dim != dynamic dim of inpu + for i in range(len(input.shape)): + if input.shape[i] == DYNAMIC_DIM and i != dim: + stop_slice.append( + get_shape( + ctx, target, source_ir, name + f"_shape_dim_stop_{i}", input, i + ) + ) + elif i == dim: + stop_slice.append(stop) + else: + stop_slice.append(input.shape[i]) - start_slice = [0] * len(input.shape) - start_slice[dim] = start stride_slice = [1] * len(input.shape) stride_slice[dim] = step output_shape = list(input.shape) - output_shape[dim] = math.ceil((stop - start) / step) + if input.shape[dim] != -1 and isinstance(start, int) and isinstance(stop, int): + start = get_positive_dim(start, input.shape[dim]) + stop = get_positive_dim(stop, input.shape[dim]) + start_slice[dim] = start + else: + # the start and stop or None is dynamic along dim or or start or stop is an ITensor + if ( + not (isinstance(start, int)) + or not (isinstance(stop, int)) + or start < 0 + or stop < 0 + or stop_dynamic_None + or stop == sys.maxsize + ): + # special assignments for dynamic cases + if isinstance(start, int) and start < 0: + start_slice = input.shape + start_slice[dim] = -1 * start + if (isinstance(stop, int) and stop < 0) or stop_dynamic_None: + stop_slice = [0] * len(input.shape) + stop_slice[dim] = -1 * stop + if stop == sys.maxsize: + stop_slice = [0] * len(input.shape) + start_slice_tensor = cat( + ctx, + target, + source_ir, + name + "_start_slice_concat", + tuple(start_slice), + 0, + cast_dtype=trt.int32, + ) + stop_slice_tensor = cat( + ctx, + target, + source_ir, + name + "_stop_slice_concat", + tuple(stop_slice), + 0, + cast_dtype=trt.int32, + ) + stride_slice_tensor = cat( + ctx, + target, + source_ir, + name + "_stride_slice_concat", + tuple(stride_slice), + 0, + cast_dtype=trt.int32, + ) + + if isinstance(start, int) and start < 0: + shape = get_shape_with_dynamic_shape( + ctx, target, source_ir, name, output_shape, input + ) + start_slice_tensor = convert_binary_elementwise( + ctx, + target, + source_ir, + name + "_sub_start", + trt.ElementWiseOperation.SUB, + shape, + start_slice_tensor, + ) + if isinstance(stop, int) and ( + (stop < 0) or stop_dynamic_None or stop == 
sys.maxsize + ): + shape = get_shape_with_dynamic_shape( + ctx, target, source_ir, name, output_shape, input + ) + stop_slice_tensor = convert_binary_elementwise( + ctx, + target, + source_ir, + name + "_sub_stop", + trt.ElementWiseOperation.SUB, + shape, + stop_slice_tensor, + ) + + # this is required for the ceil operation + output_shape_tensor_num = convert_binary_elementwise( + ctx, + target, + source_ir, + name + "_sub_num", + trt.ElementWiseOperation.SUB, + start_slice_tensor, + stop_slice_tensor, + ) + output_shape_tensor_neg = floor_divide( + ctx, + target, + source_ir, + name + "_div", + output_shape_tensor_num, + stride_slice_tensor, + ) + output_shape_tensor = convert_binary_elementwise( + ctx, + target, + source_ir, + name + "_prod", + trt.ElementWiseOperation.PROD, + output_shape_tensor_neg, + -1, + ) + layer = ctx.net.add_slice( + input, start=trt.Dims(), shape=trt.Dims(), stride=trt.Dims() + ) + layer.set_input(1, start_slice_tensor) + layer.set_input(2, output_shape_tensor) + layer.set_input(3, stride_slice_tensor) + return layer.get_output(0) + + output_shape[dim] = math.ceil((stop - start) / step) return slice( ctx, target, source_ir, name, input, start_slice, output_shape, stride_slice ) From 4f503a8acd2f68b4d5b6822aeb9baed731f5dcac Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 11 Jul 2024 09:33:15 -0700 Subject: [PATCH 12/55] chore: updates --- examples/dynamo/torch_export_llama2.py | 58 ++++++++++++++++--- py/torch_tensorrt/dynamo/_compiler.py | 15 ----- .../lower_scaled_dot_product_attention.py | 5 +- 3 files changed, 53 insertions(+), 25 deletions(-) diff --git a/examples/dynamo/torch_export_llama2.py b/examples/dynamo/torch_export_llama2.py index b8b266e3a2..c4672efbdd 100644 --- a/examples/dynamo/torch_export_llama2.py +++ b/examples/dynamo/torch_export_llama2.py @@ -1,6 +1,10 @@ import torch import torch_tensorrt -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteriaList +from transformers.generation.stopping_criteria import ( + EosTokenCriteria, + MaxLengthCriteria, +) def export_llama2(model, inputs): @@ -9,7 +13,7 @@ def export_llama2(model, inputs): """ with torch.no_grad(): # max=1024 has contraint violation error. https://github.com/pytorch/pytorch/issues/125604 - seq_len = torch.export.Dim("seq_len", min=1, max=1024) + seq_len = torch.export.Dim("seq_len", min=1, max=16) try: print("Trying to export the model using torch.export.export()..") # strict=False only enables aotautograd tracing and excludes dynamo. @@ -35,7 +39,7 @@ def export_llama2(model, inputs): # Define the Llama2 model from hugging face # kv_cache is not supported in Torch-TRT currently. # attn_implementation=sdpa has tracing issues -llama_path = "meta-llama/Llama-2-7b-hf" +llama_path = "meta-llama/Llama-2-7b-chat-hf" model = ( AutoModelForCausalLM.from_pretrained( llama_path, use_cache=False, attn_implementation="sdpa" @@ -45,10 +49,13 @@ def export_llama2(model, inputs): ) tokenizer = AutoTokenizer.from_pretrained(llama_path) -base_prompt = "How many hours are in a day?" +base_prompt = "Can you explain what is dynamic programming?" 
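One detail of the dynamic slice handling above that is easy to miss: the output extent is computed as PROD(FLOOR_DIV(start - stop, step), -1), which is the standard way of getting a ceiling out of floor division, so it lines up with the math.ceil((stop - start) / step) used in the static branch. A quick check of that identity in plain Python:

import math

for start, stop, step in [(0, 10, 3), (2, 9, 2), (0, 7, 1), (5, 6, 4)]:
    expected = math.ceil((stop - start) / step)
    via_floor = -((start - stop) // step)
    assert expected == via_floor, (start, stop, step)
print("ceil((stop - start) / step) == -((start - stop) // step) for the sampled cases")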
base_inputs = tokenizer(base_prompt, return_tensors="pt").to("cuda:0") input_ids = base_inputs.input_ids + +max_tokens = 40 pyt_out = model(input_ids) +# generate_ids = model.generate(base_inputs.input_ids, max_length=max_tokens) llama2_ep = export_llama2(model, input_ids) trt_model = torch_tensorrt.dynamo.compile( @@ -57,14 +64,47 @@ def export_llama2(model, inputs): enabled_precisions={torch.float32}, min_block_size=1, truncate_double=True, - # torch_executed_ops={"torch.ops.aten.slice.Tensor"}, debug=True, disable_tf32=True, ) trt_out = trt_model(input_ids) -breakpoint() -# print("Mean diff: ", torch.mean(torch.abs(pyt_out.logits-trt_out.logits))) -print("Mean diff: ", torch.mean(torch.abs(pyt_out - trt_out))) # breakpoint() -# print("done") +# print("Mean diff: ", torch.mean(torch.abs(pyt_out.logits-trt_out.logits))) +print("Mean diff: ", torch.mean(torch.abs(pyt_out.logits - trt_out.logits))) +breakpoint() +# Auto-regressive generation loop for greedy search +max_length = len(input_ids) + max_tokens +stopping_criteria = StoppingCriteriaList( + [ + MaxLengthCriteria(max_length=max_length), + EosTokenCriteria(eos_token_id=tokenizer.eos_token_id), + ] +) + +token_id = 0 +while token_id < max_tokens: + print("Generating token: ", token_id) + trt_outputs = model(input_ids) + logits = trt_outputs.logits + next_token_logits = logits[:, -1, :] + next_tokens = torch.argmax(next_token_logits, dim=-1) + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + if stopping_criteria(input_ids, logits).item(): + break + token_id += 1 + + +# Decode the sentence +print("=============================") +# print( +# "Pytorch model generated text: ", +# tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0], +# ) +print("=============================") +print( + "TensorRT model generated text: ", + tokenizer.batch_decode( + input_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False + )[0], +) diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index bf3eba7cfc..c613ba0b2d 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -276,21 +276,6 @@ def compile_module( "for the most thorough analysis" ) - def _update_sdpa_meta(gm: torch.fx.GraphModule) -> bool: - for node in gm.graph.nodes: - if node.target == torch._C._nn.scaled_dot_product_attention: - # We are using the "value" vector metadata - node.meta["val"] = node.args[2].meta["val"] - - return True - - # Update SDPA op meta since it is empty. - # TODO: Remove this once Pytorch fixes this. - if _update_sdpa_meta(gm): - logger.debug( - "Found torch._C._nn.scaled_dot_product_attention operator in subgraph. Updating its metadata manually as PyTorch doesn't provide it." 
- ) - # If the number of supported operations is 0 or less than the block size, skip the subgraph # TODO: Add condition to second expression below when require_full_compilation is added if num_supported_ops == 0 or ( diff --git a/py/torch_tensorrt/dynamo/lowering/passes/lower_scaled_dot_product_attention.py b/py/torch_tensorrt/dynamo/lowering/passes/lower_scaled_dot_product_attention.py index ddb7e603d8..a9933ee67b 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/lower_scaled_dot_product_attention.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/lower_scaled_dot_product_attention.py @@ -1,3 +1,4 @@ +import copy import logging import operator from typing import Callable, Sequence, Tuple @@ -23,7 +24,6 @@ def lower_scaled_dot_product_attention( """ original_fns, replacement = scaled_dot_product_attention_replacement() replaced_nodes = [] - # For each original function, search for it in the graph and replace for original in original_fns: replaced_nodes += torch.fx.subgraph_rewriter.replace_pattern_with_filters( @@ -54,6 +54,9 @@ def lower_scaled_dot_product_attention( == torch.nn.functional.scaled_dot_product_attention ) + # Copy the metadata of the replaced attention node to the new node + new_attention_node.meta = copy.copy(attention_node_replaced.meta) + # If the attention operator had keyword-args, copy them to the new node if attention_node_replaced.kwargs: new_attention_node.kwargs = {**attention_node_replaced.kwargs} From 0d00d8c9ae872bf74bb93d8926c48089810f8279 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 11 Jul 2024 11:14:17 -0700 Subject: [PATCH 13/55] chore: updates --- examples/dynamo/torch_export_gpt2.py | 77 ++++------------------- examples/dynamo/torch_export_llama2.py | 84 ++++++-------------------- 2 files changed, 32 insertions(+), 129 deletions(-) diff --git a/examples/dynamo/torch_export_gpt2.py b/examples/dynamo/torch_export_gpt2.py index 43af33750c..fc6cead6c6 100644 --- a/examples/dynamo/torch_export_gpt2.py +++ b/examples/dynamo/torch_export_gpt2.py @@ -1,40 +1,7 @@ import torch import torch_tensorrt -from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteriaList -from transformers.generation.stopping_criteria import ( - EosTokenCriteria, - MaxLengthCriteria, -) - - -def export_gpt2(model, inputs): - """ - Exports the llama2 model into an ExportedProgram - """ - with torch.no_grad(): - # max=1024 has contraint violation error. https://github.com/pytorch/pytorch/issues/125604 - seq_len = torch.export.Dim("seq_len", min=1, max=1024) - try: - print("Trying to export the model using torch.export.export()..") - # strict=False only enables aotautograd tracing and excludes dynamo. - ep = torch.export.export( - model, (inputs,), dynamic_shapes=({1: seq_len},), strict=False - ) - except: - print( - "Trying torch.export._trace._export to trace the graph since torch.export.export() failed" - ) - # This API is used to express the constraint violation guards as asserts in the graph. 
- ep = torch.export._trace._export( - model, - (inputs,), - dynamic_shapes=({1: seq_len},), - strict=False, - _allow_complex_guards_as_runtime_asserts=True, - ) - - return ep - +from transformers import AutoModelForCausalLM, AutoTokenizer +from utils import export_llm, generate # Define tokenizer and model torch_device = "cuda" if torch.cuda.is_available() else "cpu" @@ -44,6 +11,7 @@ def export_gpt2(model, inputs): "gpt2", pad_token_id=tokenizer.eos_token_id, use_cache=False, + attn_implementation="eager", ) .eval() .to(torch_device) @@ -54,51 +22,32 @@ def export_gpt2(model, inputs): torch_device ) input_ids = model_inputs["input_ids"] -max_tokens = 40 +max_tokens = 20 -# Pyt model outputs -greedy_output = model.generate(**model_inputs, max_new_tokens=max_tokens) -pyt_outputs = model(input_ids) +# Auto-regressive generation loop for greedy search using PyTorch model +pyt_gen_tokens = generate(model, input_ids, max_tokens, tokenizer.eos_token_id) # Compile Torch-TRT model -gpt2_ep = export_gpt2(model, input_ids) +gpt2_ep = export_llm(model, input_ids, max_seq_len=1024) trt_model = torch_tensorrt.dynamo.compile( gpt2_ep, inputs=[input_ids], enabled_precisions={torch.float32}, truncate_double=True, - torch_executed_ops={"torch.ops.aten.slice.Tensor"}, - disable_tf32=True, + debug=True, ) -trt_outputs = trt_model(input_ids) - -# Auto-regressive generation loop for greedy search -max_length = len(input_ids) + max_tokens -stopping_criteria = StoppingCriteriaList( - [ - MaxLengthCriteria(max_length=max_length), - EosTokenCriteria(eos_token_id=tokenizer.eos_token_id), - ] -) -token_id = 0 -while token_id < max_tokens: - trt_outputs = model(input_ids) - logits = trt_outputs.logits - next_token_logits = logits[:, -1, :] - next_tokens = torch.argmax(next_token_logits, dim=-1) - input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) - if stopping_criteria(input_ids, logits).item(): - break - token_id += 1 +# Auto-regressive generation loop for greedy search using Torch-TensorRT model +generated_token_ids = generate(trt_model, input_ids, max_tokens, tokenizer.eos_token_id) # Decode the sentence +print("=============================") print( "Pytorch model generated text: ", - tokenizer.decode(greedy_output[0], skip_special_tokens=True), + tokenizer.decode(pyt_gen_tokens[0], skip_special_tokens=True), ) print("=============================") print( "TensorRT model generated text: ", - tokenizer.decode(input_ids[0], skip_special_tokens=True), + tokenizer.decode(generated_token_ids[0], skip_special_tokens=True), ) diff --git a/examples/dynamo/torch_export_llama2.py b/examples/dynamo/torch_export_llama2.py index c4672efbdd..759cc09d35 100644 --- a/examples/dynamo/torch_export_llama2.py +++ b/examples/dynamo/torch_export_llama2.py @@ -1,40 +1,7 @@ import torch import torch_tensorrt -from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteriaList -from transformers.generation.stopping_criteria import ( - EosTokenCriteria, - MaxLengthCriteria, -) - - -def export_llama2(model, inputs): - """ - Exports the llama2 model into an ExportedProgram - """ - with torch.no_grad(): - # max=1024 has contraint violation error. https://github.com/pytorch/pytorch/issues/125604 - seq_len = torch.export.Dim("seq_len", min=1, max=16) - try: - print("Trying to export the model using torch.export.export()..") - # strict=False only enables aotautograd tracing and excludes dynamo. 
- ep = torch.export.export( - model, (inputs,), dynamic_shapes=({1: seq_len},), strict=False - ) - except: - print( - "Trying torch.export._trace._export to trace the graph since torch.export.export() failed" - ) - # This API is used to express the constraint violation guards as asserts in the graph. - ep = torch.export._trace._export( - model, - (inputs,), - dynamic_shapes=({1: seq_len},), - strict=False, - _allow_complex_guards_as_runtime_asserts=True, - ) - - return ep - +from transformers import AutoModelForCausalLM, AutoTokenizer +from utils import export_llm, generate # Define the Llama2 model from hugging face # kv_cache is not supported in Torch-TRT currently. @@ -55,9 +22,11 @@ def export_llama2(model, inputs): max_tokens = 40 pyt_out = model(input_ids) -# generate_ids = model.generate(base_inputs.input_ids, max_length=max_tokens) -llama2_ep = export_llama2(model, input_ids) +# Auto-regressive generation loop for greedy search using PyTorch model +pyt_gen_tokens = generate(model, input_ids, max_tokens, tokenizer.eos_token_id) + +llama2_ep = export_llm(model, input_ids) trt_model = torch_tensorrt.dynamo.compile( llama2_ep, inputs=[input_ids], @@ -69,42 +38,27 @@ def export_llama2(model, inputs): ) trt_out = trt_model(input_ids) -# breakpoint() -# print("Mean diff: ", torch.mean(torch.abs(pyt_out.logits-trt_out.logits))) -print("Mean diff: ", torch.mean(torch.abs(pyt_out.logits - trt_out.logits))) -breakpoint() -# Auto-regressive generation loop for greedy search -max_length = len(input_ids) + max_tokens -stopping_criteria = StoppingCriteriaList( - [ - MaxLengthCriteria(max_length=max_length), - EosTokenCriteria(eos_token_id=tokenizer.eos_token_id), - ] -) -token_id = 0 -while token_id < max_tokens: - print("Generating token: ", token_id) - trt_outputs = model(input_ids) - logits = trt_outputs.logits - next_token_logits = logits[:, -1, :] - next_tokens = torch.argmax(next_token_logits, dim=-1) - input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) - if stopping_criteria(input_ids, logits).item(): - break - token_id += 1 +# Auto-regressive generation loop for greedy search +generated_token_ids = generate(trt_model, input_ids, max_tokens, tokenizer.eos_token_id) +# Check output difference +print("Mean diff: ", torch.mean(torch.abs(pyt_out.logits - trt_out.logits))) # Decode the sentence print("=============================") -# print( -# "Pytorch model generated text: ", -# tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0], -# ) +print( + "Pytorch model generated text: ", + tokenizer.batch_decode( + pyt_gen_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False + )[0], +) print("=============================") print( "TensorRT model generated text: ", tokenizer.batch_decode( - input_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False + generated_token_ids, + skip_special_tokens=True, + clean_up_tokenization_spaces=False, )[0], ) From 8099003170945461ebe8b2c1000a95fffb6cb1ad Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 11 Jul 2024 11:36:15 -0700 Subject: [PATCH 14/55] chore: updates --- examples/dynamo/utils.py | 63 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 examples/dynamo/utils.py diff --git a/examples/dynamo/utils.py b/examples/dynamo/utils.py new file mode 100644 index 0000000000..b2ce41ad68 --- /dev/null +++ b/examples/dynamo/utils.py @@ -0,0 +1,63 @@ +import torch +from transformers import StoppingCriteriaList +from 
transformers.generation.stopping_criteria import ( + EosTokenCriteria, + MaxLengthCriteria, +) + + +def export_llm(model, inputs, min_seq_len=1, max_seq_len=16): + """ + Exports the LLM model into an ExportedProgram with dynamic shapes. + In the case of guard failures due to some PyTorch kernel implements, we also + try to re-export the graph by expressing them as runtime assert nodes + """ + with torch.no_grad(): + # max=1024 has contraint violation error. https://github.com/pytorch/pytorch/issues/125604 + seq_len = torch.export.Dim("seq_len", min=min_seq_len, max=max_seq_len) + try: + print("Trying to export the model using torch.export.export()..") + # strict=False only enables aotautograd tracing and excludes dynamo. + ep = torch.export.export( + model, (inputs,), dynamic_shapes=({1: seq_len},), strict=False + ) + except: + print( + "Trying torch.export._trace._export to trace the graph since torch.export.export() failed" + ) + # This API is used to express the constraint violation guards as asserts in the graph. + ep = torch.export._trace._export( + model, + (inputs,), + dynamic_shapes=({1: seq_len},), + strict=False, + _allow_complex_guards_as_runtime_asserts=True, + ) + + return ep + + +def generate(model, input_seq, max_tokens, eos_token_id): + """ + Greedy decoding of the model. This generates up to max_tokens. + """ + max_length = len(input_seq) + max_tokens + stopping_criteria = StoppingCriteriaList( + [ + MaxLengthCriteria(max_length=max_length), + EosTokenCriteria(eos_token_id=eos_token_id), + ] + ) + token_id = 0 + while token_id < max_tokens: + print("Generating token: ", token_id) + outputs = model(input_seq) + logits = outputs.logits + next_token_logits = logits[:, -1, :] + next_tokens = torch.argmax(next_token_logits, dim=-1) + input_seq = torch.cat([input_seq, next_tokens[:, None]], dim=-1) + if stopping_criteria(input_seq, logits).item(): + break + token_id += 1 + + return input_seq From 457f706f803dcfc469d44bf1576ae861866c9f50 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 11 Jul 2024 12:02:46 -0700 Subject: [PATCH 15/55] chore: updates --- .../lowering/passes/lower_scaled_dot_product_attention.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/py/torch_tensorrt/dynamo/lowering/passes/lower_scaled_dot_product_attention.py b/py/torch_tensorrt/dynamo/lowering/passes/lower_scaled_dot_product_attention.py index a9933ee67b..91dd7f9a9a 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/lower_scaled_dot_product_attention.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/lower_scaled_dot_product_attention.py @@ -55,7 +55,11 @@ def lower_scaled_dot_product_attention( ) # Copy the metadata of the replaced attention node to the new node - new_attention_node.meta = copy.copy(attention_node_replaced.meta) + # TODO: Investigate why there are multiple FakeTensors in the metadata. + # We only use the first one as it contains the output shape information for this node. 
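Some context for the metadata copy happening at this step of the pass: FX nodes record their shape and dtype in node.meta (populated during tracing or by shape propagation), and nodes created by a graph rewrite start out with an empty meta, so the information has to be carried over explicitly. A small illustration on a toy module, not the attention pattern itself:

import torch
from torch.fx import symbolic_trace
from torch.fx.passes.shape_prop import ShapeProp

class Toy(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x) + 1

gm = symbolic_trace(Toy())
ShapeProp(gm).propagate(torch.randn(2, 3))
for node in gm.graph.nodes:
    # Every traced node now carries a TensorMetadata entry; a freshly inserted
    # replacement node would print None here until its meta is copied over.
    print(node.name, node.meta.get("tensor_meta"))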
+ new_attention_node.meta["val"] = copy.copy( + attention_node_replaced.meta["val"][0] + ) # If the attention operator had keyword-args, copy them to the new node if attention_node_replaced.kwargs: From ce3b2f80282e418ff3af5b7a4e5210b1ca20ac6b Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 11 Jul 2024 12:54:40 -0700 Subject: [PATCH 16/55] chore: updates --- .../runtime/_PythonTorchTensorRTModule.py | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py index 6c94b112a7..44e740870b 100644 --- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py @@ -6,6 +6,7 @@ import tensorrt as trt import torch +import torch_tensorrt from torch.nn import Module from torch_tensorrt._Device import Device from torch_tensorrt._enums import dtype @@ -18,8 +19,6 @@ from torch_tensorrt.dynamo.utils import DYNAMIC_DIM from torch_tensorrt.logging import TRT_LOGGER -import torch_tensorrt - logger = logging.getLogger(__name__) @@ -103,11 +102,11 @@ def _initialize(self) -> None: # Set the active stream using the current device current_stream = torch.cuda.current_stream() - if current_stream == torch.cuda.default_stream(): - self.active_stream = torch.cuda.Stream() - torch.cuda.set_stream(self.active_stream) - else: - self.active_stream = current_stream + # if current_stream == torch.cuda.default_stream(): + # self.active_stream = torch.cuda.Stream() + # torch.cuda.set_stream(self.active_stream) + # else: + self.active_stream = current_stream def _check_initialized(self) -> None: if not self.initialized: @@ -210,12 +209,12 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . torch.cuda.set_device(device_id) # Update current stream - current_stream = torch.cuda.current_stream(device) - if current_stream == torch.cuda.default_stream(device): - self.active_stream = torch.cuda.Stream(device) - torch.cuda.set_stream(self.active_stream) - else: - self.active_stream = current_stream + # current_stream = torch.cuda.current_stream(device) + # if current_stream == torch.cuda.default_stream(device): + # self.active_stream = torch.cuda.Stream(device) + # torch.cuda.set_stream(self.active_stream) + # else: + # self.active_stream = current_stream contiguous_inputs = [ tensor.to(device) for tensor in contiguous_inputs From d8acadc0f79b42817476c0737957b4de2efa0653 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 11 Jul 2024 17:09:58 -0700 Subject: [PATCH 17/55] chore: updates --- examples/dynamo/torch_export_llama2.py | 7 +++-- .../lowering/passes/_aten_lowering_pass.py | 2 ++ .../lowering/passes/remove_assert_scalar.py | 29 +++++++++++++++++++ 3 files changed, 35 insertions(+), 3 deletions(-) create mode 100644 py/torch_tensorrt/dynamo/lowering/passes/remove_assert_scalar.py diff --git a/examples/dynamo/torch_export_llama2.py b/examples/dynamo/torch_export_llama2.py index 759cc09d35..e154f41538 100644 --- a/examples/dynamo/torch_export_llama2.py +++ b/examples/dynamo/torch_export_llama2.py @@ -16,17 +16,17 @@ ) tokenizer = AutoTokenizer.from_pretrained(llama_path) -base_prompt = "Can you explain what is dynamic programming?" +base_prompt = "What is dynamic programming?" 
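The runtime patches above and below this point toggle the logic that either reuses the caller's current CUDA stream or creates a dedicated side stream when the caller is still on the default stream. The plain PyTorch pattern behind that decision, shown in isolation and assuming a CUDA device is available:

import torch

if torch.cuda.is_available():
    current = torch.cuda.current_stream()
    if current == torch.cuda.default_stream():
        # Launch work on a side stream so it can overlap with default-stream work.
        side_stream = torch.cuda.Stream()
        with torch.cuda.stream(side_stream):
            out = torch.randn(64, 64, device="cuda") @ torch.randn(64, 64, device="cuda")
        side_stream.synchronize()
    else:
        # Already on a non-default stream, so just reuse it.
        out = torch.randn(64, 64, device="cuda") @ torch.randn(64, 64, device="cuda")
    print(out.shape)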
base_inputs = tokenizer(base_prompt, return_tensors="pt").to("cuda:0") input_ids = base_inputs.input_ids -max_tokens = 40 +max_tokens = 16 pyt_out = model(input_ids) # Auto-regressive generation loop for greedy search using PyTorch model pyt_gen_tokens = generate(model, input_ids, max_tokens, tokenizer.eos_token_id) -llama2_ep = export_llm(model, input_ids) +llama2_ep = export_llm(model, input_ids, max_seq_len=32) trt_model = torch_tensorrt.dynamo.compile( llama2_ep, inputs=[input_ids], @@ -34,6 +34,7 @@ min_block_size=1, truncate_double=True, debug=True, + use_python_runtime=True, disable_tf32=True, ) diff --git a/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py b/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py index 3d1663fe0b..03af69ae02 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py @@ -8,6 +8,7 @@ from .lower_linear import lower_linear from .lower_scaled_dot_product_attention import lower_scaled_dot_product_attention from .pass_manager import DynamoPassManager +from .remove_assert_scalar import remove_assert_scalar from .remove_detach import remove_detach from .remove_input_alias_fixing_clones import remove_input_alias_fixing_clones from .repair_input_as_output import repair_input_as_output @@ -24,6 +25,7 @@ fuse_prims_broadcast, replace_max_pool_with_indices, view_to_reshape, + remove_assert_scalar, ] ) diff --git a/py/torch_tensorrt/dynamo/lowering/passes/remove_assert_scalar.py b/py/torch_tensorrt/dynamo/lowering/passes/remove_assert_scalar.py new file mode 100644 index 0000000000..a0a741805f --- /dev/null +++ b/py/torch_tensorrt/dynamo/lowering/passes/remove_assert_scalar.py @@ -0,0 +1,29 @@ +import logging +from typing import Sequence + +import torch +from torch_tensorrt.dynamo.lowering.passes.pass_utils import ( + clean_up_graph_after_modifications, +) + +logger = logging.getLogger(__name__) + + +def remove_assert_scalar( + gm: torch.fx.GraphModule, sample_inputs: Sequence[torch.Tensor] +) -> torch.fx.GraphModule: + """Remove assert_scalar ops in the graph""" + count = 0 + for node in gm.graph.nodes: + # node.target = "detach" in torch.compile workflow + if node.target == torch.ops.aten._assert_scalar.default: + # Detach node has only one input + gm.graph.erase_node(node) + count += 1 + + if count > 0: + gm = clean_up_graph_after_modifications(gm) + + logger.debug(f"Removed {count} assert_scalar nodes:\n{gm.graph}") + + return gm From 262c87dab545973fd4a3a6b26a22df11a5fe841f Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Fri, 12 Jul 2024 15:28:01 -0700 Subject: [PATCH 18/55] chore: updates --- examples/dynamo/torch_export_llama2.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/dynamo/torch_export_llama2.py b/examples/dynamo/torch_export_llama2.py index e154f41538..9c8f61dad9 100644 --- a/examples/dynamo/torch_export_llama2.py +++ b/examples/dynamo/torch_export_llama2.py @@ -9,7 +9,7 @@ llama_path = "meta-llama/Llama-2-7b-chat-hf" model = ( AutoModelForCausalLM.from_pretrained( - llama_path, use_cache=False, attn_implementation="sdpa" + llama_path, use_cache=False, attn_implementation="eager" ) .eval() .cuda() @@ -20,13 +20,13 @@ base_inputs = tokenizer(base_prompt, return_tensors="pt").to("cuda:0") input_ids = base_inputs.input_ids -max_tokens = 16 +max_tokens = 32 pyt_out = model(input_ids) # Auto-regressive generation loop for greedy search using PyTorch model pyt_gen_tokens = generate(model, input_ids, 
max_tokens, tokenizer.eos_token_id) -llama2_ep = export_llm(model, input_ids, max_seq_len=32) +llama2_ep = export_llm(model, input_ids, max_seq_len=64) trt_model = torch_tensorrt.dynamo.compile( llama2_ep, inputs=[input_ids], From 736b83974980b8b0e0cc31945e284398d90aa39d Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Wed, 17 Jul 2024 15:09:58 -0700 Subject: [PATCH 19/55] chore: updates --- .../dynamo/conversion/aten_ops_converters.py | 1 + .../runtime/_PythonTorchTensorRTModule.py | 22 +++++++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py index 184060f339..e85b3cd1aa 100644 --- a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py @@ -2728,6 +2728,7 @@ def attention_validator(node: Node) -> bool: @dynamo_tensorrt_converter( torch.nn.functional.scaled_dot_product_attention, capability_validator=attention_validator, + supports_dynamic_shapes=True, ) def tensorrt_scaled_dot_product_attention( ctx: ConversionContext, diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py index 44e740870b..c911c44879 100644 --- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py @@ -102,11 +102,11 @@ def _initialize(self) -> None: # Set the active stream using the current device current_stream = torch.cuda.current_stream() - # if current_stream == torch.cuda.default_stream(): - # self.active_stream = torch.cuda.Stream() - # torch.cuda.set_stream(self.active_stream) - # else: - self.active_stream = current_stream + if current_stream == torch.cuda.default_stream(): + self.active_stream = torch.cuda.Stream() + torch.cuda.set_stream(self.active_stream) + else: + self.active_stream = current_stream def _check_initialized(self) -> None: if not self.initialized: @@ -209,12 +209,12 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . 
torch.cuda.set_device(device_id) # Update current stream - # current_stream = torch.cuda.current_stream(device) - # if current_stream == torch.cuda.default_stream(device): - # self.active_stream = torch.cuda.Stream(device) - # torch.cuda.set_stream(self.active_stream) - # else: - # self.active_stream = current_stream + current_stream = torch.cuda.current_stream(device) + if current_stream == torch.cuda.default_stream(device): + self.active_stream = torch.cuda.Stream(device) + torch.cuda.set_stream(self.active_stream) + else: + self.active_stream = current_stream contiguous_inputs = [ tensor.to(device) for tensor in contiguous_inputs From 313380e79af33ccaf144f96a1c33f167f9d1a059 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 18 Jul 2024 13:45:45 -0700 Subject: [PATCH 20/55] chore: bug fixes --- .../dynamo/conversion/impl/full.py | 10 +++- .../dynamo/conversion/impl/slice/ops.py | 57 ++++++++++++------- 2 files changed, 43 insertions(+), 24 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/full.py b/py/torch_tensorrt/dynamo/conversion/impl/full.py index 4c4a4d6b44..6886564ec9 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/full.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/full.py @@ -21,9 +21,15 @@ def full( shape: Union[List[int], TRTTensor], fill_value: Union[int, float, bool], ) -> TRTTensor: - # in static shape scenario, shape is a list of int + if isinstance(shape, List): - return np.full(shape, fill_value) + # in static shape scenario, shape is a list of int + if all(isinstance(dim, int) for dim in shape): + return np.full(shape, fill_value) + else: + shape = impl.cat.cat( + ctx, target, source_ir, name + "_concat_shape", shape, 0 + ) # in dynamic shape scenario, shape is a shap tensor # use IFillLayer to fill the shape tensor with LINSPACE value diff --git a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py index 776e2bec8e..4f6d66c44c 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py @@ -226,6 +226,7 @@ def expand( ) -> TRTTensor: shape_rank = len(shape) initial_tensor_rank = len(input_t.shape) + # If the rank of the input tensor is less than the shape's rank, pad with ones if initial_tensor_rank < shape_rank: input_t = prepend_ones( @@ -244,39 +245,49 @@ def expand( # After the above padding, the shape and tensor rank must be equal assert len(input_t.shape) == shape_rank - shape_t = [] - for i in range(shape_rank): - if shape[i] == -1: - shape_t.append( - get_shape(ctx, target, source_ir, name + f"_shape_dim{i}", input_t, i) - ) - else: - shape_t.append(shape[i]) - - # Establish the desired output shape, strides, and starting indices - input_tensor_shape = tuple(input_t.shape) + # Configure the start, strides and output shape tensors start = tuple([0] * shape_rank) - # TODO: Revisit stride calculation. stride[dim]=0 implies that dimension is being broadcasted. + # stride[dim]=0 implies that dimension is being broadcasted. # stride should be 1 for all non-broadcasted dims stride = [] - for i, o in zip(input_tensor_shape, shape_t): - # If the shape has ITensor, we treat it as a reshape dim instead of a broadcasted dim - # shape_t cannot have -1. If the input at this dimension has a shape of -1, set the stride to 1. This indicates that the input is dynamic and does not imply broadcasting at that specific dimension. 
- if isinstance(i, int) and isinstance(o, int) and i != DYNAMIC_DIM: + input_tensor_shape = tuple(input_t.shape) + for i, o in zip(input_tensor_shape, shape): + # If input dim and target shape dim are static, broadcast if they are not equal + # If a dimension of target shape has ITensor, we treat it as a broadcasted dim + if ( + isinstance(i, int) + and i != DYNAMIC_DIM + and isinstance(o, int) + and o != DYNAMIC_DIM + ): stride.append(int(i == o)) + elif isinstance(o, TRTTensor): + stride.append(0) else: + # No broadcasting is happening. The output should have the same size as input at this dimension. stride.append(1) - shape_ = shape_t + # Resolve dynamic dimensions in the target shape. These are not broadcasted dims. + # The value at this dimension should be same as input. + target_shape = [] + for i in range(shape_rank): + if shape[i] == DYNAMIC_DIM: + target_shape.append( + get_shape(ctx, target, source_ir, name + f"_shape_dim{i}", input_t, i) + ) + else: + target_shape.append(shape[i]) + + target_shape_t = target_shape # Handle dynamic shapes case where shape has dynamic dimension - if any(isinstance(ele, TRTTensor) for ele in shape_t): - shape_ = cat( + if any(isinstance(ele, TRTTensor) for ele in target_shape_t): + target_shape_t = cat( ctx, target, source_ir, name + "_shape_concat", - shape_t, + target_shape_t, 0, cast_dtype=trt.int32, ) @@ -302,10 +313,12 @@ def expand( input_t, start=trt.Dims(), shape=trt.Dims(), stride=trt.Dims() ) layer.set_input(1, start_tensor) - layer.set_input(2, shape_) + layer.set_input(2, target_shape_t) layer.set_input(3, stride_tensor) else: - layer = ctx.net.add_slice(input_t, start=start, shape=shape_, stride=stride) + layer = ctx.net.add_slice( + input_t, start=start, shape=target_shape_t, stride=stride + ) set_layer_name(layer, target, name, source_ir) return layer.get_output(0) From 1057d839cac7ab078bc1cf41a0460f09ca2b3c09 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Fri, 19 Jul 2024 13:19:14 -0700 Subject: [PATCH 21/55] chore: updates --- py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py index 4f6d66c44c..f64836d9fa 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py @@ -254,7 +254,7 @@ def expand( input_tensor_shape = tuple(input_t.shape) for i, o in zip(input_tensor_shape, shape): # If input dim and target shape dim are static, broadcast if they are not equal - # If a dimension of target shape has ITensor, we treat it as a broadcasted dim + # If input dim is known and target shape dim is dynamic we treat it as a broadcasted dim if ( isinstance(i, int) and i != DYNAMIC_DIM @@ -262,7 +262,7 @@ def expand( and o != DYNAMIC_DIM ): stride.append(int(i == o)) - elif isinstance(o, TRTTensor): + elif isinstance(i, int) and i != DYNAMIC_DIM and isinstance(o, TRTTensor): stride.append(0) else: # No broadcasting is happening. The output should have the same size as input at this dimension. 
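
As a quick reference for the stride rule introduced above, here is a minimal standalone sketch of
the selection logic (assumptions for illustration only: DYNAMIC_DIM is -1, and dynamic target
dimensions are represented by non-int placeholders standing in for TRTTensor values):

DYNAMIC_DIM = -1  # assumed sentinel for a dynamic dimension

def expand_strides(input_shape, target_shape):
    # Mirrors the branch logic above: stride 0 broadcasts a dimension, stride 1 keeps the input size.
    strides = []
    for i, o in zip(input_shape, target_shape):
        if (
            isinstance(i, int) and i != DYNAMIC_DIM
            and isinstance(o, int) and o != DYNAMIC_DIM
        ):
            strides.append(int(i == o))  # stride 1 when sizes match, stride 0 (broadcast) otherwise
        elif isinstance(i, int) and i != DYNAMIC_DIM and not isinstance(o, int):
            strides.append(0)  # static input dim vs. dynamic target dim: treat as broadcast
        else:
            strides.append(1)  # dynamic input keeps its own size at this dimension
    return strides

# Example: expanding an input of shape (1, 3, 1) to (4, 3, s0) with s0 dynamic:
# expand_strides((1, 3, 1), (4, 3, "s0")) -> [0, 1, 0]
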
From bfd0cf235f2d66820d2ac813ba8babaf787254de Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Fri, 19 Jul 2024 17:50:36 -0700 Subject: [PATCH 22/55] chore: fixes --- core/runtime/execute_engine.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp index 6868bc47ce..5fe97b961b 100644 --- a/core/runtime/execute_engine.cpp +++ b/core/runtime/execute_engine.cpp @@ -114,6 +114,9 @@ std::vector execute_engine(std::vector inputs, c10::intr // Whether cudagraphs needs to record the graph on this pass bool need_cudagraphs_record = (CUDAGRAPHS_MODE && !_cudagraphs_validate_shapes(inputs, compiled_engine)); + // this is a buffer to store shape tensor input addresses throughout the runtime scope + std::list> inputShapeTensorValues; + // Intialize outputs to be available throughout the succeeding scopes std::vector outputs(compiled_engine->num_io.second); @@ -177,8 +180,6 @@ std::vector execute_engine(std::vector inputs, c10::intr } } - // this is a buffer to store shape tensor input addresses throughout the runtime scope - std::list> inputShapeTensorValues; { std::unique_ptr input_profiler_guard; if (compiled_engine->profile_execution) { @@ -200,12 +201,12 @@ std::vector execute_engine(std::vector inputs, c10::intr at::Tensor contig_input; if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) { - // Shape tensor inputs are casted to int32 explicitly. + // Shape tensor inputs are casted to int64 explicitly // Refer to // https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435 - auto input_cpu = inputs[i].clone().contiguous().cpu().to(torch::kInt32); - std::vector inputs_cpu_vec( - input_cpu.data_ptr(), input_cpu.data_ptr() + input_cpu.numel()); + auto input_cpu = inputs[i].clone().contiguous().cpu().to(torch::kInt64); + std::vector inputs_cpu_vec( + input_cpu.data_ptr(), input_cpu.data_ptr() + input_cpu.numel()); inputShapeTensorValues.emplace_back(inputs_cpu_vec); TORCHTRT_CHECK( compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()), @@ -273,6 +274,7 @@ std::vector execute_engine(std::vector inputs, c10::intr if (!CUDAGRAPHS_MODE) { // If not in cudagraphs mode, proceed with enqueueV3 as normal + c10::cuda::CUDAStream stream = c10::cuda::getCurrentCUDAStream(inputs[0].device().index()); compiled_engine->exec_ctx->enqueueV3(compiled_engine->active_stream); } else if (need_cudagraphs_record) { // If cudagraphs needs to record a graph, capture the enqueueV3 call in a graph From 17ddb3191ac6dda26cdcb9f9abb4bcf3b2ebcb74 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Fri, 19 Jul 2024 17:51:44 -0700 Subject: [PATCH 23/55] chore: updates --- core/runtime/execute_engine.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp index 5fe97b961b..e6d21474f2 100644 --- a/core/runtime/execute_engine.cpp +++ b/core/runtime/execute_engine.cpp @@ -274,7 +274,6 @@ std::vector execute_engine(std::vector inputs, c10::intr if (!CUDAGRAPHS_MODE) { // If not in cudagraphs mode, proceed with enqueueV3 as normal - c10::cuda::CUDAStream stream = c10::cuda::getCurrentCUDAStream(inputs[0].device().index()); compiled_engine->exec_ctx->enqueueV3(compiled_engine->active_stream); } else if (need_cudagraphs_record) { // If cudagraphs needs to record a graph, capture the enqueueV3 call in a graph From 88be4fa8c37e475502d8d904cd0d84c25dc02753 Mon Sep 17 
00:00:00 2001 From: Dheeraj Peri Date: Mon, 22 Jul 2024 14:49:52 -0700 Subject: [PATCH 24/55] chore: add torch compile gpt2 example --- examples/dynamo/torch_compile_gpt2.py | 50 ++++++++++----------------- 1 file changed, 18 insertions(+), 32 deletions(-) diff --git a/examples/dynamo/torch_compile_gpt2.py b/examples/dynamo/torch_compile_gpt2.py index ddbb2573e6..6285266973 100644 --- a/examples/dynamo/torch_compile_gpt2.py +++ b/examples/dynamo/torch_compile_gpt2.py @@ -1,17 +1,17 @@ import torch import torch_tensorrt -from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteriaList -from transformers.generation.stopping_criteria import ( - EosTokenCriteria, - MaxLengthCriteria, -) +from transformers import AutoModelForCausalLM, AutoTokenizer +from utils import generate # Define tokenizer and model torch_device = "cuda" if torch.cuda.is_available() else "cpu" tokenizer = AutoTokenizer.from_pretrained("gpt2") model = ( AutoModelForCausalLM.from_pretrained( - "gpt2", pad_token_id=tokenizer.eos_token_id, use_cache=False + "gpt2", + pad_token_id=tokenizer.eos_token_id, + use_cache=False, + attn_implementation="eager", ) .eval() .to(torch_device) @@ -22,14 +22,10 @@ torch_device ) input_ids = model_inputs["input_ids"] -max_tokens = 40 +max_tokens = 20 -# Pyt model outputs -greedy_output = model.generate(**model_inputs, max_new_tokens=max_tokens) -print( - "Pytorch model generated text: ", - tokenizer.decode(greedy_output[0], skip_special_tokens=True), -) +# Auto-regressive generation loop for greedy search using PyTorch model +pyt_gen_tokens = generate(model, input_ids, max_tokens, tokenizer.eos_token_id) # Compile Torch-TRT model torch._dynamo.mark_dynamic(input_ids, 1, min=7, max=1023) @@ -39,32 +35,22 @@ dynamic=None, options={ "enabled_precisions": {torch.float}, - "torch_executed_ops": {"torch.ops.aten.slice.Tensor"}, "debug": True, "disable_tf32": True, }, ) -# Auto-regressive generation loop for greedy search -stopping_criteria = StoppingCriteriaList( - [ - MaxLengthCriteria(max_length=max_tokens), - EosTokenCriteria(eos_token_id=tokenizer.eos_token_id), - ] -) -token_id = 0 -while token_id < 20: - trt_outputs = model(input_ids) - logits = trt_outputs.logits - next_token_logits = logits[:, -1, :] - next_tokens = torch.argmax(next_token_logits, dim=-1) - input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) - if stopping_criteria(input_ids, logits).item(): - break - token_id += 1 +# Auto-regressive generation loop for greedy search using Torch-TensorRT model +generated_token_ids = generate(model, input_ids, max_tokens, tokenizer.eos_token_id) # Decode the sentence +print("=============================") +print( + "Pytorch model generated text: ", + tokenizer.decode(pyt_gen_tokens[0], skip_special_tokens=True), +) +print("=============================") print( "TensorRT model generated text: ", - tokenizer.decode(input_ids[0], skip_special_tokens=True), + tokenizer.decode(generated_token_ids[0], skip_special_tokens=True), ) From df825aba0b627df861bade011a18ff822a84593c Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 22 Jul 2024 16:11:14 -0700 Subject: [PATCH 25/55] chore: updates --- WORKSPACE | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/WORKSPACE b/WORKSPACE index 225bc0688e..f38d606dbe 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -56,19 +56,19 @@ new_local_repository( # Tarballs and fetched dependencies (default - use in cases when building from precompiled bin and tarballs) 
############################################################################################################# -http_archive( - name = "libtorch", - build_file = "@//third_party/libtorch:BUILD", - strip_prefix = "libtorch", - urls = ["https://download.pytorch.org/libtorch/nightly/cu124/libtorch-cxx11-abi-shared-with-deps-latest.zip"], -) - -http_archive( - name = "libtorch_pre_cxx11_abi", - build_file = "@//third_party/libtorch:BUILD", - strip_prefix = "libtorch", - urls = ["https://download.pytorch.org/libtorch/nightly/cu124/libtorch-shared-with-deps-latest.zip"], -) +# http_archive( +# name = "libtorch", +# build_file = "@//third_party/libtorch:BUILD", +# strip_prefix = "libtorch", +# urls = ["https://download.pytorch.org/libtorch/nightly/cu124/libtorch-cxx11-abi-shared-with-deps-latest.zip"], +# ) + +# http_archive( +# name = "libtorch_pre_cxx11_abi", +# build_file = "@//third_party/libtorch:BUILD", +# strip_prefix = "libtorch", +# urls = ["https://download.pytorch.org/libtorch/nightly/cu124/libtorch-shared-with-deps-latest.zip"], +# ) http_archive( name = "libtorch_win", @@ -112,17 +112,17 @@ http_archive( # x86_64 python distribution. If using NVIDIA's version just point to the root of the package # for both versions here and do not use --config=pre-cxx11-abi -#new_local_repository( -# name = "libtorch", -# path = "/usr/local/lib/python3.6/dist-packages/torch", -# build_file = "third_party/libtorch/BUILD" -#) +new_local_repository( + name = "libtorch", + build_file = "third_party/libtorch/BUILD", + path = "/usr/local/lib/python3.6/dist-packages/torch", +) -#new_local_repository( -# name = "libtorch_pre_cxx11_abi", -# path = "/usr/local/lib/python3.6/dist-packages/torch", -# build_file = "third_party/libtorch/BUILD" -#) +new_local_repository( + name = "libtorch_pre_cxx11_abi", + build_file = "third_party/libtorch/BUILD", + path = "/usr/local/lib/python3.6/dist-packages/torch", +) #new_local_repository( # name = "tensorrt", From ff072959cc33a5ec1d9d2104e8445a9a0bad85f6 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Wed, 24 Jul 2024 09:22:24 -0700 Subject: [PATCH 26/55] chore: add timing calculation --- examples/dynamo/torch_export_gpt2.py | 42 +++++++++++++++++++++++++--- examples/dynamo/utils.py | 6 ++-- 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/examples/dynamo/torch_export_gpt2.py b/examples/dynamo/torch_export_gpt2.py index fc6cead6c6..7b1df0824b 100644 --- a/examples/dynamo/torch_export_gpt2.py +++ b/examples/dynamo/torch_export_gpt2.py @@ -1,8 +1,27 @@ +import copy +import time + +import numpy as np import torch import torch_tensorrt from transformers import AutoModelForCausalLM, AutoTokenizer from utils import export_llm, generate + +def time_generate(model, inputs, max_tokens, eos_token_id, iterations=10): + timings = [] + for _ in range(iterations): + start_time = time.time() + inputs_copy = copy.copy(inputs) + generate(model, inputs_copy, max_tokens, eos_token_id) + timings.append(time.time() - start_time) + + time_mean = np.mean(timings) * 1000 # convert to ms + time_med = np.median(timings) * 1000 # convert to ms + + return time_mean, time_med + + # Define tokenizer and model torch_device = "cuda" if torch.cuda.is_available() else "cpu" tokenizer = AutoTokenizer.from_pretrained("gpt2") @@ -15,6 +34,7 @@ ) .eval() .to(torch_device) + .half() ) # Input prompt @@ -25,20 +45,30 @@ max_tokens = 20 # Auto-regressive generation loop for greedy search using PyTorch model -pyt_gen_tokens = generate(model, input_ids, max_tokens, tokenizer.eos_token_id) 
+pyt_gen_tokens, num_tokens_gen = generate( + model, input_ids, max_tokens, tokenizer.eos_token_id +) +pyt_mean_time, pyt_med_time = time_generate( + model, input_ids, max_tokens, tokenizer.eos_token_id +) # Compile Torch-TRT model gpt2_ep = export_llm(model, input_ids, max_seq_len=1024) trt_model = torch_tensorrt.dynamo.compile( gpt2_ep, inputs=[input_ids], - enabled_precisions={torch.float32}, + enabled_precisions={torch.float16}, truncate_double=True, - debug=True, + debug=False, ) # Auto-regressive generation loop for greedy search using Torch-TensorRT model -generated_token_ids = generate(trt_model, input_ids, max_tokens, tokenizer.eos_token_id) +generated_token_ids, num_tokens_gen = generate( + trt_model, input_ids, max_tokens, tokenizer.eos_token_id +) +trt_mean_time, trt_med_time = time_generate( + trt_model, input_ids, max_tokens, tokenizer.eos_token_id +) # Decode the sentence print("=============================") @@ -46,8 +76,12 @@ "Pytorch model generated text: ", tokenizer.decode(pyt_gen_tokens[0], skip_special_tokens=True), ) +print(f"Pytorch total tokens generated: {num_tokens_gen}") +print(f"Pytorch total mean time in ms: {pyt_mean_time} median time: {pyt_med_time}") print("=============================") print( "TensorRT model generated text: ", tokenizer.decode(generated_token_ids[0], skip_special_tokens=True), ) +print(f"TensorRT total tokens generated: {num_tokens_gen}") +print(f"TensorRT total mean time in ms: {trt_mean_time} median time: {trt_med_time}") diff --git a/examples/dynamo/utils.py b/examples/dynamo/utils.py index b2ce41ad68..a7450652b1 100644 --- a/examples/dynamo/utils.py +++ b/examples/dynamo/utils.py @@ -31,7 +31,7 @@ def export_llm(model, inputs, min_seq_len=1, max_seq_len=16): (inputs,), dynamic_shapes=({1: seq_len},), strict=False, - _allow_complex_guards_as_runtime_asserts=True, + allow_complex_guards_as_runtime_asserts=True, ) return ep @@ -50,7 +50,7 @@ def generate(model, input_seq, max_tokens, eos_token_id): ) token_id = 0 while token_id < max_tokens: - print("Generating token: ", token_id) + # print("Generating token: ", token_id) outputs = model(input_seq) logits = outputs.logits next_token_logits = logits[:, -1, :] @@ -60,4 +60,4 @@ def generate(model, input_seq, max_tokens, eos_token_id): break token_id += 1 - return input_seq + return input_seq, token_id From 6c9b9fe42401d219868189dbeda289898d4543cd Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 5 Aug 2024 12:54:26 -0700 Subject: [PATCH 27/55] chore: updates --- MODULE.bazel | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/MODULE.bazel b/MODULE.bazel index c747a8d1d2..958ea92f1b 100644 --- a/MODULE.bazel +++ b/MODULE.bazel @@ -51,19 +51,19 @@ http_archive = use_repo_rule("@bazel_tools//tools/build_defs/repo:http.bzl", "ht # Tarballs and fetched dependencies (default - use in cases when building from precompiled bin and tarballs) ############################################################################################################# -# http_archive( -# name = "libtorch", -# build_file = "@//third_party/libtorch:BUILD", -# strip_prefix = "libtorch", -# urls = ["https://download.pytorch.org/libtorch/nightly/cu124/libtorch-cxx11-abi-shared-with-deps-latest.zip"], -# ) - -# http_archive( -# name = "libtorch_pre_cxx11_abi", -# build_file = "@//third_party/libtorch:BUILD", -# strip_prefix = "libtorch", -# urls = ["https://download.pytorch.org/libtorch/nightly/cu124/libtorch-shared-with-deps-latest.zip"], -# ) 
+http_archive( + name = "libtorch", + build_file = "@//third_party/libtorch:BUILD", + strip_prefix = "libtorch", + urls = ["https://download.pytorch.org/libtorch/nightly/cu124/libtorch-cxx11-abi-shared-with-deps-latest.zip"], +) + +http_archive( + name = "libtorch_pre_cxx11_abi", + build_file = "@//third_party/libtorch:BUILD", + strip_prefix = "libtorch", + urls = ["https://download.pytorch.org/libtorch/nightly/cu124/libtorch-shared-with-deps-latest.zip"], +) http_archive( name = "libtorch_win", @@ -107,17 +107,17 @@ http_archive( # x86_64 python distribution. If using NVIDIA's version just point to the root of the package # for both versions here and do not use --config=pre-cxx11-abi -new_local_repository( - name = "libtorch", - build_file = "third_party/libtorch/BUILD", - path = "/usr/local/lib/python3.6/dist-packages/torch", -) +#new_local_repository( +# name = "libtorch", +# path = "/usr/local/lib/python3.6/dist-packages/torch", +# build_file = "third_party/libtorch/BUILD" +#) -new_local_repository( - name = "libtorch_pre_cxx11_abi", - build_file = "third_party/libtorch/BUILD", - path = "/usr/local/lib/python3.6/dist-packages/torch", -) +#new_local_repository( +# name = "libtorch_pre_cxx11_abi", +# path = "/usr/local/lib/python3.6/dist-packages/torch", +# build_file = "third_party/libtorch/BUILD" +#) #new_local_repository( # name = "tensorrt", From 6313b1c1fa52b076863e07c2a9336889604560a9 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Fri, 9 Aug 2024 18:50:44 +0000 Subject: [PATCH 28/55] chore: updates --- examples/dynamo/torch_export_llama2.py | 39 ++++---- examples/dynamo/utils.py | 14 +-- py/torch_tensorrt/dynamo/_DryRunTracker.py | 57 ++++++----- py/torch_tensorrt/dynamo/_compiler.py | 51 ++-------- .../dynamo/conversion/_conversion.py | 14 ++- .../dynamo/conversion/impl/matmul.py | 1 + .../dynamo/lowering/_decompositions.py | 2 +- .../lowering/passes/constant_folding.py | 4 +- .../passes/replace_max_pool_with_indices.py | 3 +- .../dynamo/partitioning/common.py | 33 +------ py/torch_tensorrt/dynamo/utils.py | 99 +++++++++++++++++++ 11 files changed, 189 insertions(+), 128 deletions(-) diff --git a/examples/dynamo/torch_export_llama2.py b/examples/dynamo/torch_export_llama2.py index 9c8f61dad9..e08ebb307c 100644 --- a/examples/dynamo/torch_export_llama2.py +++ b/examples/dynamo/torch_export_llama2.py @@ -3,29 +3,31 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from utils import export_llm, generate +# Define the parameters +MAX_TOKENS = 32 +DEVICE = torch.device("cuda:0") + # Define the Llama2 model from hugging face # kv_cache is not supported in Torch-TRT currently. -# attn_implementation=sdpa has tracing issues +# CPU is used here so that GPU memory is reserved for TRT compilation. llama_path = "meta-llama/Llama-2-7b-chat-hf" -model = ( - AutoModelForCausalLM.from_pretrained( - llama_path, use_cache=False, attn_implementation="eager" +with torch.no_grad(): + model = ( + AutoModelForCausalLM.from_pretrained( + llama_path, use_cache=False, attn_implementation="eager" + ) + .eval() ) - .eval() - .cuda() -) -tokenizer = AutoTokenizer.from_pretrained(llama_path) +tokenizer = AutoTokenizer.from_pretrained(llama_path) base_prompt = "What is dynamic programming?" 
-base_inputs = tokenizer(base_prompt, return_tensors="pt").to("cuda:0") +base_inputs = tokenizer(base_prompt, return_tensors="pt") input_ids = base_inputs.input_ids -max_tokens = 32 -pyt_out = model(input_ids) - # Auto-regressive generation loop for greedy search using PyTorch model -pyt_gen_tokens = generate(model, input_ids, max_tokens, tokenizer.eos_token_id) +pyt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) +# Export the llama2 model into an ExportedProgram which is input of TRT compilation llama2_ep = export_llm(model, input_ids, max_seq_len=64) trt_model = torch_tensorrt.dynamo.compile( llama2_ep, @@ -34,17 +36,14 @@ min_block_size=1, truncate_double=True, debug=True, - use_python_runtime=True, + device=DEVICE, disable_tf32=True, ) -trt_out = trt_model(input_ids) - # Auto-regressive generation loop for greedy search -generated_token_ids = generate(trt_model, input_ids, max_tokens, tokenizer.eos_token_id) - -# Check output difference -print("Mean diff: ", torch.mean(torch.abs(pyt_out.logits - trt_out.logits))) +# Move inputs to GPU +input_ids = input_ids.to(DEVICE) +generated_token_ids = generate(trt_model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) # Decode the sentence print("=============================") diff --git a/examples/dynamo/utils.py b/examples/dynamo/utils.py index a7450652b1..25ad99c12d 100644 --- a/examples/dynamo/utils.py +++ b/examples/dynamo/utils.py @@ -41,23 +41,23 @@ def generate(model, input_seq, max_tokens, eos_token_id): """ Greedy decoding of the model. This generates up to max_tokens. """ - max_length = len(input_seq) + max_tokens + # Max length of output seq = current input_seq length + max_tokens allowed to generate + max_output_seq_length = input_seq.shape[1] + max_tokens stopping_criteria = StoppingCriteriaList( [ - MaxLengthCriteria(max_length=max_length), + MaxLengthCriteria(max_length=max_output_seq_length), EosTokenCriteria(eos_token_id=eos_token_id), ] ) - token_id = 0 - while token_id < max_tokens: - # print("Generating token: ", token_id) + + while True: outputs = model(input_seq) logits = outputs.logits next_token_logits = logits[:, -1, :] next_tokens = torch.argmax(next_token_logits, dim=-1) input_seq = torch.cat([input_seq, next_tokens[:, None]], dim=-1) + # TODO: Handle batch in this check if stopping_criteria(input_seq, logits).item(): break - token_id += 1 - return input_seq, token_id + return input_seq diff --git a/py/torch_tensorrt/dynamo/_DryRunTracker.py b/py/torch_tensorrt/dynamo/_DryRunTracker.py index 46d99ffe31..ecb6dfaef4 100644 --- a/py/torch_tensorrt/dynamo/_DryRunTracker.py +++ b/py/torch_tensorrt/dynamo/_DryRunTracker.py @@ -20,18 +20,18 @@ class PerSubgraphData: Args: subgraph_name (str): Name of the subgraph in the GraphModule subgraph_op_count (int): Number of operations in the subgraph - subgraph_input_shapes (Any): Shapes of input Tensors of the subgraph - subgraph_input_dtypes (Any): Input data types of the subgraph - subgraph_output_shapes (Any): Shapes of output Tensors of the subgraph - subgraph_output_dtypes (Any): Output data types of the subgraph + input_shapes (Any): Shapes of input Tensors of the subgraph + input_dtypes (Any): Input data types of the subgraph + output_shapes (Any): Shapes of output Tensors of the subgraph + output_dtypes (Any): Output data types of the subgraph """ subgraph_name: str = "" subgraph_op_count: int = 0 - subgraph_input_shapes: Any = field(default_factory=list) - subgraph_input_dtypes: Any = field(default_factory=list) - subgraph_output_shapes: Any = 
field(default_factory=list) - subgraph_output_dtypes: Any = field(default_factory=list) + input_shapes: Any = field(default_factory=list) + input_dtypes: Any = field(default_factory=list) + output_shapes: Any = field(default_factory=list) + output_dtypes: Any = field(default_factory=list) @dataclass @@ -41,10 +41,10 @@ class DryRunTracker: Args: total_ops_in_graph (int): Total number of operators in graph supported_ops_in_graph (int): Number of supported operators in graph - graph_input_shapes (Any): Shapes of input Tensors of the graph - graph_input_dtypes (Any): Input data types of the graph - graph_output_shapes (Any): Shapes of output Tensors of the graph - graph_output_dtypes (Any): Output data types of the graph + input_shapes (Any): Shapes of input Tensors of the graph + input_dtypes (Any): Input data types of the graph + output_shapes (Any): Shapes of output Tensors of the graph + output_dtypes (Any): Output data types of the graph per_subgraph_data (List[PerSubgraphData]): Per-subgraph data, see above class tensorrt_graph_count (int): Number of TensorRT engines to be generated compilation_settings (CompilationSettings): User Compilation Settings @@ -54,10 +54,10 @@ class DryRunTracker: total_ops_in_graph: int = 0 supported_ops_in_graph: int = 0 - graph_input_shapes: Any = field(default_factory=list) - graph_input_dtypes: Any = field(default_factory=list) - graph_output_shapes: Any = field(default_factory=list) - graph_output_dtypes: Any = field(default_factory=list) + input_shapes: Any = field(default_factory=list) + input_dtypes: Any = field(default_factory=list) + output_shapes: Any = field(default_factory=list) + output_dtypes: Any = field(default_factory=list) per_subgraph_data: List[PerSubgraphData] = field(default_factory=list) tensorrt_graph_count: int = 0 compilation_settings: CompilationSettings = field( @@ -111,7 +111,7 @@ def dryrun_stats_display( formatted_stats += " " * 2 + "Graph Structure:\n\n" formatted_stats += ( " " * 3 - + f"Inputs: {input_formatter(dryrun_tracker.graph_input_shapes, dryrun_tracker.graph_input_dtypes)}\n" + + f"Inputs: {input_formatter(dryrun_tracker.input_shapes, dryrun_tracker.input_dtypes)}\n" ) for i, trt_subgraph_data in enumerate(dryrun_tracker.per_subgraph_data): @@ -122,7 +122,7 @@ def dryrun_stats_display( ) formatted_stats += ( " " * 5 - + f"Engine Inputs: {input_formatter(trt_subgraph_data.subgraph_input_shapes, trt_subgraph_data.subgraph_input_dtypes)}\n" + + f"Engine Inputs: {input_formatter(trt_subgraph_data.input_shapes, trt_subgraph_data.input_dtypes)}\n" ) formatted_stats += ( " " * 5 @@ -130,13 +130,13 @@ def dryrun_stats_display( ) formatted_stats += ( " " * 5 - + f"Engine Outputs: {input_formatter(trt_subgraph_data.subgraph_output_shapes, trt_subgraph_data.subgraph_output_dtypes)}\n" + + f"Engine Outputs: {input_formatter(trt_subgraph_data.output_shapes, trt_subgraph_data.output_dtypes)}\n" ) formatted_stats += " " * 4 + "...\n" formatted_stats += ( " " * 3 - + f"Outputs: {input_formatter(dryrun_tracker.graph_output_shapes, dryrun_tracker.graph_output_dtypes)}\n" + + f"Outputs: {input_formatter(dryrun_tracker.output_shapes, dryrun_tracker.output_dtypes)}\n" ) # Print aggregate statistics about the graph structure, including recommended "min_block_size" options @@ -225,11 +225,18 @@ def input_formatter(shapes: Any, dtypes: Any) -> str: def input_formatter_helper(shapes: Any, dtypes: Any) -> str: """Helper for input formatter""" - # Base case - single shape, single dtype - if isinstance(shapes, tuple) and all(isinstance(elt, 
int) for elt in shapes): - return f"Tensor: {shapes}@{str(dtypes)[6:]}, " - - # Base case - dynamic shape, single dtype + # Base case 1 - single static/dynamic shape, single dtype + if isinstance(shapes, tuple) and all(isinstance(elt, (int, tuple)) for elt in shapes): + input_shape_string = "Tensor: (" + for elt in shapes: + if isinstance(elt, tuple): + input_shape_string+= f"(min={elt[0]}, max={elt[1]}), " + else: + input_shape_string+= f"{elt}, " + input_shape_string = input_shape_string[:-2] + ")" + f"@{str(dtypes)[6:]}, " + return input_shape_string + + # Base case 2 - dynamic shape, single dtype elif ( isinstance(shapes, dict) and len(shapes) == 3 diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 5ecebd9c6b..4a9fff0d96 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -35,7 +35,7 @@ ) from torch_tensorrt.dynamo.utils import ( get_torch_inputs, - parse_complex_tensor_structs, + parse_graph_io, prepare_inputs, set_log_level, to_torch_device, @@ -194,12 +194,14 @@ def compile( raise AssertionError( f"Input graph should be an ExportedProgram but got type {type(exported_program)}" ) + exported_program = pre_export_lowering(exported_program, torch_inputs) exported_program = exported_program.run_decompositions( get_decompositions(enable_experimental_decompositions) ) gm = exported_program.module() logger.debug("Input graph: " + str(gm.graph)) + # Apply lowering on the graph module gm = post_lowering(gm, torch_inputs) logger.debug("Lowered Input graph: " + str(gm.graph)) @@ -275,12 +277,6 @@ def compile_module( dryrun_tracker.total_ops_in_graph = total_ops dryrun_tracker.supported_ops_in_graph = num_supported_ops - dryrun_tracker.graph_input_shapes = parse_complex_tensor_structs( - sample_inputs, "shape", lambda x: dict(x) if isinstance(x, dict) else tuple(x) - ) - dryrun_tracker.graph_input_dtypes = parse_complex_tensor_structs( - sample_inputs, "dtype", lambda t: t.to(torch.dtype, use_default=True) - ) dryrun_tracker.compilation_settings = settings if settings.dryrun and settings.min_block_size > 1: @@ -406,28 +402,8 @@ def contains_metadata(gm: torch.fx.GraphModule) -> bool: name, ) - subgraph_data.subgraph_input_shapes = parse_complex_tensor_structs( - submodule_inputs, - "shape", - lambda x: dict(x) if isinstance(x, dict) else tuple(x), - ) - subgraph_data.subgraph_input_dtypes = parse_complex_tensor_structs( - submodule_inputs, "dtype", lambda t: t.to(torch.dtype) - ) - - submodule_outputs = submodule( - *get_torch_inputs(submodule_inputs, to_torch_device(settings.device)) - ) - - subgraph_data.subgraph_output_shapes = parse_complex_tensor_structs( - submodule_outputs, - "shape", - lambda x: dict(x) if isinstance(x, dict) else tuple(x), - ) - subgraph_data.subgraph_output_dtypes = parse_complex_tensor_structs( - submodule_outputs, "dtype" - ) - + # Parse the subgraph I/O and store it + parse_graph_io(submodule, subgraph_data) dryrun_tracker.tensorrt_graph_count += 1 dryrun_tracker.per_subgraph_data.append(subgraph_data) @@ -441,20 +417,9 @@ def contains_metadata(gm: torch.fx.GraphModule) -> bool: ) trt_modules[name] = trt_module - - sample_outputs = gm( - *get_torch_inputs(sample_inputs, to_torch_device(settings.device)) - ) - - if not isinstance(sample_outputs, (list, tuple)): - sample_outputs = [sample_outputs] - - dryrun_tracker.graph_output_shapes = parse_complex_tensor_structs( - sample_outputs, "shape", lambda x: dict(x) if isinstance(x, dict) else tuple(x) - ) - 
dryrun_tracker.graph_output_dtypes = parse_complex_tensor_structs( - sample_outputs, "dtype" - ) + + # Parse the graph I/O and store it in dryrun tracker + parse_graph_io(gm, dryrun_tracker) # Replace all FX Modules with TRT Modules for name, trt_module in trt_modules.items(): diff --git a/py/torch_tensorrt/dynamo/conversion/_conversion.py b/py/torch_tensorrt/dynamo/conversion/_conversion.py index ea3034cb8c..e9ea09cb51 100644 --- a/py/torch_tensorrt/dynamo/conversion/_conversion.py +++ b/py/torch_tensorrt/dynamo/conversion/_conversion.py @@ -16,7 +16,7 @@ TRTInterpreterResult, ) from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule, TorchTensorRTModule -from torch_tensorrt.dynamo.utils import get_torch_inputs +from torch_tensorrt.dynamo.utils import get_torch_inputs, get_model_device import tensorrt as trt @@ -29,9 +29,18 @@ def infer_module_output_dtypes( device: Device, truncate_double: bool = False, ) -> List[dtype]: + """ + This function performs model inference to determine the output dtypes + and truncates them accordingly. + """ + # TODO: We can also determine output dtypes from the module.graph based on node metadata. + # However, our converter tests use fx.symbolic_trace which sometimes does not provide metadata, + # so we stick to the model inference approach currently. with maybe_disable_fake_tensor_mode(): + # Get the device on which the model exists + # For large models, this can be done on CPU to save GPU memory allocation for TRT. + device = get_model_device(module) torch_inputs = get_torch_inputs(inputs, device) - module = module.to(device.to(torch.device)) module_outputs = module(*torch_inputs) if not isinstance(module_outputs, (list, tuple)): module_outputs = [module_outputs] @@ -85,6 +94,7 @@ def interpret_module_to_result( output_dtypes=output_dtypes, compilation_settings=settings, ) + interpreter_result = interpreter.run() return interpreter_result diff --git a/py/torch_tensorrt/dynamo/conversion/impl/matmul.py b/py/torch_tensorrt/dynamo/conversion/impl/matmul.py index 5ea29622c8..91a660908f 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/matmul.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/matmul.py @@ -22,6 +22,7 @@ def matrix_multiply( input_matrix_op: trt.MatrixOperation = trt.MatrixOperation.NONE, other_matrix_op: trt.MatrixOperation = trt.MatrixOperation.NONE, ) -> TRTTensor: + if not isinstance(input, trt.ITensor): input = get_trt_tensor(ctx, input, f"{name}_input") if not isinstance(other, trt.ITensor): diff --git a/py/torch_tensorrt/dynamo/lowering/_decompositions.py b/py/torch_tensorrt/dynamo/lowering/_decompositions.py index 2729e38ff5..aa6da2171a 100644 --- a/py/torch_tensorrt/dynamo/lowering/_decompositions.py +++ b/py/torch_tensorrt/dynamo/lowering/_decompositions.py @@ -211,7 +211,7 @@ def slice_scatter_decomposition( index_tensor_shape.append(src_each_dim) for index in range(start, end, step): cat_tensors.append(index * torch.ones(index_tensor_shape, dtype=torch.int64)) - index_tensor = torch.stack(cat_tensors, dim).cuda() + index_tensor = torch.stack(cat_tensors, dim).to(input_tensor.device) index_tensor_64 = index_tensor.to(torch.int64) output_tensor = torch.scatter(input_tensor, dim, index_tensor_64, src_tensor) return output_tensor diff --git a/py/torch_tensorrt/dynamo/lowering/passes/constant_folding.py b/py/torch_tensorrt/dynamo/lowering/passes/constant_folding.py index 76e79ac100..e8543ed290 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/constant_folding.py +++ 
b/py/torch_tensorrt/dynamo/lowering/passes/constant_folding.py @@ -32,9 +32,11 @@ def constant_fold( cf = _TorchTensorRTConstantFolder(gm, skip_constructors=False) cf.run() + # The constants are created on CPU to save GPU memory for TensorRT compilation. + # For TRT INetwork construction the constants are moved to CPU in get_attr call. for node, constant in cf.node_replacements.items(): replace_node_with_constant( - gm, node, torch.nn.Parameter(constant.cuda(), requires_grad=False) + gm, node, torch.nn.Parameter(constant, requires_grad=False) ) erased_params = [] diff --git a/py/torch_tensorrt/dynamo/lowering/passes/replace_max_pool_with_indices.py b/py/torch_tensorrt/dynamo/lowering/passes/replace_max_pool_with_indices.py index 6e3762e73c..0ad9d398e7 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/replace_max_pool_with_indices.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/replace_max_pool_with_indices.py @@ -44,7 +44,8 @@ def replace_max_pool_with_indices( kwargs=node.kwargs, ) maxpool_fused.meta = node.meta - + # The metadata for this node should exclude the indices metadata + maxpool_fused.meta["val"] = maxpool_fused.meta["val"][0] logger.debug( f"Replacing all uses of nodes {node}, {getitem_node} with fused maxpool node {maxpool_fused} " f"is the only user of placeholder {node} and was inserted by the compiler." diff --git a/py/torch_tensorrt/dynamo/partitioning/common.py b/py/torch_tensorrt/dynamo/partitioning/common.py index 40900106b0..6c68efe10b 100644 --- a/py/torch_tensorrt/dynamo/partitioning/common.py +++ b/py/torch_tensorrt/dynamo/partitioning/common.py @@ -6,17 +6,11 @@ from torch.fx.experimental.proxy_tensor import maybe_disable_fake_tensor_mode from torch_tensorrt._Input import Input from torch_tensorrt.dynamo._defaults import DEBUG +from torch_tensorrt.dynamo.utils import contains_sym_int, extract_var_range_info logger = logging.getLogger(__name__) -def contains_sym_int(tensor: torch.Tensor) -> bool: - """ - Returns true if the given tensor has symbolic shape. - """ - return any(isinstance(dim, torch.SymInt) for dim in tensor) - - def construct_dynamic_input( input_shape: torch.Size, input_dtype: torch.dtype, @@ -35,27 +29,10 @@ def construct_dynamic_input( max_shape = [] for dim in input_shape: if isinstance(dim, torch.SymInt): - node = dim.node - expr = node.expr - shape_env = node.shape_env - # An expr can be a independent SymInt node (eg: s0 or s1) or a composition of them eg: (48*s0 or s0*s1). - # In the case of expr which has symbolic computation, bound_sympy evaluates them. - # https://pytorch.org/docs/stable/generated/torch.fx.experimental.symbolic_shapes.ShapeEnv.html#torch.fx.experimental.symbolic_shapes.ShapeEnv.bound_sympy - # expr.xreplace replaces the symbolic variables with their current values and computes the expression. 
- var_range = shape_env.var_to_range.get(expr, None) or shape_env.bound_sympy( - expr - ) - var_val = shape_env.var_to_val.get(expr, None) or expr.xreplace( - shape_env.var_to_val - ) - assert var_range, var_val - # Torchdynamo 0/1 specialization outlier - if var_range.lower == 2: - min_shape.append(1) - else: - min_shape.append(int(var_range.lower)) - opt_shape.append(int(var_val)) - max_shape.append(int(var_range.upper)) + min_max_opt = extract_var_range_info(dim) + min_shape.append(min_max_opt["min"]) + opt_shape.append(min_max_opt["opt"]) + max_shape.append(min_max_opt["max"]) else: min_shape.append(dim) opt_shape.append(dim) diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index acfb2b0094..625bbf30fc 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -147,6 +147,16 @@ def get_torch_inputs( for input in inputs ] +def get_model_device(module : torch.fx.GraphModule) -> Union[Device, torch.device, str]: + """ + Returns the device on which the module parameters exist. + """ + for node in module.graph.nodes: + if "device" in node.kwargs: + return node.kwargs["device"] + + return torch.device("cpu") + def set_log_level(parent_logger: Any, level: Any) -> None: """ @@ -249,6 +259,95 @@ def parse_complex_tensor_structs( + "Allowed input types: {torch_tensorrt.Input, torch.Tensor, list, tuple, dict}" ) +def contains_sym_int(tensor: torch.Tensor) -> bool: + """ + Returns true if the given tensor has symbolic shape. + """ + return any(isinstance(dim, torch.SymInt) for dim in tensor) + +def extract_var_range_info(symbolic_integer: torch.SymInt) -> Dict[str, Any]: + """ + This function returns the min, max, opt values of a symbolic integer. + """ + node = symbolic_integer.node + expr = node.expr + shape_env = node.shape_env + # An expr can be a independent SymInt node (eg: s0 or s1) or a composition of them eg: (48*s0 or s0*s1). + # In the case of expr which has symbolic computation, bound_sympy evaluates them. + # https://pytorch.org/docs/stable/generated/torch.fx.experimental.symbolic_shapes.ShapeEnv.html#torch.fx.experimental.symbolic_shapes.ShapeEnv.bound_sympy + # expr.xreplace replaces the symbolic variables with their current values and computes the expression. + var_range = shape_env.var_to_range.get(expr, None) or shape_env.bound_sympy( + expr + ) + var_val = shape_env.var_to_val.get(expr, None) or expr.xreplace( + shape_env.var_to_val + ) + assert var_range, var_val + min_val, max_val, opt_val = int(var_range.lower), int(var_range.upper), int(var_val) + # Torchdynamo 0/1 specialization outlier + min_val = 1 if min_val == 2 else min_val + min_max_opt = {} + min_max_opt["min"] = min_val + min_max_opt["max"] = max_val + min_max_opt["opt"] = opt_val + + return min_max_opt + +def unwrap_tensor_shape(tensor): + """ + This is a helper function used to print/return the shape of the tensor. + For regular torch.tensor's, it returns the static shape. + For symbolic tensors, eg:(1, s0, 4), this function returns [1, [min, max], 4]. The min + and max correspond to the lower and upper values of s0 symbolic dimension. 
+ """ + tensor_shape = [] + for dimension in tensor.shape: + if isinstance(dimension, int): + tensor_shape.append(dimension) + elif isinstance(dimension, torch.SymInt): + min_max_opt = extract_var_range_info(dimension) + tensor_shape.append((min_max_opt["min"], min_max_opt["max"])) + + return tuple(tensor_shape) + +def get_graph_io_attrs(io_nodes: Sequence[torch.fx.Node], attr_type: str) -> Sequence[Any]: + """ + Returns a list of attributes (shapes or dtypes) of the I/O nodes + """ + assert attr_type in ["shape", "dtype"] + attr_fn = unwrap_tensor_shape if attr_type == "shape" else lambda x : x.dtype + graph_io_attrs = [] + for node in io_nodes: + if "val" in node.meta: + metadata = node.meta["val"] + if isinstance(metadata, (tuple, list)): + for tensor in metadata: + graph_io_attrs.append(attr_fn(tensor)) + else: + graph_io_attrs.append(attr_fn(metadata)) + + return graph_io_attrs + +def parse_graph_io(module: torch.fx.GraphModule, dryrun_tracker: Any) -> None: + """ + Parse the graph I/O shape/dtype info for the whole graph and store in the dryrun tracker + """ + # Parse inputs of the graph + input_nodes = [node for node in module.graph.nodes if node.op == "placeholder"] + input_shapes = get_graph_io_attrs(input_nodes, "shape") + input_dtypes = get_graph_io_attrs(input_nodes, "dtype") + dryrun_tracker.input_shapes = input_shapes + dryrun_tracker.input_dtypes = input_dtypes + + # Parse outputs of the graph + mark_output_nodes = [node for node in module.graph.nodes if node.op == "output"] + output_nodes = [] + for node in mark_output_nodes: + output_nodes.extend(node.all_input_nodes) + output_shapes = get_graph_io_attrs(output_nodes, "shape") + output_dtypes = get_graph_io_attrs(output_nodes, "dtype") + dryrun_tracker.output_shapes = output_shapes + dryrun_tracker.output_dtypes = output_dtypes def to_torch_device(device: Optional[Union[Device, torch.device, str]]) -> torch.device: """Cast a device-type to torch.device From 132778254cc8e694498297dba2304e6462a67177 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Fri, 9 Aug 2024 13:29:45 -0700 Subject: [PATCH 29/55] chore: rebase fixes --- .../dynamo/lowering/passes/remove_assert_scalar.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/py/torch_tensorrt/dynamo/lowering/passes/remove_assert_scalar.py b/py/torch_tensorrt/dynamo/lowering/passes/remove_assert_scalar.py index a0a741805f..5a82f7e711 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/remove_assert_scalar.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/remove_assert_scalar.py @@ -1,5 +1,4 @@ import logging -from typing import Sequence import torch from torch_tensorrt.dynamo.lowering.passes.pass_utils import ( @@ -9,9 +8,7 @@ logger = logging.getLogger(__name__) -def remove_assert_scalar( - gm: torch.fx.GraphModule, sample_inputs: Sequence[torch.Tensor] -) -> torch.fx.GraphModule: +def remove_assert_scalar(gm: torch.fx.GraphModule) -> torch.fx.GraphModule: """Remove assert_scalar ops in the graph""" count = 0 for node in gm.graph.nodes: From 0980778929b0bf85f00ad9bdad4de7934e7a2f6c Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Fri, 9 Aug 2024 14:38:32 -0700 Subject: [PATCH 30/55] chore: updates --- examples/dynamo/torch_compile_gpt2.py | 45 +++++++++------- examples/dynamo/torch_export_gpt2.py | 75 +++++++++----------------- examples/dynamo/torch_export_llama2.py | 22 ++++---- 3 files changed, 59 insertions(+), 83 deletions(-) diff --git a/examples/dynamo/torch_compile_gpt2.py b/examples/dynamo/torch_compile_gpt2.py index 6285266973..d4c7767510 
100644 --- a/examples/dynamo/torch_compile_gpt2.py +++ b/examples/dynamo/torch_compile_gpt2.py @@ -1,33 +1,38 @@ +import copy +import time + +import numpy as np import torch import torch_tensorrt from transformers import AutoModelForCausalLM, AutoTokenizer -from utils import generate +from utils import export_llm, generate + +# Define the parameters +MAX_TOKENS = 32 +DEVICE = torch.device("cuda:0") -# Define tokenizer and model -torch_device = "cuda" if torch.cuda.is_available() else "cpu" -tokenizer = AutoTokenizer.from_pretrained("gpt2") -model = ( - AutoModelForCausalLM.from_pretrained( +# Define the GPT2 model from hugging face +# kv_cache is not supported in Torch-TRT currently. +# CPU is used here so that GPU memory is reserved for TRT compilation. +with torch.no_grad(): + tokenizer = AutoTokenizer.from_pretrained("gpt2") + model = AutoModelForCausalLM.from_pretrained( "gpt2", pad_token_id=tokenizer.eos_token_id, use_cache=False, attn_implementation="eager", - ) - .eval() - .to(torch_device) -) + ).eval() # Input prompt -model_inputs = tokenizer("I enjoy walking with my cute dog", return_tensors="pt").to( - torch_device -) +prompt = "Roses are red, violets are blue" +model_inputs = tokenizer(prompt, return_tensors="pt") input_ids = model_inputs["input_ids"] -max_tokens = 20 # Auto-regressive generation loop for greedy search using PyTorch model -pyt_gen_tokens = generate(model, input_ids, max_tokens, tokenizer.eos_token_id) +pyt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) -# Compile Torch-TRT model +# Compile the model using torch.compile with tensorrt backend and +# mark the input sequence length to be dynamic torch._dynamo.mark_dynamic(input_ids, 1, min=7, max=1023) model.forward = torch.compile( model.forward, @@ -40,8 +45,10 @@ }, ) -# Auto-regressive generation loop for greedy search using Torch-TensorRT model -generated_token_ids = generate(model, input_ids, max_tokens, tokenizer.eos_token_id) +# Auto-regressive generation loop for greedy decoding using TensorRT model +# Move inputs to GPU +input_ids = input_ids.to(DEVICE) +trt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) # Decode the sentence print("=============================") @@ -52,5 +59,5 @@ print("=============================") print( "TensorRT model generated text: ", - tokenizer.decode(generated_token_ids[0], skip_special_tokens=True), + tokenizer.decode(trt_gen_tokens[0], skip_special_tokens=True), ) diff --git a/examples/dynamo/torch_export_gpt2.py b/examples/dynamo/torch_export_gpt2.py index 7b1df0824b..618dbc15ed 100644 --- a/examples/dynamo/torch_export_gpt2.py +++ b/examples/dynamo/torch_export_gpt2.py @@ -7,68 +7,45 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from utils import export_llm, generate - -def time_generate(model, inputs, max_tokens, eos_token_id, iterations=10): - timings = [] - for _ in range(iterations): - start_time = time.time() - inputs_copy = copy.copy(inputs) - generate(model, inputs_copy, max_tokens, eos_token_id) - timings.append(time.time() - start_time) - - time_mean = np.mean(timings) * 1000 # convert to ms - time_med = np.median(timings) * 1000 # convert to ms - - return time_mean, time_med - - -# Define tokenizer and model -torch_device = "cuda" if torch.cuda.is_available() else "cpu" -tokenizer = AutoTokenizer.from_pretrained("gpt2") -model = ( - AutoModelForCausalLM.from_pretrained( +# Define the parameters +MAX_TOKENS = 32 +DEVICE = torch.device("cuda:0") + +# Define the GPT2 model from hugging face 
+# kv_cache is not supported in Torch-TRT currently. +# CPU is used here so that GPU memory is reserved for TRT compilation. +with torch.no_grad(): + tokenizer = AutoTokenizer.from_pretrained("gpt2") + model = AutoModelForCausalLM.from_pretrained( "gpt2", pad_token_id=tokenizer.eos_token_id, use_cache=False, attn_implementation="eager", - ) - .eval() - .to(torch_device) - .half() -) + ).eval() # Input prompt -model_inputs = tokenizer("I enjoy walking with my cute dog", return_tensors="pt").to( - torch_device -) +prompt = "I enjoy walking with my cute dog" +model_inputs = tokenizer(prompt, return_tensors="pt") input_ids = model_inputs["input_ids"] -max_tokens = 20 -# Auto-regressive generation loop for greedy search using PyTorch model -pyt_gen_tokens, num_tokens_gen = generate( - model, input_ids, max_tokens, tokenizer.eos_token_id -) -pyt_mean_time, pyt_med_time = time_generate( - model, input_ids, max_tokens, tokenizer.eos_token_id -) +# Auto-regressive generation loop for greedy decoding using PyTorch model +pyt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) -# Compile Torch-TRT model +# Export the GPT2 model into an ExportedProgram which is input of TRT compilation gpt2_ep = export_llm(model, input_ids, max_seq_len=1024) trt_model = torch_tensorrt.dynamo.compile( gpt2_ep, inputs=[input_ids], - enabled_precisions={torch.float16}, + enabled_precisions={torch.float32}, truncate_double=True, - debug=False, + device=DEVICE, + disable_tf32=True, ) -# Auto-regressive generation loop for greedy search using Torch-TensorRT model -generated_token_ids, num_tokens_gen = generate( - trt_model, input_ids, max_tokens, tokenizer.eos_token_id -) -trt_mean_time, trt_med_time = time_generate( - trt_model, input_ids, max_tokens, tokenizer.eos_token_id -) +# Auto-regressive generation loop for greedy decoding using TensorRT model +# Move inputs to GPU +input_ids = input_ids.to(DEVICE) +trt_gen_tokens = generate(trt_model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) # Decode the sentence print("=============================") @@ -76,12 +53,8 @@ def time_generate(model, inputs, max_tokens, eos_token_id, iterations=10): "Pytorch model generated text: ", tokenizer.decode(pyt_gen_tokens[0], skip_special_tokens=True), ) -print(f"Pytorch total tokens generated: {num_tokens_gen}") -print(f"Pytorch total mean time in ms: {pyt_mean_time} median time: {pyt_med_time}") print("=============================") print( "TensorRT model generated text: ", - tokenizer.decode(generated_token_ids[0], skip_special_tokens=True), + tokenizer.decode(trt_gen_tokens[0], skip_special_tokens=True), ) -print(f"TensorRT total tokens generated: {num_tokens_gen}") -print(f"TensorRT total mean time in ms: {trt_mean_time} median time: {trt_med_time}") diff --git a/examples/dynamo/torch_export_llama2.py b/examples/dynamo/torch_export_llama2.py index e08ebb307c..69a21bee16 100644 --- a/examples/dynamo/torch_export_llama2.py +++ b/examples/dynamo/torch_export_llama2.py @@ -12,17 +12,14 @@ # CPU is used here so that GPU memory is reserved for TRT compilation. llama_path = "meta-llama/Llama-2-7b-chat-hf" with torch.no_grad(): - model = ( - AutoModelForCausalLM.from_pretrained( - llama_path, use_cache=False, attn_implementation="eager" - ) - .eval() - ) + model = AutoModelForCausalLM.from_pretrained( + llama_path, use_cache=False, attn_implementation="eager" + ).eval() tokenizer = AutoTokenizer.from_pretrained(llama_path) -base_prompt = "What is dynamic programming?" 
-base_inputs = tokenizer(base_prompt, return_tensors="pt") -input_ids = base_inputs.input_ids +prompt = "What is dynamic programming?" +model_inputs = tokenizer(base_prompt, return_tensors="pt") +input_ids = model_inputs.input_ids # Auto-regressive generation loop for greedy search using PyTorch model pyt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) @@ -35,15 +32,14 @@ enabled_precisions={torch.float32}, min_block_size=1, truncate_double=True, - debug=True, device=DEVICE, disable_tf32=True, ) -# Auto-regressive generation loop for greedy search +# Auto-regressive generation loop for greedy decoding # Move inputs to GPU input_ids = input_ids.to(DEVICE) -generated_token_ids = generate(trt_model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) +trt_gen_tokens = generate(trt_model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) # Decode the sentence print("=============================") @@ -57,7 +53,7 @@ print( "TensorRT model generated text: ", tokenizer.batch_decode( - generated_token_ids, + trt_gen_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False, )[0], From 94b2ba1b02d44b6b8c78f9220dae581fd9758105 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Fri, 9 Aug 2024 16:17:20 -0700 Subject: [PATCH 31/55] chore: updates --- py/torch_tensorrt/dynamo/utils.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 102b86e8e9..4c25c64af4 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -173,11 +173,17 @@ def get_model_device(module: torch.fx.GraphModule) -> Union[Device, torch.device """ Returns the device on which the module parameters exist. """ - for node in module.graph.nodes: - if "device" in node.kwargs: - return node.kwargs["device"] + device = None + for parameter in list(module.parameters()): + if isinstance(parameter, (torch.nn.parameter.Parameter, torch.Tensor)): + device = parameter.device - return torch.device("cpu") + if device is None: + device = torch.device("cpu") + logger.warning( + "Could not detect the device on which the model exists. Assuming the model is on CPU" + ) + return device def set_log_level(parent_logger: Any, level: Any) -> None: From 2b1db2902ba4e8fd6e962ab86efee50830fa378b Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Fri, 9 Aug 2024 16:22:24 -0700 Subject: [PATCH 32/55] chore: updates --- core/runtime/execute_engine.cpp | 2 +- py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp index e6d21474f2..82b868d131 100644 --- a/core/runtime/execute_engine.cpp +++ b/core/runtime/execute_engine.cpp @@ -201,7 +201,7 @@ std::vector execute_engine(std::vector inputs, c10::intr at::Tensor contig_input; if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) { - // Shape tensor inputs are casted to int64 explicitly + // Shape tensor inputs are casted to int64 explicitly. 
// Refer to // https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435 auto input_cpu = inputs[i].clone().contiguous().cpu().to(torch::kInt64); diff --git a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py index 55f225c89f..6bb71f26dd 100644 --- a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py @@ -3090,7 +3090,7 @@ def upsample_compute_output_size( input_size: torch.Size, output_size: Optional[Sequence[int]], scale_factors: Optional[Sequence[float]], -) -> Optional[Sequence[int]]: +) -> Sequence[int] | None: spatial_dimensions = len(input_size) - 2 if output_size is None and scale_factors is None: From 9f606fca34ee774dc2a339261839ca09443bea8a Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Fri, 9 Aug 2024 16:29:06 -0700 Subject: [PATCH 33/55] chore: updates --- py/torch_tensorrt/dynamo/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 4c25c64af4..b95ef90eaa 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -177,6 +177,7 @@ def get_model_device(module: torch.fx.GraphModule) -> Union[Device, torch.device for parameter in list(module.parameters()): if isinstance(parameter, (torch.nn.parameter.Parameter, torch.Tensor)): device = parameter.device + break if device is None: device = torch.device("cpu") From 3228c574e059a66adb0752fefcbf95dc988dc899 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 15 Aug 2024 10:46:28 -0700 Subject: [PATCH 34/55] chore: Update perf tooling with support for HF models (#3034) Co-authored-by: Chengzhe Xu --- py/requirements.txt | 2 +- .../dynamo/conversion/_conversion.py | 4 +- .../dynamo/lowering/_decompositions.py | 2 +- .../dynamo/partitioning/common.py | 4 +- py/torch_tensorrt/dynamo/utils.py | 42 +- tools/perf/hf_models/benchmark_gpt2.sh | 21 + tools/perf/hf_models/benchmark_llama2.sh | 68 +++ .../hf_models/benchmark_llama3.1_instruct.sh | 68 +++ tools/perf/hf_models/benchmark_llama3.sh | 68 +++ tools/perf/hf_models/benchmark_mistral7b.sh | 68 +++ tools/perf/perf_run.py | 389 ++++++++++-------- tools/perf/requirements.txt | 4 +- tools/perf/run_hf_model.sh | 26 ++ tools/perf/stage1.sh | 49 +++ tools/perf/stage2.sh | 29 ++ tools/perf/utils.py | 140 ++++++- 16 files changed, 788 insertions(+), 196 deletions(-) create mode 100644 tools/perf/hf_models/benchmark_gpt2.sh create mode 100644 tools/perf/hf_models/benchmark_llama2.sh create mode 100644 tools/perf/hf_models/benchmark_llama3.1_instruct.sh create mode 100644 tools/perf/hf_models/benchmark_llama3.sh create mode 100644 tools/perf/hf_models/benchmark_mistral7b.sh create mode 100644 tools/perf/run_hf_model.sh create mode 100644 tools/perf/stage1.sh create mode 100644 tools/perf/stage2.sh diff --git a/py/requirements.txt b/py/requirements.txt index 649ea09706..288d766fa6 100644 --- a/py/requirements.txt +++ b/py/requirements.txt @@ -3,7 +3,7 @@ packaging pybind11==2.6.2 --extra-index-url https://download.pytorch.org/whl/nightly/cu124 torch>=2.5.0.dev,<2.6.0 -torchvision>=0.19.0.dev,<0.20.0 +torchvision>=0.20.0.dev,<0.21.0 --extra-index-url https://pypi.ngc.nvidia.com pyyaml tensorrt==10.1.0 diff --git a/py/torch_tensorrt/dynamo/conversion/_conversion.py b/py/torch_tensorrt/dynamo/conversion/_conversion.py index a1bf2880a7..430a4e9e00 100644 --- 
a/py/torch_tensorrt/dynamo/conversion/_conversion.py +++ b/py/torch_tensorrt/dynamo/conversion/_conversion.py @@ -5,7 +5,7 @@ import tensorrt as trt import torch -from torch.fx.experimental.proxy_tensor import maybe_disable_fake_tensor_mode +from torch.fx.experimental.proxy_tensor import unset_fake_temporarily from torch_tensorrt._Device import Device from torch_tensorrt._enums import dtype from torch_tensorrt._features import ENABLED_FEATURES @@ -36,7 +36,7 @@ def infer_module_output_dtypes( # TODO: We can also determine output dtypes from the module.graph based on node metadata. # However, our converter tests use fx.symbolic_trace which sometimes does not provide metadata, # so we stick to the model inference approach currently. - with maybe_disable_fake_tensor_mode(): + with unset_fake_temporarily(): # Get the device on which the model exists # For large models, this can be done on CPU to save GPU memory allocation for TRT. device = get_model_device(module) diff --git a/py/torch_tensorrt/dynamo/lowering/_decompositions.py b/py/torch_tensorrt/dynamo/lowering/_decompositions.py index aa6da2171a..a74c5077de 100644 --- a/py/torch_tensorrt/dynamo/lowering/_decompositions.py +++ b/py/torch_tensorrt/dynamo/lowering/_decompositions.py @@ -267,7 +267,7 @@ def scatter_add_decomposition( index_slice = torch.unsqueeze(index_slice, dim) # moving tensor to default device - device = to_torch_device(default_device()) + device = input_tensor.device scatter_add_tensor = scatter_add_tensor.to(device) to_scatter_tensor = to_scatter_tensor.to(device) index_slice = index_slice.to(device) diff --git a/py/torch_tensorrt/dynamo/partitioning/common.py b/py/torch_tensorrt/dynamo/partitioning/common.py index e79e9cbe3c..bd4fe73406 100644 --- a/py/torch_tensorrt/dynamo/partitioning/common.py +++ b/py/torch_tensorrt/dynamo/partitioning/common.py @@ -3,7 +3,7 @@ import torch from torch._subclasses.fake_tensor import FakeTensor -from torch.fx.experimental.proxy_tensor import maybe_disable_fake_tensor_mode +from torch.fx.experimental.proxy_tensor import unset_fake_temporarily from torch_tensorrt._Input import Input from torch_tensorrt.dynamo._defaults import DEBUG from torch_tensorrt.dynamo.utils import contains_sym_int, extract_var_range_info @@ -76,7 +76,7 @@ def construct_submodule_inputs(module: torch.fx.GraphModule) -> Sequence[Input]: Returns: Sequence of torch_tensorrt.Input's representing inputs to given module """ - with maybe_disable_fake_tensor_mode(): + with unset_fake_temporarily(): torchtrt_inputs = [] module_inputs = [ node for node in module.graph.nodes if node.op == "placeholder" diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index b95ef90eaa..157bce8f97 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -131,42 +131,38 @@ def get_torch_inputs( inputs: Sequence[Input] | Dict[Any, Any], device: Union[Device, torch.device, str], mode: str = "", -) -> Sequence[torch.tensor] | Dict[Any, Any]: +) -> Sequence[torch.Tensor] | Dict[str, torch.Tensor]: """ Return the torch_tensor from the Input object. If mode is set, this implies user is using dynamic shaped inputs and return the corresponding input based on the mode requested. 
""" device = to_torch_device(device) - if mode: - if isinstance(inputs, dict): - result = {} - for k, v in inputs.items(): - if isinstance(v, (list, tuple, dict)): - result[k] = get_torch_inputs(v, device) - else: - result[k] = v.example_tensor(mode).to(device) - return result - else: - return [ - input.example_tensor(mode).to(device) - for input in inputs - if isinstance(input, Input) - ] if isinstance(inputs, dict): result = {} for k, v in inputs.items(): if isinstance(v, (list, tuple, dict)): result[k] = get_torch_inputs(v, device) - else: - result[k] = v.torch_tensor.to(device) - return result + elif isinstance(v, Input): + if len(mode) > 0: + result[k] = v.example_tensor(mode).to(device) + else: + result[k] = v.torch_tensor.to(device) else: - return [ - input.torch_tensor.to(device) if isinstance(input, Input) else input - for input in inputs - ] + result = [] + for input in inputs: + if isinstance(input, Input): + if len(mode) > 0: + result.append(input.example_tensor(mode).to(device)) + else: + result.append(input.torch_tensor.to(device)) + elif isinstance(input, torch.Tensor): + result.append(input.to(device)) + else: + raise AssertionError(f"Input type {type(input)} is not a valid type") + + return result def get_model_device(module: torch.fx.GraphModule) -> Union[Device, torch.device, str]: diff --git a/tools/perf/hf_models/benchmark_gpt2.sh b/tools/perf/hf_models/benchmark_gpt2.sh new file mode 100644 index 0000000000..83d08ab5d1 --- /dev/null +++ b/tools/perf/hf_models/benchmark_gpt2.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Usage : bash run_hf_model.sh + +# GPT2 model torch backend +bash run_hf_model.sh 1 "torch" "gpt2" 128 256 "fp16" 1 +bash run_hf_model.sh 16 "torch" "gpt2" 128 256 "fp16" 1 +bash run_hf_model.sh 64 "torch" "gpt2" 128 256 "fp16" 1 +bash run_hf_model.sh 256 "torch" "gpt2" 128 256 "fp16" 1 + +# GPT2 model dynamo backend +bash run_hf_model.sh 1 "dynamo" "gpt2" 128 256 "fp16" 1 +bash run_hf_model.sh 16 "dynamo" "gpt2" 128 256 "fp16" 1 +bash run_hf_model.sh 64 "dynamo" "gpt2" 128 256 "fp16" 1 +bash run_hf_model.sh 256 "dynamo" "gpt2" 128 256 "fp16" 1 + +# GPT2 model inductor backend +bash run_hf_model.sh 1 "inductor" "gpt2" 128 256 "fp16" 1 +bash run_hf_model.sh 16 "inductor" "gpt2" 128 256 "fp16" 1 +bash run_hf_model.sh 64 "inductor" "gpt2" 128 256 "fp16" 1 +bash run_hf_model.sh 256 "inductor" "gpt2" 128 256 "fp16" 1 + diff --git a/tools/perf/hf_models/benchmark_llama2.sh b/tools/perf/hf_models/benchmark_llama2.sh new file mode 100644 index 0000000000..9138c63818 --- /dev/null +++ b/tools/perf/hf_models/benchmark_llama2.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# Usage : bash run_hf_model.sh + +# "meta-llama/Llama-2-7b-chat-hf" model torch backend +# isl, osl = 128, 256 +bash run_hf_model.sh 1 "torch" "meta-llama/Llama-2-7b-chat-hf" 128 256 "fp16" 1 +bash run_hf_model.sh 16 "torch" "meta-llama/Llama-2-7b-chat-hf" 128 256 "fp16" 1 +bash run_hf_model.sh 64 "torch" "meta-llama/Llama-2-7b-chat-hf" 128 256 "fp16" 1 +bash run_hf_model.sh 256 "torch" "meta-llama/Llama-2-7b-chat-hf" 128 256 "fp16" 1 +# isl, osl = 128, 2176 +bash run_hf_model.sh 1 "torch" "meta-llama/Llama-2-7b-chat-hf" 128 2176 "fp16" 1 +bash run_hf_model.sh 16 "torch" "meta-llama/Llama-2-7b-chat-hf" 128 2176 "fp16" 1 +bash run_hf_model.sh 64 "torch" "meta-llama/Llama-2-7b-chat-hf" 128 2176 "fp16" 1 +bash run_hf_model.sh 256 "torch" "meta-llama/Llama-2-7b-chat-hf" 128 2176 "fp16" 1 +# isl, osl = 2048, 2176 +bash run_hf_model.sh 1 "torch" "meta-llama/Llama-2-7b-chat-hf" 2048 2176 "fp16" 1 +bash run_hf_model.sh 16 
"torch" "meta-llama/Llama-2-7b-chat-hf" 2048 2176 "fp16" 1 +bash run_hf_model.sh 64 "torch" "meta-llama/Llama-2-7b-chat-hf" 2048 2176 "fp16" 1 +bash run_hf_model.sh 256 "torch" "meta-llama/Llama-2-7b-chat-hf" 2048 2176 "fp16" 1 +# isl, osl = 2048, 4096 +bash run_hf_model.sh 1 "torch" "meta-llama/Llama-2-7b-chat-hf" 2048 4096 "fp16" 1 +bash run_hf_model.sh 16 "torch" "meta-llama/Llama-2-7b-chat-hf" 2048 4096 "fp16" 1 +bash run_hf_model.sh 64 "torch" "meta-llama/Llama-2-7b-chat-hf" 2048 4096 "fp16" 1 +bash run_hf_model.sh 256 "torch" "meta-llama/Llama-2-7b-chat-hf" 2048 4096 "fp16" 1 + +# "meta-llama/Llama-2-7b-chat-hf" model dynamo backend +# isl, osl = 128, 256 +bash run_hf_model.sh 1 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 128 256 "fp16" 1 +bash run_hf_model.sh 16 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 128 256 "fp16" 1 +bash run_hf_model.sh 64 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 128 256 "fp16" 1 +bash run_hf_model.sh 256 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 128 256 "fp16" 1 +# isl, osl = 128, 2176 +bash run_hf_model.sh 1 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 128 2176 "fp16" 1 +bash run_hf_model.sh 16 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 128 2176 "fp16" 1 +bash run_hf_model.sh 64 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 128 2176 "fp16" 1 +bash run_hf_model.sh 256 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 128 2176 "fp16" 1 +# isl, osl = 2048, 2176 +bash run_hf_model.sh 1 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 2048 2176 "fp16" 1 +bash run_hf_model.sh 16 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 2048 2176 "fp16" 1 +bash run_hf_model.sh 64 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 2048 2176 "fp16" 1 +bash run_hf_model.sh 256 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 2048 2176 "fp16" 1 +# isl, osl = 2048, 4096 +bash run_hf_model.sh 1 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 2048 4096 "fp16" 1 +bash run_hf_model.sh 16 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 2048 4096 "fp16" 1 +bash run_hf_model.sh 64 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 2048 4096 "fp16" 1 +bash run_hf_model.sh 256 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 2048 4096 "fp16" 1 + +# "meta-llama/Llama-2-7b-chat-hf" model inductor backend +# isl, osl = 128, 256 +bash run_hf_model.sh 1 "inductor" "meta-llama/Llama-2-7b-chat-hf" 128 256 "fp16" 1 +bash run_hf_model.sh 16 "inductor" "meta-llama/Llama-2-7b-chat-hf" 128 256 "fp16" 1 +bash run_hf_model.sh 64 "inductor" "meta-llama/Llama-2-7b-chat-hf" 128 256 "fp16" 1 +bash run_hf_model.sh 256 "inductor" "meta-llama/Llama-2-7b-chat-hf" 128 256 "fp16" 1 +# isl, osl = 128, 2176 +bash run_hf_model.sh 1 "inductor" "meta-llama/Llama-2-7b-chat-hf" 128 2176 "fp16" 1 +bash run_hf_model.sh 16 "inductor" "meta-llama/Llama-2-7b-chat-hf" 128 2176 "fp16" 1 +bash run_hf_model.sh 64 "inductor" "meta-llama/Llama-2-7b-chat-hf" 128 2176 "fp16" 1 +bash run_hf_model.sh 256 "inductor" "meta-llama/Llama-2-7b-chat-hf" 128 2176 "fp16" 1 +# isl, osl = 2048, 2176 +bash run_hf_model.sh 1 "inductor" "meta-llama/Llama-2-7b-chat-hf" 2048 2176 "fp16" 1 +bash run_hf_model.sh 16 "inductor" "meta-llama/Llama-2-7b-chat-hf" 2048 2176 "fp16" 1 +bash run_hf_model.sh 64 "inductor" "meta-llama/Llama-2-7b-chat-hf" 2048 2176 "fp16" 1 +bash run_hf_model.sh 256 "inductor" "meta-llama/Llama-2-7b-chat-hf" 2048 2176 "fp16" 1 +# isl, osl = 2048, 4096 +bash run_hf_model.sh 1 "inductor" "meta-llama/Llama-2-7b-chat-hf" 2048 4096 "fp16" 1 +bash run_hf_model.sh 16 "inductor" "meta-llama/Llama-2-7b-chat-hf" 2048 4096 "fp16" 1 +bash run_hf_model.sh 64 "inductor" "meta-llama/Llama-2-7b-chat-hf" 2048 4096 "fp16" 1 +bash 
run_hf_model.sh 256 "inductor" "meta-llama/Llama-2-7b-chat-hf" 2048 4096 "fp16" 1 \ No newline at end of file diff --git a/tools/perf/hf_models/benchmark_llama3.1_instruct.sh b/tools/perf/hf_models/benchmark_llama3.1_instruct.sh new file mode 100644 index 0000000000..d6f0bca8ab --- /dev/null +++ b/tools/perf/hf_models/benchmark_llama3.1_instruct.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# Usage : bash run_hf_model.sh + +# "meta-llama/Meta-Llama-3.1-8B-Instruct" model torch backend +# isl, osl = 128, 256 +bash run_hf_model.sh 1 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 256 "fp16" 1 +bash run_hf_model.sh 16 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 256 "fp16" 1 +bash run_hf_model.sh 64 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 256 "fp16" 1 +bash run_hf_model.sh 256 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 256 "fp16" 1 +# isl, osl = 128, 2176 +bash run_hf_model.sh 1 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 2176 "fp16" 1 +bash run_hf_model.sh 16 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 2176 "fp16" 1 +bash run_hf_model.sh 64 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 2176 "fp16" 1 +bash run_hf_model.sh 256 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 2176 "fp16" 1 +# isl, osl = 2048, 2176 +bash run_hf_model.sh 1 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 2176 "fp16" 1 +bash run_hf_model.sh 16 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 2176 "fp16" 1 +bash run_hf_model.sh 64 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 2176 "fp16" 1 +bash run_hf_model.sh 256 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 2176 "fp16" 1 +# isl, osl = 2048, 4096 +bash run_hf_model.sh 1 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 4096 "fp16" 1 +bash run_hf_model.sh 16 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 4096 "fp16" 1 +bash run_hf_model.sh 64 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 4096 "fp16" 1 +bash run_hf_model.sh 256 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 4096 "fp16" 1 + +# "meta-llama/Meta-Llama-3.1-8B-Instruct" model dynamo backend +# isl, osl = 128, 256 +bash run_hf_model.sh 1 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 256 "fp16" 1 +bash run_hf_model.sh 16 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 256 "fp16" 1 +bash run_hf_model.sh 64 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 256 "fp16" 1 +bash run_hf_model.sh 256 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 256 "fp16" 1 +# isl, osl = 128, 2176 +bash run_hf_model.sh 1 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 2176 "fp16" 1 +bash run_hf_model.sh 16 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 2176 "fp16" 1 +bash run_hf_model.sh 64 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 2176 "fp16" 1 +bash run_hf_model.sh 256 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 2176 "fp16" 1 +# isl, osl = 2048, 2176 +bash run_hf_model.sh 1 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 2176 "fp16" 1 +bash run_hf_model.sh 16 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 2176 "fp16" 1 +bash run_hf_model.sh 64 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 2176 "fp16" 1 +bash run_hf_model.sh 256 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 2176 "fp16" 1 +# isl, osl = 2048, 4096 +bash run_hf_model.sh 1 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 4096 "fp16" 1 +bash run_hf_model.sh 16 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 4096 "fp16" 1 +bash run_hf_model.sh 64 "dynamo" 
"meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 4096 "fp16" 1 +bash run_hf_model.sh 256 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 4096 "fp16" 1 + +# "meta-llama/Meta-Llama-3.1-8B-Instruct" model inductor backend +# isl, osl = 128, 256 +bash run_hf_model.sh 1 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 256 "fp16" 1 +bash run_hf_model.sh 16 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 256 "fp16" 1 +bash run_hf_model.sh 64 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 256 "fp16" 1 +bash run_hf_model.sh 256 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 256 "fp16" 1 +# isl, osl = 128, 2176 +bash run_hf_model.sh 1 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 2176 "fp16" 1 +bash run_hf_model.sh 16 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 2176 "fp16" 1 +bash run_hf_model.sh 64 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 2176 "fp16" 1 +bash run_hf_model.sh 256 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 2176 "fp16" 1 +# isl, osl = 2048, 2176 +bash run_hf_model.sh 1 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 2176 "fp16" 1 +bash run_hf_model.sh 16 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 2176 "fp16" 1 +bash run_hf_model.sh 64 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 2176 "fp16" 1 +bash run_hf_model.sh 256 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 2176 "fp16" 1 +# isl, osl = 2048, 4096 +bash run_hf_model.sh 1 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 4096 "fp16" 1 +bash run_hf_model.sh 16 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 4096 "fp16" 1 +bash run_hf_model.sh 64 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 4096 "fp16" 1 +bash run_hf_model.sh 256 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 4096 "fp16" 1 \ No newline at end of file diff --git a/tools/perf/hf_models/benchmark_llama3.sh b/tools/perf/hf_models/benchmark_llama3.sh new file mode 100644 index 0000000000..a0c48c3c51 --- /dev/null +++ b/tools/perf/hf_models/benchmark_llama3.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# Usage : bash run_hf_model.sh + +# "meta-llama/Meta-Llama-3-8B" model torch backend +# isl, osl = 128, 256 +bash run_hf_model.sh 1 "torch" "meta-llama/Meta-Llama-3-8B" 128 256 "fp16" 1 +bash run_hf_model.sh 16 "torch" "meta-llama/Meta-Llama-3-8B" 128 256 "fp16" 1 +bash run_hf_model.sh 64 "torch" "meta-llama/Meta-Llama-3-8B" 128 256 "fp16" 1 +bash run_hf_model.sh 256 "torch" "meta-llama/Meta-Llama-3-8B" 128 256 "fp16" 1 +# isl, osl = 128, 2176 +bash run_hf_model.sh 1 "torch" "meta-llama/Meta-Llama-3-8B" 128 2176 "fp16" 1 +bash run_hf_model.sh 16 "torch" "meta-llama/Meta-Llama-3-8B" 128 2176 "fp16" 1 +bash run_hf_model.sh 64 "torch" "meta-llama/Meta-Llama-3-8B" 128 2176 "fp16" 1 +bash run_hf_model.sh 256 "torch" "meta-llama/Meta-Llama-3-8B" 128 2176 "fp16" 1 +# isl, osl = 2048, 2176 +bash run_hf_model.sh 1 "torch" "meta-llama/Meta-Llama-3-8B" 2048 2176 "fp16" 1 +bash run_hf_model.sh 16 "torch" "meta-llama/Meta-Llama-3-8B" 2048 2176 "fp16" 1 +bash run_hf_model.sh 64 "torch" "meta-llama/Meta-Llama-3-8B" 2048 2176 "fp16" 1 +bash run_hf_model.sh 256 "torch" "meta-llama/Meta-Llama-3-8B" 2048 2176 "fp16" 1 +# isl, osl = 2048, 4096 +bash run_hf_model.sh 1 "torch" "meta-llama/Meta-Llama-3-8B" 2048 4096 "fp16" 1 +bash run_hf_model.sh 16 "torch" "meta-llama/Meta-Llama-3-8B" 2048 4096 "fp16" 1 +bash run_hf_model.sh 64 "torch" "meta-llama/Meta-Llama-3-8B" 2048 4096 "fp16" 1 +bash run_hf_model.sh 256 "torch" "meta-llama/Meta-Llama-3-8B" 2048 4096 
"fp16" 1 + +# "meta-llama/Meta-Llama-3-8B" model dynamo backend +# isl, osl = 128, 256 +bash run_hf_model.sh 1 "dynamo" "meta-llama/Meta-Llama-3-8B" 128 256 "fp16" 1 +bash run_hf_model.sh 16 "dynamo" "meta-llama/Meta-Llama-3-8B" 128 256 "fp16" 1 +bash run_hf_model.sh 64 "dynamo" "meta-llama/Meta-Llama-3-8B" 128 256 "fp16" 1 +bash run_hf_model.sh 256 "dynamo" "meta-llama/Meta-Llama-3-8B" 128 256 "fp16" 1 +# isl, osl = 128, 2176 +bash run_hf_model.sh 1 "dynamo" "meta-llama/Meta-Llama-3-8B" 128 2176 "fp16" 1 +bash run_hf_model.sh 16 "dynamo" "meta-llama/Meta-Llama-3-8B" 128 2176 "fp16" 1 +bash run_hf_model.sh 64 "dynamo" "meta-llama/Meta-Llama-3-8B" 128 2176 "fp16" 1 +bash run_hf_model.sh 256 "dynamo" "meta-llama/Meta-Llama-3-8B" 128 2176 "fp16" 1 +# isl, osl = 2048, 2176 +bash run_hf_model.sh 1 "dynamo" "meta-llama/Meta-Llama-3-8B" 2048 2176 "fp16" 1 +bash run_hf_model.sh 16 "dynamo" "meta-llama/Meta-Llama-3-8B" 2048 2176 "fp16" 1 +bash run_hf_model.sh 64 "dynamo" "meta-llama/Meta-Llama-3-8B" 2048 2176 "fp16" 1 +bash run_hf_model.sh 256 "dynamo" "meta-llama/Meta-Llama-3-8B" 2048 2176 "fp16" 1 +# isl, osl = 2048, 4096 +bash run_hf_model.sh 1 "dynamo" "meta-llama/Meta-Llama-3-8B" 2048 4096 "fp16" 1 +bash run_hf_model.sh 16 "dynamo" "meta-llama/Meta-Llama-3-8B" 2048 4096 "fp16" 1 +bash run_hf_model.sh 64 "dynamo" "meta-llama/Meta-Llama-3-8B" 2048 4096 "fp16" 1 +bash run_hf_model.sh 256 "dynamo" "meta-llama/Meta-Llama-3-8B" 2048 4096 "fp16" 1 + +# "meta-llama/Meta-Llama-3-8B" model inductor backend +# isl, osl = 128, 256 +bash run_hf_model.sh 1 "inductor" "meta-llama/Meta-Llama-3-8B" 128 256 "fp16" 1 +bash run_hf_model.sh 16 "inductor" "meta-llama/Meta-Llama-3-8B" 128 256 "fp16" 1 +bash run_hf_model.sh 64 "inductor" "meta-llama/Meta-Llama-3-8B" 128 256 "fp16" 1 +bash run_hf_model.sh 256 "inductor" "meta-llama/Meta-Llama-3-8B" 128 256 "fp16" 1 +# isl, osl = 128, 2176 +bash run_hf_model.sh 1 "inductor" "meta-llama/Meta-Llama-3-8B" 128 2176 "fp16" 1 +bash run_hf_model.sh 16 "inductor" "meta-llama/Meta-Llama-3-8B" 128 2176 "fp16" 1 +bash run_hf_model.sh 64 "inductor" "meta-llama/Meta-Llama-3-8B" 128 2176 "fp16" 1 +bash run_hf_model.sh 256 "inductor" "meta-llama/Meta-Llama-3-8B" 128 2176 "fp16" 1 +# isl, osl = 2048, 2176 +bash run_hf_model.sh 1 "inductor" "meta-llama/Meta-Llama-3-8B" 2048 2176 "fp16" 1 +bash run_hf_model.sh 16 "inductor" "meta-llama/Meta-Llama-3-8B" 2048 2176 "fp16" 1 +bash run_hf_model.sh 64 "inductor" "meta-llama/Meta-Llama-3-8B" 2048 2176 "fp16" 1 +bash run_hf_model.sh 256 "inductor" "meta-llama/Meta-Llama-3-8B" 2048 2176 "fp16" 1 +# isl, osl = 2048, 4096 +bash run_hf_model.sh 1 "inductor" "meta-llama/Meta-Llama-3-8B" 2048 4096 "fp16" 1 +bash run_hf_model.sh 16 "inductor" "meta-llama/Meta-Llama-3-8B" 2048 4096 "fp16" 1 +bash run_hf_model.sh 64 "inductor" "meta-llama/Meta-Llama-3-8B" 2048 4096 "fp16" 1 +bash run_hf_model.sh 256 "inductor" "meta-llama/Meta-Llama-3-8B" 2048 4096 "fp16" 1 \ No newline at end of file diff --git a/tools/perf/hf_models/benchmark_mistral7b.sh b/tools/perf/hf_models/benchmark_mistral7b.sh new file mode 100644 index 0000000000..7d1e71cba4 --- /dev/null +++ b/tools/perf/hf_models/benchmark_mistral7b.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# Usage : bash run_hf_model.sh + +# "mistralai/Mistral-7B-Instruct-v0.3" model torch backend +# isl, osl = 128, 256 +bash run_hf_model.sh 1 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 128 256 "fp16" 1 +bash run_hf_model.sh 16 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 128 256 "fp16" 1 +bash run_hf_model.sh 64 "torch" 
"mistralai/Mistral-7B-Instruct-v0.3" 128 256 "fp16" 1 +bash run_hf_model.sh 256 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 128 256 "fp16" 1 +# isl, osl = 128, 2176 +bash run_hf_model.sh 1 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 128 2176 "fp16" 1 +bash run_hf_model.sh 16 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 128 2176 "fp16" 1 +bash run_hf_model.sh 64 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 128 2176 "fp16" 1 +bash run_hf_model.sh 256 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 128 2176 "fp16" 1 +# isl, osl = 2048, 2176 +bash run_hf_model.sh 1 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 2048 2176 "fp16" 1 +bash run_hf_model.sh 16 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 2048 2176 "fp16" 1 +bash run_hf_model.sh 64 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 2048 2176 "fp16" 1 +bash run_hf_model.sh 256 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 2048 2176 "fp16" 1 +# isl, osl = 2048, 4096 +bash run_hf_model.sh 1 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 2048 4096 "fp16" 1 +bash run_hf_model.sh 16 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 2048 4096 "fp16" 1 +bash run_hf_model.sh 64 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 2048 4096 "fp16" 1 +bash run_hf_model.sh 256 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 2048 4096 "fp16" 1 + +# "mistralai/Mistral-7B-Instruct-v0.3" model dynamo backend +# isl, osl = 128, 256 +bash run_hf_model.sh 1 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 128 256 "fp16" 1 +bash run_hf_model.sh 16 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 128 256 "fp16" 1 +bash run_hf_model.sh 64 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 128 256 "fp16" 1 +bash run_hf_model.sh 256 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 128 256 "fp16" 1 +# isl, osl = 128, 2176 +bash run_hf_model.sh 1 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 128 2176 "fp16" 1 +bash run_hf_model.sh 16 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 128 2176 "fp16" 1 +bash run_hf_model.sh 64 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 128 2176 "fp16" 1 +bash run_hf_model.sh 256 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 128 2176 "fp16" 1 +# isl, osl = 2048, 2176 +bash run_hf_model.sh 1 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 2048 2176 "fp16" 1 +bash run_hf_model.sh 16 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 2048 2176 "fp16" 1 +bash run_hf_model.sh 64 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 2048 2176 "fp16" 1 +bash run_hf_model.sh 256 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 2048 2176 "fp16" 1 +# isl, osl = 2048, 4096 +bash run_hf_model.sh 1 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 2048 4096 "fp16" 1 +bash run_hf_model.sh 16 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 2048 4096 "fp16" 1 +bash run_hf_model.sh 64 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 2048 4096 "fp16" 1 +bash run_hf_model.sh 256 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 2048 4096 "fp16" 1 + +# "mistralai/Mistral-7B-Instruct-v0.3" model inductor backend +# isl, osl = 128, 256 +bash run_hf_model.sh 1 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 128 256 "fp16" 1 +bash run_hf_model.sh 16 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 128 256 "fp16" 1 +bash run_hf_model.sh 64 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 128 256 "fp16" 1 +bash run_hf_model.sh 256 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 128 256 "fp16" 1 +# isl, osl = 128, 2176 +bash run_hf_model.sh 1 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 128 2176 "fp16" 1 +bash run_hf_model.sh 16 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 128 2176 "fp16" 1 +bash run_hf_model.sh 64 
"inductor" "mistralai/Mistral-7B-Instruct-v0.3" 128 2176 "fp16" 1 +bash run_hf_model.sh 256 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 128 2176 "fp16" 1 +# isl, osl = 2048, 2176 +bash run_hf_model.sh 1 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 2048 2176 "fp16" 1 +bash run_hf_model.sh 16 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 2048 2176 "fp16" 1 +bash run_hf_model.sh 64 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 2048 2176 "fp16" 1 +bash run_hf_model.sh 256 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 2048 2176 "fp16" 1 +# isl, osl = 2048, 4096 +bash run_hf_model.sh 1 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 2048 4096 "fp16" 1 +bash run_hf_model.sh 16 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 2048 4096 "fp16" 1 +bash run_hf_model.sh 64 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 2048 4096 "fp16" 1 +bash run_hf_model.sh 256 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 2048 4096 "fp16" 1 \ No newline at end of file diff --git a/tools/perf/perf_run.py b/tools/perf/perf_run.py index c52fb6ba56..5d41df13d6 100644 --- a/tools/perf/perf_run.py +++ b/tools/perf/perf_run.py @@ -18,10 +18,14 @@ import torch_tensorrt as torchtrt from utils import ( BENCHMARK_MODELS, + export_llm, parse_backends, parse_inputs, parse_precisions, precision_to_dtype, + time_generate, + torch_device_from_trt, + torch_dtype_from_trt, ) WARMUP_ITER = 10 @@ -41,30 +45,120 @@ def wrapper_func(*args, **kwargs): return wrapper_func -# Runs inference using Torch backend -@run_with_try_except -def run_torch(model, input_tensors, params, precision, batch_size): - print("Running Torch for precision: ", precision, " batch_size : ", batch_size) - iters = params.get("iterations", 20) +def recordStats(backend, timings, precision, batch_size=1, compile_time_s=None): + """ + Records different timing stats and adds it to the result + """ + times = np.array(timings) + speeds = batch_size / times + time_mean = np.mean(times) + time_med = np.median(times) + time_99th = np.percentile(times, 99) + time_std = np.std(times, ddof=0) + speed_mean = np.mean(speeds) + speed_med = np.median(speeds) + stats = { + "Backend": backend, + "Precision": precision, + "Batch size": batch_size, + "Median(FPS)": speed_med, + "Mean(FPS)": speed_mean, + "Median-Latency(ms)": time_med * 1000, + "Mean-Latency(ms)": time_mean * 1000, + "Latency-StdDev(ms)": time_std * 1000, + "Compile Time(s)": compile_time_s, + } + results.append(stats) + + +def record_llm_perf( + model, + backend, + input_tensors, + precision, + output_seq_length, + batch_size, + iterations, + compile_time_s=None, +): + """ + Measure LLM generation time and record the stats + """ + # We only support single input (B x seq_len) for LLMs now + input_seq = input_tensors[0] + with torch.no_grad(): + # Warm up for 3 iterations + _ = time_generate(model, input_seq, output_seq_length, iterations=iterations) + + torch.cuda.synchronize() + + # Actual perf measurement + timings = time_generate( + model, input_seq, output_seq_length, iterations=iterations + ) + + recordStats( + "Torch-TensorRT " + backend, timings, precision, batch_size, compile_time_s + ) + + +def record_perf( + model, + backend, + input_tensors, + precision, + iterations, + batch_size, + compile_time_s=None, +): + """ + Run the model for certain number of iterations and record the perf. 
+ Model is warmed up initially + """ # Warm up with torch.no_grad(): for _ in range(WARMUP_ITER): - features = model(*input_tensors) + model(*input_tensors) torch.cuda.synchronize() timings = [] with torch.no_grad(): - for i in range(iters): + for i in range(iterations): start_time = timeit.default_timer() - features = model(*input_tensors) + _ = model(*input_tensors) torch.cuda.synchronize() end_time = timeit.default_timer() - meas_time = end_time - start_time - timings.append(meas_time) + timings.append(end_time - start_time) + + recordStats( + "Torch-TensorRT " + backend, timings, precision, batch_size, compile_time_s + ) + + +# Runs inference using Torch backend +@run_with_try_except +def run_torch(model, input_tensors, params, precision, batch_size): + print("Running Torch for precision: ", precision, " batch_size : ", batch_size) + iters = params.get("iterations", 20) + model = model.to("cuda:0") + if params["is_text_llm"]: + output_seq_length = params["output_sequence_length"] + return record_llm_perf( + model, + "Torch", + input_tensors, + precision, + output_seq_length, + batch_size, + iters, + None, + ) - recordStats("Torch", timings, precision, batch_size) + record_perf( + model, "Torch", input_tensors, precision, iters, batch_size, compile_time_s=None + ) # Runs inference using Torch-TensorRT backend @@ -86,31 +180,55 @@ def run_ts_trt(model, input_tensors, params, precision, batch_size): if precision == "int8": compile_settings.update({"calib": params.get("calibration_cache")}) - start_compile = time.time_ns() + start_compile = timeit.default_timer() model = torchtrt.compile(model, ir="ts", **compile_settings) - end_compile = time.time_ns() - compile_time_s = (end_compile - start_compile) / 1e9 + end_compile = timeit.default_timer() + compile_time_s = end_compile - start_compile iters = params.get("iterations", 20) - # Warm up - with torch.no_grad(): - for _ in range(WARMUP_ITER): - features = model(*input_tensors) - torch.cuda.synchronize() + record_perf( + model, + "Torchscript", + input_tensors, + precision, + iters, + batch_size, + compile_time_s, + ) - timings = [] - with torch.no_grad(): - for i in range(iters): - start_time = timeit.default_timer() - features = model(*input_tensors) - torch.cuda.synchronize() - end_time = timeit.default_timer() - meas_time = end_time - start_time - timings.append(meas_time) - recordStats( - "Torch-TensorRT [Torchscript]", timings, precision, batch_size, compile_time_s +@run_with_try_except +def run_hf_dynamo(model, input_tensors, params, precision, batch_size): + """ + Compile the huggingface model using Torch-TensorRT dynamo frontend and record performance stats + """ + + osl = params["output_sequence_length"] + iters = params.get("iterations", 20) + # Move the model and inputs to cpu and trace it. 
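+    # Exporting on CPU keeps GPU memory free for the TensorRT engine build that follows.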
+ model = model.to("cpu") + inputs_cpu = [tensor.clone().cpu() for tensor in input_tensors] + exp_program = export_llm(model, inputs_cpu, min_seq_len=1, max_seq_len=osl) + start_compile = timeit.default_timer() + + trt_model = torchtrt.dynamo.compile( + exp_program, + inputs=input_tensors, + enabled_precisions={precision_to_dtype(precision)}, + truncate_double=params.get("truncate", False), + ) + end_compile = timeit.default_timer() + compile_time_s = end_compile - start_compile + record_llm_perf( + trt_model, + "Dynamo", + input_tensors, + precision, + osl, + batch_size, + iters, + compile_time_s, ) @@ -125,7 +243,10 @@ def run_dynamo(model, input_tensors, params, precision, batch_size): " batch_size : ", batch_size, ) - start_compile = time.time_ns() + if params["is_text_llm"]: + return run_hf_dynamo(model, input_tensors, params, precision, batch_size) + + start_compile = timeit.default_timer() model = torchtrt.compile( model, inputs=input_tensors, @@ -135,28 +256,12 @@ def run_dynamo(model, input_tensors, params, precision, batch_size): debug=False, truncate_long_and_double=params.get("truncate", False), ) - end_compile = time.time_ns() - compile_time_s = (end_compile - start_compile) / 1e9 + end_compile = timeit.default_timer() + compile_time_s = end_compile - start_compile iters = params.get("iterations", 20) - # Warm up - with torch.no_grad(): - for _ in range(WARMUP_ITER): - features = model(*input_tensors) - - torch.cuda.synchronize() - - timings = [] - with torch.no_grad(): - for i in range(iters): - start_time = timeit.default_timer() - features = model(*input_tensors) - torch.cuda.synchronize() - end_time = timeit.default_timer() - meas_time = end_time - start_time - timings.append(meas_time) - recordStats( - "Torch-TensorRT [Dynamo]", timings, precision, batch_size, compile_time_s + record_perf( + model, "Dynamo", input_tensors, precision, iters, batch_size, compile_time_s ) @@ -165,6 +270,8 @@ def run_torch_compile(model, input_tensors, params, precision, batch_size): """ Compile the given model using Torch-TensorRT torch.compile frontend and record performance stats """ + # Move the model to GPU + model = model.to("cuda:0") torch._dynamo.reset() print( @@ -176,41 +283,52 @@ def run_torch_compile(model, input_tensors, params, precision, batch_size): compile_spec = { "inputs": input_tensors, "enabled_precisions": {precision_to_dtype(precision)}, - "truncate_long_and_double": params.get("truncate", False), + "truncate": params.get("truncate", False), "min_block_size": params.get("min_block_size", 1), } - start_compile = time.time_ns() - model = torch.compile( - model, backend="tensorrt", dynamic=False, options=compile_spec - ) + start_compile = timeit.default_timer() + model = torch.compile(model, backend="tensorrt", dynamic=None, options=compile_spec) model(*input_tensors) - end_compile = time.time_ns() - compile_time_s = (end_compile - start_compile) / 1e9 + end_compile = timeit.default_timer() + compile_time_s = end_compile - start_compile iters = params.get("iterations", 20) - # Warm up - with torch.no_grad(): - for _ in range(WARMUP_ITER): - features = model(*input_tensors) - torch.cuda.synchronize() + record_perf( + model, + "torch_compile", + input_tensors, + precision, + iters, + batch_size, + compile_time_s, + ) - timings = [] - with torch.no_grad(): - for i in range(iters): - start_time = timeit.default_timer() - features = model(*input_tensors) - torch.cuda.synchronize() - end_time = timeit.default_timer() - meas_time = end_time - start_time - 
timings.append(meas_time) - # Reset torch dynamo cache - torch._dynamo.reset() - recordStats( - "Torch-TensorRT [torch_compile]", - timings, +@run_with_try_except +def run_hf_inductor(model, input_tensors, params, precision, batch_size): + """ + Compile the huggingface model using torch inductor and record performance stats + """ + osl = params["output_sequence_length"] + # Mark dynamic shapes for input sequence + input_seq = input_tensors[0] + torch._dynamo.mark_dynamic(input_seq, 1, min=1, max=osl) + start_compile = timeit.default_timer() + # Compile the model + model = torch.compile(model, backend="inductor", dynamic=None, mode="max-autotune") + model(input_seq) + end_compile = timeit.default_timer() + compile_time_s = end_compile - start_compile + iters = params.get("iterations", 20) + + record_llm_perf( + model, + "Inductor", + input_tensors, precision, + osl, batch_size, + iters, compile_time_s, ) @@ -221,72 +339,28 @@ def run_inductor(model, input_tensors, params, precision, batch_size): Compile the given model using torch inductor and record performance stats """ torch._dynamo.reset() - + model = model.to("cuda:0") print( "Running Torch [inductor] for precision: ", precision, " batch_size : ", batch_size, ) + if params["is_text_llm"]: + return run_hf_inductor(model, input_tensors, params, precision, batch_size) - start_compile = time.time_ns() - model = torch.compile(model, backend="inductor", dynamic=False, mode="max-autotune") + start_compile = timeit.default_timer() + model = torch.compile(model, backend="inductor", dynamic=None, mode="max-autotune") model(*input_tensors) - end_compile = time.time_ns() - compile_time_s = (end_compile - start_compile) / 1e9 + end_compile = timeit.default_timer() + compile_time_s = end_compile - start_compile iters = params.get("iterations", 20) - # Warm up - with torch.no_grad(): - for _ in range(WARMUP_ITER): - features = model(*input_tensors) - - torch.cuda.synchronize() - - timings = [] - with torch.no_grad(): - for i in range(iters): - start_time = timeit.default_timer() - features = model(*input_tensors) - torch.cuda.synchronize() - end_time = timeit.default_timer() - meas_time = end_time - start_time - timings.append(meas_time) - # Reset torch dynamo cache - torch._dynamo.reset() - recordStats( - "Torch [inductor]", - timings, - precision, - batch_size, - compile_time_s, + record_perf( + model, "inductor", input_tensors, precision, iters, batch_size, compile_time_s ) -def torch_dtype_from_trt(dtype): - if dtype == trt.int8: - return torch.int8 - elif dtype == trt.bool: - return torch.bool - elif dtype == trt.int32: - return torch.int32 - elif dtype == trt.float16: - return torch.float16 - elif dtype == trt.float32: - return torch.float32 - else: - raise TypeError("%s is not supported by torch" % dtype) - - -def torch_device_from_trt(device): - if device == trt.TensorLocation.DEVICE: - return torch.device("cuda") - elif device == trt.TensorLocation.HOST: - return torch.device("cpu") - else: - return TypeError("%s is not supported by torch" % device) - - @run_with_try_except def run_tensorrt( model, @@ -310,10 +384,10 @@ def run_tensorrt( config = builder.create_builder_config() if precision == "fp16": config.set_flag(trt.BuilderFlag.FP16) - start_compile = time.time_ns() + start_compile = timeit.default_timer() serialized_engine = builder.build_serialized_network(network, config) - end_compile = time.time_ns() - compile_time_s = (end_compile - start_compile) / 1e9 + end_compile = timeit.default_timer() + compile_time_s = end_compile - 
start_compile # Deserialize the TensorRT engine with trt.Runtime(logger) as runtime: engine = runtime.deserialize_cuda_engine(serialized_engine) @@ -443,32 +517,6 @@ def run( run_inductor(model_torch, input_tensors, params, precision, batch_size) -# Generate report -def recordStats(backend, timings, precision, batch_size=1, compile_time_s=None): - times = np.array(timings) - steps = len(times) - speeds = batch_size / times - time_mean = np.mean(times) - time_med = np.median(times) - time_99th = np.percentile(times, 99) - time_std = np.std(times, ddof=0) - speed_mean = np.mean(speeds) - speed_med = np.median(speeds) - - stats = { - "Backend": backend, - "Precision": precision, - "Batch size": batch_size, - "Median(FPS)": speed_med, - "Mean(FPS)": speed_mean, - "Median-Latency(ms)": time_med * 1000, - "Mean-Latency(ms)": time_mean * 1000, - "Latency-StdDev(ms)": time_std * 1000, - "Compile Time(s)": compile_time_s, - } - results.append(stats) - - if __name__ == "__main__": arg_parser = argparse.ArgumentParser( description="Run inference on a model with random input values" @@ -493,9 +541,24 @@ def recordStats(backend, timings, precision, batch_size=1, compile_time_s=None): type=str, help="List of input shapes. Eg: (1, 3, 224, 224)@fp32 for Resnet or (1, 128)@int32;(1, 128)@int32 for BERT", ) + arg_parser.add_argument( + "--is_text_llm", + action="store_true", + help="Boolean flag to determine if model is a huggingface model", + ) + arg_parser.add_argument( + "-osl", + "--output_sequence_length", + type=int, + help="Length of output sequence to HF model", + default=128, + ) arg_parser.add_argument( "--batch_size", type=int, default=1, help="Batch size to build and run" ) + arg_parser.add_argument( + "--iterations", type=int, default=20, help="Iterations to measure the perf" + ) arg_parser.add_argument( "--precision", default="fp32", @@ -542,16 +605,14 @@ def recordStats(backend, timings, precision, batch_size=1, compile_time_s=None): # Load PyTorch Model, if provided if len(model_name_torch) > 0 and os.path.exists(model_name_torch): print("Loading user provided torch model: ", model_name_torch) - model_torch = torch.load(model_name_torch).eval().cuda() + model_torch = torch.load(model_name_torch).eval() elif model_name_torch in BENCHMARK_MODELS: - model_torch = BENCHMARK_MODELS[model_name_torch]["model"].eval().cuda() + model_torch = BENCHMARK_MODELS[model_name_torch]["model"].eval() # If neither model type was provided if (model is None) and (model_torch is None): raise ValueError( - "No valid models specified. Please provide a torchscript model file or model name " - + "(among the following options vgg16|resnet50|efficientnet_b0|vit) " - + "or provide a torch model file" + "No valid models specified. 
Please provide a torchscript model file or model name (defined in hub.py) or model_hf name in huggingface models " ) backends = parse_backends(params["backends"]) diff --git a/tools/perf/requirements.txt b/tools/perf/requirements.txt index 881241e24d..fcfb0b3d53 100644 --- a/tools/perf/requirements.txt +++ b/tools/perf/requirements.txt @@ -2,8 +2,8 @@ numpy argparse pyyaml onnx -transformers==4.38.0 +pandas +transformers diffusers==0.21.4 -pandas==2.0.1 timm==0.9.8 diff --git a/tools/perf/run_hf_model.sh b/tools/perf/run_hf_model.sh new file mode 100644 index 0000000000..ced2a46dfb --- /dev/null +++ b/tools/perf/run_hf_model.sh @@ -0,0 +1,26 @@ +#!/bin/bash +batch_size=$1 +backend=$2 +model_name=$3 +isl=$4 +osl=$5 +precision=$6 +iterations=$7 +modified_model_name=$(echo "$model_name" | sed 's/\//-/g') +echo "Benchmarking ${model_name} model for bs ${batch_size} with ISL ${isl}, OSL ${osl} and backend ${backend} for ${iterations} iterations" +python perf_run.py --model_torch ${model_name} \ + --is_text_llm \ + --precision ${precision} \ + --inputs "(${batch_size}, ${isl})@int64" \ + --output_sequence_length ${osl} \ + --batch_size ${batch_size} \ + --truncate \ + --backends ${backend} \ + --iterations ${iterations} \ + --report "${modified_model_name}_perf_bs${batch_size}_backend_${backend}_isl${isl}_osl${osl}.csv" + +# Move the report file to the mounted volume in the docker +mv "${modified_model_name}_perf_bs${batch_size}_backend_${backend}_isl${isl}_osl${osl}.csv" /work + +# Clear HF cache +rm -rf ~/.cache/huggingface/hub/ diff --git a/tools/perf/stage1.sh b/tools/perf/stage1.sh new file mode 100644 index 0000000000..3cdf292a4a --- /dev/null +++ b/tools/perf/stage1.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# global parameters +precision="fp16" +iterations=1 +backends=("dynamo" "inductor") +batch_sizes=(1 16) +hf_token="" +image_name="gitlab-master.nvidia.com:5005/torch-tensorrt/tensorrt/torch_tensorrt:latest" + +# Stage 1 : GPT2 experiment +models=("gpt2") +isl=(128) +osl=(256) +for model in ${models[@]} + do + for bs in ${batch_sizes[@]} + do + for backend in ${backends[@]} + do + for i in ${!isl[@]}; + do + docker run --rm -it --gpus 0 --shm-size=10.24g --ulimit stack=67108864 -v "$PWD:/work" --ipc=host ${image_name} /bin/bash -c "cd /opt/torch_tensorrt/tools/perf; HF_TOKEN="${hf_token}" bash run_hf_model.sh "${bs}" "$backend" "$model" "${isl[i]}" "${osl[i]}" "${precision}" "${iterations}"; exit" + done + done + done + done +# Clear HF cache +rm -rf ~/.cache/huggingface/hub/ + +# Stage 2 : non-GPT2 experiments +isl=(128 128) +osl=(256 2176) +models=("meta-llama/Meta-Llama-3.1-8B-Instruct" "meta-llama/Llama-2-7b-chat-hf" "mistralai/Mistral-7B-Instruct-v0.3") +backends=("dynamo" "inductor") +for model in ${models[@]} + do + for bs in ${batch_sizes[@]} + do + for backend in ${backends[@]} + do + for i in ${!isl[@]}; + do + docker run --rm -it --gpus 0 --shm-size=10.24g --ulimit stack=67108864 -v "$PWD:/work" --ipc=host ${image_name} /bin/bash -c "cd /opt/torch_tensorrt/tools/perf; HF_TOKEN="${hf_token}" bash run_hf_model.sh "${bs}" "$backend" "$model" "${isl[i]}" "${osl[i]}" "${precision}" "${iterations}"; exit" + done + done + done + done +# Clear HF cache +rm -rf ~/.cache/huggingface/hub/ \ No newline at end of file diff --git a/tools/perf/stage2.sh b/tools/perf/stage2.sh new file mode 100644 index 0000000000..8ede056ff2 --- /dev/null +++ b/tools/perf/stage2.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# global parameters +precision="fp16" +iterations=1 +backends=("dynamo" "inductor") +batch_sizes=(1 16) 
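+# hf_token is forwarded into the container as HF_TOKEN so the gated HuggingFace checkpoints can be downloaded.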
+hf_token="" +image_name="gitlab-master.nvidia.com:5005/torch-tensorrt/tensorrt/torch_tensorrt:latest" + +# Stage 2 : slower non-GPT2 experiments +isl=(2048) +osl=(2176) +models=("meta-llama/Meta-Llama-3.1-8B-Instruct" "meta-llama/Llama-2-7b-chat-hf" "mistralai/Mistral-7B-Instruct-v0.3") +backends=("dynamo" "inductor") +for model in ${models[@]} + do + for bs in ${batch_sizes[@]} + do + for backend in ${backends[@]} + do + for i in ${!isl[@]}; + do + docker run --rm -it --gpus 0 --shm-size=10.24g --ulimit stack=67108864 -v "$PWD:/work" --ipc=host ${image_name} /bin/bash -c "cd /opt/torch_tensorrt/tools/perf; HF_TOKEN="${hf_token}" bash run_hf_model.sh "${bs}" "$backend" "$model" "${isl[i]}" "${osl[i]}" "${precision}" "${iterations}"; exit" + done + done + done + done +# Clear HF cache +rm -rf ~/.cache/huggingface/hub/ \ No newline at end of file diff --git a/tools/perf/utils.py b/tools/perf/utils.py index a6f8ba236d..9e91c2a28d 100644 --- a/tools/perf/utils.py +++ b/tools/perf/utils.py @@ -1,7 +1,13 @@ +import copy +import timeit + import custom_models as cm +import numpy as np +import tensorrt as trt import timm import torch import torchvision.models as models +from transformers import AutoModel, AutoModelForCausalLM BENCHMARK_MODEL_NAMES = { "vgg16", @@ -12,9 +18,31 @@ "vit_large", "bert_base_uncased", "sd_unet", + "meta-llama/Llama-2-7b-chat-hf", + "gpt2", + "meta-llama/Meta-Llama-3-8B", + "meta-llama/Meta-Llama-3.1-8B-Instruct", + "apple/DCLM-7B", + "mistralai/Mistral-7B-Instruct-v0.3", + "microsoft/Phi-3-mini-4k-instruct", } +def load_hf_model(model_name_hf): + print("Loading user-specified HF model: ", model_name_hf) + model_hf = ( + AutoModelForCausalLM.from_pretrained( + model_name_hf, + trust_remote_code=True, + use_cache=False, + attn_implementation="eager", + ) + .eval() + ) + + return {"model": model_hf} + + class ModelStorage: def __contains__(self, name: str): return name in BENCHMARK_MODEL_NAMES @@ -63,6 +91,26 @@ def __getitem__(self, name: str): "model": cm.StableDiffusionUnet(), "path": "pytorch", } + elif name in [ + "gpt2", + "meta-llama/Meta-Llama-3-8B", + "meta-llama/Llama-2-7b-chat-hf", + "meta-llama/Meta-Llama-3.1-8B-Instruct", + "mistralai/Mistral-7B-Instruct-v0.3", + "microsoft/Phi-3-mini-4k-instruct", + ]: + hf_artifact = load_hf_model(name) + return { + "model": hf_artifact["model"], + "path": "pytorch", + } + elif name == "apple/DCLM-7B": + # Load model directly + hf_artifact = AutoModel.from_pretrained("apple/DCLM-7B") + return { + "model": hf_artifact["model"], + "path": "pytorch", + } else: raise AssertionError(f"Invalid model name {name}") @@ -81,6 +129,8 @@ def precision_to_dtype(pr): return torch.half elif pr == "int32": return torch.int32 + elif pr == "int64": + return torch.int64 elif pr == "bool": return torch.bool else: @@ -102,7 +152,7 @@ def parse_inputs(user_inputs, dtype): input_shape.append(int(input_dim)) if input_shape != [1]: - if dtype == torch.int32: + if dtype == torch.int32 or dtype == torch.int64: torchtrt_inputs.append( torch.randint(0, 5, input_shape, dtype=dtype).cuda() ) @@ -120,3 +170,91 @@ def parse_backends(backends): def parse_precisions(precisions): return precisions.split(",") + + +def torch_dtype_from_trt(dtype): + if dtype == trt.int8: + return torch.int8 + elif dtype == trt.bool: + return torch.bool + elif dtype == trt.int32: + return torch.int32 + elif dtype == trt.float16: + return torch.float16 + elif dtype == trt.float32: + return torch.float32 + else: + raise TypeError("%s is not supported by torch" % dtype) + + +def 
torch_device_from_trt(device): + if device == trt.TensorLocation.DEVICE: + return torch.device("cuda") + elif device == trt.TensorLocation.HOST: + return torch.device("cpu") + else: + return TypeError("%s is not supported by torch" % device) + + +def export_llm(model, inputs, min_seq_len=1, max_seq_len=16): + """ + Exports the LLM model into an ExportedProgram with dynamic shapes. + In the case of guard failures due to some PyTorch kernel implements, we also + try to re-export the graph by expressing them as runtime assert nodes + """ + assert isinstance(inputs, list) + + with torch.no_grad(): + # max=1024 has contraint violation error. https://github.com/pytorch/pytorch/issues/125604 + seq_len = torch.export.Dim("seq_len", min=min_seq_len, max=max_seq_len) + try: + print("Trying to export the model using torch.export.export()..") + # strict=False only enables aotautograd tracing and excludes dynamo. + ep = torch.export.export( + model, tuple(inputs), dynamic_shapes=({1: seq_len},), strict=False + ) + except: + print( + "Trying torch.export._trace._export to trace the graph since torch.export.export() failed" + ) + # This API is used to express the constraint violation guards as asserts in the graph. + ep = torch.export._trace._export( + model, + (inputs,), + dynamic_shapes=({1: seq_len},), + strict=False, + allow_complex_guards_as_runtime_asserts=True, + ) + + return ep + + +def generate(model, input_seq, output_seq_length): + """ + Greedy decoding of the model. This generates up to max_tokens. + """ + + while input_seq.shape[1] <= output_seq_length: + outputs = model(input_seq) + logits = outputs.logits + next_token_logits = logits[:, -1, :] + next_tokens = torch.argmax(next_token_logits, dim=-1) + input_seq = torch.cat([input_seq, next_tokens[:, None]], dim=-1) + + return input_seq + + +def time_generate(model, inputs, output_seq_length, iterations=10): + """ + Measure the time for generating a sentence over certain number of iterations + """ + timings = [] + for _ in range(iterations): + start_time = timeit.default_timer() + inputs_copy = copy.copy(inputs) + _ = generate(model, inputs_copy, output_seq_length) + torch.cuda.synchronize() + end_time = timeit.default_timer() + timings.append(end_time - start_time) + + return timings From e4873d0140931a271f69ed92057dcebc0e972517 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 19 Aug 2024 11:10:00 -0700 Subject: [PATCH 35/55] chore: updates --- .../dynamo/conversion/impl/cast.py | 11 +++--- .../conversion/impl/normalization/ops.py | 7 ++++ py/torch_tensorrt/dynamo/utils.py | 36 ++++++++++++++----- 3 files changed, 40 insertions(+), 14 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/cast.py b/py/torch_tensorrt/dynamo/conversion/impl/cast.py index 6f49547a3d..0b69f98fc9 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/cast.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/cast.py @@ -9,7 +9,10 @@ from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext from torch_tensorrt.dynamo.conversion._ConverterRegistry import ConverterRegistry -from torch_tensorrt.dynamo.conversion.converter_utils import cast_trt_tensor +from torch_tensorrt.dynamo.conversion.converter_utils import ( + cast_trt_tensor, + get_trt_tensor, +) from torch_tensorrt.fx.types import TRTDataType, TRTTensor LOGGER: logging.Logger = logging.getLogger(__name__) @@ -20,14 +23,12 @@ def to_copy( target: Target, source_ir: Optional[SourceIR], name: str, - input: TRTTensor, + 
input: Union[TRTTensor, torch.Tensor, np.ndarray], dtype: Union[TRTDataType, torch.dtype, np.dtype, _enums.dtype], force_layer: bool = False, ) -> TRTTensor: if not isinstance(input, TRTTensor): - raise RuntimeError( - f"to_copy received input {input} that is not a TensorRT ITensor" - ) + input = get_trt_tensor(ctx, input, f"{name}_copy_tensor") # If cast is forced, insert identity layer regardless of whether the dtype # doesn't change diff --git a/py/torch_tensorrt/dynamo/conversion/impl/normalization/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/normalization/ops.py index c9599c7476..a796c041e1 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/normalization/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/normalization/ops.py @@ -116,6 +116,13 @@ def layer_norm( axes = get_axes_for_reduce_op(dims) weight = get_trt_tensor(ctx, weight, f"{name}_weight") bias = get_trt_tensor(ctx, bias, f"{name}_bias") + # Cast weight and bias to have same dtype as input + weight = cast_trt_tensor( + ctx, weight, input.dtype, f"{name}_weight_cast", target, source_ir + ) + bias = cast_trt_tensor( + ctx, bias, input.dtype, f"{name}_bias_cast", target, source_ir + ) if tuple(input.shape) != tuple(weight.shape): weight = impl.slice.expand( ctx, target, source_ir, f"{name}_expand_weight", weight, input.shape diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 157bce8f97..85d8148d40 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -8,6 +8,7 @@ import numpy as np import tensorrt as trt import torch +from torch._subclasses.fake_tensor import FakeTensor from torch_tensorrt._Device import Device from torch_tensorrt._enums import dtype from torch_tensorrt._Input import Input @@ -319,24 +320,41 @@ def extract_var_range_info(symbolic_integer: torch.SymInt) -> Dict[str, Any]: return min_max_opt -def unwrap_tensor_shape(tensor: torch.Tensor) -> Sequence[Any]: +def unwrap_tensor_shape( + tensor: Union[torch.Tensor, FakeTensor, torch.SymInt] +) -> Sequence[Any]: """ This is a helper function used to print/return the shape of the tensor. For regular torch.tensor's, it returns the static shape. For symbolic tensors, eg:(1, s0, 4), this function returns [1, [min, max], 4]. The min and max correspond to the lower and upper values of s0 symbolic dimension. """ - tensor_shape: Sequence[Union[int | Sequence[int]]] = [] - for dimension in tensor.shape: - if isinstance(dimension, int): - tensor_shape.append(dimension) - elif isinstance(dimension, torch.SymInt): - min_max_opt = extract_var_range_info(dimension) - tensor_shape.append((min_max_opt["min"], min_max_opt["max"])) + tensor_shape = [] + # for dimension in tensor.shape: + if isinstance(tensor, int): + tensor_shape.append(tensor) + elif isinstance(tensor, torch.SymInt): + min_max_opt = extract_var_range_info(tensor) + tensor_shape.append((min_max_opt["min"], min_max_opt["max"])) + elif isinstance(tensor, (torch.Tensor, FakeTensor)): + for dimension in tensor.shape: + tensor_shape.extend(unwrap_tensor_shape(dimension)) return tuple(tensor_shape) +def unwrap_tensor_dtype(tensor: Union[torch.Tensor, FakeTensor, torch.SymInt]) -> Any: + """ + Returns the dtype of torch.tensor or FakeTensor. 
For symbolic integers, we return int64 + """ + if isinstance(tensor, (torch.Tensor, FakeTensor)): + return tensor.dtype + elif isinstance(tensor, torch.SymInt): + return torch.int64 + else: + raise ValueError(f"Found invalid tensor type {type(tensor)}") + + def get_graph_io_attrs( io_nodes: Sequence[torch.fx.Node], attr_type: str ) -> Sequence[Any]: @@ -344,7 +362,7 @@ def get_graph_io_attrs( Returns a list of attributes (shapes or dtypes) of the I/O nodes """ assert attr_type in ["shape", "dtype"] - attr_fn = unwrap_tensor_shape if attr_type == "shape" else lambda x: x.dtype + attr_fn = unwrap_tensor_shape if attr_type == "shape" else unwrap_tensor_dtype graph_io_attrs = [] for node in io_nodes: if "val" in node.meta: From bb10de4d6cb4742c24ea2bd78753fb189501f694 Mon Sep 17 00:00:00 2001 From: Hoonkyung Cho Date: Tue, 13 Aug 2024 02:29:19 +0900 Subject: [PATCH 36/55] feat: lowering replace aten.full_like with aten.full --- .../dynamo/lowering/_decompositions.py | 14 ++-- .../lowering/passes/_aten_lowering_pass.py | 2 + .../passes/replace_full_like_with_full.py | 43 ++++++++++++ .../py/dynamo/lowering/test_decompositions.py | 68 +++++++++++++++++-- 4 files changed, 115 insertions(+), 12 deletions(-) create mode 100644 py/torch_tensorrt/dynamo/lowering/passes/replace_full_like_with_full.py diff --git a/py/torch_tensorrt/dynamo/lowering/_decompositions.py b/py/torch_tensorrt/dynamo/lowering/_decompositions.py index a74c5077de..7fe0032d80 100644 --- a/py/torch_tensorrt/dynamo/lowering/_decompositions.py +++ b/py/torch_tensorrt/dynamo/lowering/_decompositions.py @@ -168,7 +168,7 @@ def var_decomposition( @register_torch_trt_decomposition( torch.ops.aten.empty_permuted.default, registry=TORCH_TRT_DECOMPOSITIONS ) -def empty_permuted_decomposition(*args, **kwargs) -> torch.Tensor: +def empty_permuted_decomposition(*args, **kwargs) -> torch.Tensor: # type: ignore empty_size = args[0] empty_permute = args[1] perm = [0] * len(empty_size) @@ -188,7 +188,7 @@ def slice_scatter_decomposition( start: Optional[int] = None, end: Optional[int] = None, step: Optional[int] = None, -): +) -> torch.Tensor: dim_size = input_tensor.shape[dim] start = get_positive_dim(start, input_tensor.shape[dim]) if end is None: @@ -197,6 +197,11 @@ def slice_scatter_decomposition( if step is None: step = 1 + # Ensure start, end, and step are all integers + assert isinstance(start, int), "start must be an integer" + assert isinstance(end, int), "end must be an integer" + assert isinstance(step, int), "step must be an integer" + src_dim = src_tensor.shape # step == 0 is not a valid torch case # also src_dim should be equal to slice dimension @@ -233,7 +238,7 @@ def select_scatter_decomposition( @register_torch_trt_decomposition( torch.ops.aten.empty_strided.default, registry=TORCH_TRT_DECOMPOSITIONS ) -def empty_strided_decomposition(*args, **kwargs) -> torch.Tensor: +def empty_strided_decomposition(*args, **kwargs) -> torch.Tensor: # type: ignore empty_size = args[0] empty_stride = args[1] return torch.as_strided( @@ -256,8 +261,7 @@ def scatter_add_decomposition( src_shape = list(src_tensor.shape) src_dim = src_shape[dim] for i in range(0, src_dim): - to_scatter_tensor = torch.zeros_like(input_tensor) - + to_scatter_tensor = torch.zeros(input_tensor.shape, dtype=input_tensor.dtype) # index and src slice src_slice = torch.select(src_tensor, dim, i) index_slice = torch.select(index, dim, i) diff --git a/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py 
b/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py index fd9891c12e..b7c65f1880 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py @@ -12,6 +12,7 @@ from .remove_detach import remove_detach from .remove_input_alias_fixing_clones import remove_input_alias_fixing_clones from .repair_input_as_output import repair_input_as_output +from .replace_full_like_with_full import replace_full_like_with_full from .replace_max_pool_with_indices import replace_max_pool_with_indices from .view_to_reshape import view_to_reshape @@ -24,6 +25,7 @@ lower_linear, fuse_prims_broadcast, replace_max_pool_with_indices, + replace_full_like_with_full, view_to_reshape, remove_assert_scalar, ] diff --git a/py/torch_tensorrt/dynamo/lowering/passes/replace_full_like_with_full.py b/py/torch_tensorrt/dynamo/lowering/passes/replace_full_like_with_full.py new file mode 100644 index 0000000000..35f9b1cd3f --- /dev/null +++ b/py/torch_tensorrt/dynamo/lowering/passes/replace_full_like_with_full.py @@ -0,0 +1,43 @@ +import logging +from typing import Sequence + +import torch +import torch.fx +from torch_tensorrt.dynamo.lowering.passes.pass_utils import ( + clean_up_graph_after_modifications, +) + +logger = logging.getLogger(__name__) + + +def replace_full_like_with_full( + gm: torch.fx.GraphModule, sample_inputs: Sequence[torch.Tensor] +) -> torch.fx.GraphModule: + """Replace full_like nodes with equivalent full nodes""" + modified_graph = False + + for node in gm.graph.nodes: + if node.target == torch.ops.aten.full_like.default: + modified_graph = True + + # Extract arguments from full_like + input_tensor = node.args[0] + fill_value = node.args[1] + shape = list(input_tensor.meta["tensor_meta"].shape) + + # Replace full_like with full, using the shape as a list + with gm.graph.inserting_after(node): + full_node = gm.graph.call_function( + torch.ops.aten.full.default, + args=(shape, fill_value), + kwargs=node.kwargs, + ) + full_node.meta = node.meta + + node.replace_all_uses_with(full_node) + gm.graph.erase_node(node) + + if modified_graph: + gm = clean_up_graph_after_modifications(gm) + + return gm diff --git a/tests/py/dynamo/lowering/test_decompositions.py b/tests/py/dynamo/lowering/test_decompositions.py index a1416c00db..74ac6cde62 100644 --- a/tests/py/dynamo/lowering/test_decompositions.py +++ b/tests/py/dynamo/lowering/test_decompositions.py @@ -421,6 +421,66 @@ def forward(self, x): f"MaxPool3d TRT outputs don't match with the original model.", ) + def test_lowering_full_like_module(self): + class FullLike(torch.nn.Module): + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + def forward(self, x): + y = torch.full_like(x, 2.0) + return y + + # Operations expected to be removed in the traced graph after decompositions + expected_ops = {torch.ops.aten.full.default} + unexpected_ops = {torch.ops.aten.full_like.default} + + inputs = [torch.randn(3, 3, dtype=torch.float32).cuda()] + + fx_graph = torch.fx.symbolic_trace(FullLike()) + unexpected_ops_seen, expected_ops_unseen = lower_graph_testing( + fx_graph, + inputs, + expected_ops=expected_ops, + unexpected_ops=unexpected_ops, + min_block_size=1, + ) + + self.assertEqual( + len(unexpected_ops_seen), + 0, + f"The following unexpected ops were encountered: {unexpected_ops_seen}", + ) + + self.assertEqual( + len(expected_ops_unseen), + 0, + f"The following expected ops were not encountered: {expected_ops_unseen}", + ) + + 
torch._dynamo.reset() + + # Validate that the results between Torch and Torch-TRT are similar + optimized_model = torch_tensorrt.compile( + fx_graph, + "torch_compile", + inputs, + min_block_size=1, + truncate_double=True, + pass_through_build_failures=True, + ) + optimized_model_results = optimized_model(*inputs).detach().cpu() + torch_model_results = fx_graph(*inputs).detach().cpu() + + max_diff = float( + torch.max(torch.abs(optimized_model_results - torch_model_results)) + ) + self.assertAlmostEqual( + max_diff, + 0, + DECIMALS_OF_AGREEMENT, + f"FullLike TRT outputs don't match with the original model.", + ) + def test_lowering_empty_like_module(self): class emptyLike(torch.nn.Module): def __init__(self, *args, **kwargs) -> None: @@ -976,7 +1036,7 @@ def forward(self, input): 0, torch.tensor([[0, 1, 2, 0], [1, 2, 1, 1]]).cuda(), torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=torch.int32).cuda(), - {torch.ops.aten.add.Tensor, torch.ops.aten.scatter.src}, + {torch.ops.aten.add.Tensor}, ), ( "scatter_add_one_dim_indexOne_constant", @@ -985,8 +1045,6 @@ def forward(self, input): torch.tensor([[1, 2, 3, 1]], dtype=torch.int32).cuda(), { torch.ops.aten.add.Tensor, - torch.ops.aten.scatter.src, - torch.ops.aten.full_like.default, }, ), ( @@ -996,8 +1054,6 @@ def forward(self, input): torch.tensor([[1, 2, 3, 1], [5, 6, 5, 5]], dtype=torch.int32).cuda(), { torch.ops.aten.add.Tensor, - torch.ops.aten.scatter.src, - torch.ops.aten.full_like.default, }, ), ( @@ -1009,8 +1065,6 @@ def forward(self, input): ).cuda(), { torch.ops.aten.add.Tensor, - torch.ops.aten.scatter.src, - torch.ops.aten.full_like.default, }, ), ] From 1527aa057d95dc0695d992d0549b4be72564f202 Mon Sep 17 00:00:00 2001 From: Hoonkyung Cho Date: Tue, 13 Aug 2024 02:40:46 +0900 Subject: [PATCH 37/55] chore: minor linting --- .../dynamo/lowering/passes/replace_full_like_with_full.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py/torch_tensorrt/dynamo/lowering/passes/replace_full_like_with_full.py b/py/torch_tensorrt/dynamo/lowering/passes/replace_full_like_with_full.py index 35f9b1cd3f..d09778f3c6 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/replace_full_like_with_full.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/replace_full_like_with_full.py @@ -11,7 +11,7 @@ def replace_full_like_with_full( - gm: torch.fx.GraphModule, sample_inputs: Sequence[torch.Tensor] + gm: torch.fx.GraphModule, ) -> torch.fx.GraphModule: """Replace full_like nodes with equivalent full nodes""" modified_graph = False From 67e33c31ab69c4109b87b350f980e92b4dd8c873 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 19 Aug 2024 13:52:24 -0700 Subject: [PATCH 38/55] chore: updates --- examples/dynamo/torch_compile_gpt2.py | 63 ----------------- examples/dynamo/torch_export_gpt2.py | 38 +++++++++-- examples/dynamo/torch_export_llama2.py | 40 +++++++++-- .../lowering/passes/remove_assert_scalar.py | 2 - tools/perf/hf_models/benchmark_gpt2.sh | 21 ------ tools/perf/hf_models/benchmark_llama2.sh | 68 ------------------- .../hf_models/benchmark_llama3.1_instruct.sh | 68 ------------------- tools/perf/hf_models/benchmark_llama3.sh | 68 ------------------- tools/perf/hf_models/benchmark_mistral7b.sh | 68 ------------------- tools/perf/stage1.sh | 4 +- tools/perf/stage2.sh | 4 +- 11 files changed, 71 insertions(+), 373 deletions(-) delete mode 100644 examples/dynamo/torch_compile_gpt2.py delete mode 100644 tools/perf/hf_models/benchmark_gpt2.sh delete mode 100644 tools/perf/hf_models/benchmark_llama2.sh delete mode 100644 
tools/perf/hf_models/benchmark_llama3.1_instruct.sh delete mode 100644 tools/perf/hf_models/benchmark_llama3.sh delete mode 100644 tools/perf/hf_models/benchmark_mistral7b.sh diff --git a/examples/dynamo/torch_compile_gpt2.py b/examples/dynamo/torch_compile_gpt2.py deleted file mode 100644 index d4c7767510..0000000000 --- a/examples/dynamo/torch_compile_gpt2.py +++ /dev/null @@ -1,63 +0,0 @@ -import copy -import time - -import numpy as np -import torch -import torch_tensorrt -from transformers import AutoModelForCausalLM, AutoTokenizer -from utils import export_llm, generate - -# Define the parameters -MAX_TOKENS = 32 -DEVICE = torch.device("cuda:0") - -# Define the GPT2 model from hugging face -# kv_cache is not supported in Torch-TRT currently. -# CPU is used here so that GPU memory is reserved for TRT compilation. -with torch.no_grad(): - tokenizer = AutoTokenizer.from_pretrained("gpt2") - model = AutoModelForCausalLM.from_pretrained( - "gpt2", - pad_token_id=tokenizer.eos_token_id, - use_cache=False, - attn_implementation="eager", - ).eval() - -# Input prompt -prompt = "Roses are red, violets are blue" -model_inputs = tokenizer(prompt, return_tensors="pt") -input_ids = model_inputs["input_ids"] - -# Auto-regressive generation loop for greedy search using PyTorch model -pyt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) - -# Compile the model using torch.compile with tensorrt backend and -# mark the input sequence length to be dynamic -torch._dynamo.mark_dynamic(input_ids, 1, min=7, max=1023) -model.forward = torch.compile( - model.forward, - backend="tensorrt", - dynamic=None, - options={ - "enabled_precisions": {torch.float}, - "debug": True, - "disable_tf32": True, - }, -) - -# Auto-regressive generation loop for greedy decoding using TensorRT model -# Move inputs to GPU -input_ids = input_ids.to(DEVICE) -trt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) - -# Decode the sentence -print("=============================") -print( - "Pytorch model generated text: ", - tokenizer.decode(pyt_gen_tokens[0], skip_special_tokens=True), -) -print("=============================") -print( - "TensorRT model generated text: ", - tokenizer.decode(trt_gen_tokens[0], skip_special_tokens=True), -) diff --git a/examples/dynamo/torch_export_gpt2.py b/examples/dynamo/torch_export_gpt2.py index 618dbc15ed..a26305e4a3 100644 --- a/examples/dynamo/torch_export_gpt2.py +++ b/examples/dynamo/torch_export_gpt2.py @@ -1,13 +1,22 @@ -import copy -import time +""" +.. 
_torch_export_gpt2: -import numpy as np +Compiling GPT2 using the Torch-TensorRT with dynamo backend +========================================================== + +This interactive script is intended as a sample of the Torch-TensorRT workflow with dynamo backend on a GPT2 model.""" + +# %% +# Imports and Model Definition +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ import torch import torch_tensorrt from transformers import AutoModelForCausalLM, AutoTokenizer from utils import export_llm, generate -# Define the parameters +# %% + +# Define the parameters and initialize the model MAX_TOKENS = 32 DEVICE = torch.device("cuda:0") @@ -23,14 +32,21 @@ attn_implementation="eager", ).eval() -# Input prompt +# %% +# Tokenize a sample input prompt and get pytorch model outputs prompt = "I enjoy walking with my cute dog" model_inputs = tokenizer(prompt, return_tensors="pt") input_ids = model_inputs["input_ids"] # Auto-regressive generation loop for greedy decoding using PyTorch model +# We use a custom generate function which is very similar to the huggingface one. pyt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) + +# %% +# Compilation with `Torch-TensorRT` using dynamo backend and generate TensorRT outputs +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + # Export the GPT2 model into an ExportedProgram which is input of TRT compilation gpt2_ep = export_llm(model, input_ids, max_seq_len=1024) trt_model = torch_tensorrt.dynamo.compile( @@ -43,11 +59,14 @@ ) # Auto-regressive generation loop for greedy decoding using TensorRT model +# We use a custom generate function which is very similar to the huggingface one. # Move inputs to GPU input_ids = input_ids.to(DEVICE) trt_gen_tokens = generate(trt_model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) -# Decode the sentence +# %% +# Decode the output sentences of PyTorch and TensorRT +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ print("=============================") print( "Pytorch model generated text: ", @@ -58,3 +77,10 @@ "TensorRT model generated text: ", tokenizer.decode(trt_gen_tokens[0], skip_special_tokens=True), ) + +# %% +# The output sentences should look like +# ============================= +# Pytorch model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my +# ============================= +# TensorRT model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my diff --git a/examples/dynamo/torch_export_llama2.py b/examples/dynamo/torch_export_llama2.py index 69a21bee16..195944688b 100644 --- a/examples/dynamo/torch_export_llama2.py +++ b/examples/dynamo/torch_export_llama2.py @@ -1,9 +1,21 @@ +""" +.. 
_torch_export_llama2: + +Compiling Llama2 using the Torch-TensorRT with dynamo backend +========================================================== + +This interactive script is intended as a sample of the Torch-TensorRT workflow with dynamo backend on a Llama2 model.""" + +# %% +# Imports and Model Definition +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ import torch import torch_tensorrt from transformers import AutoModelForCausalLM, AutoTokenizer from utils import export_llm, generate -# Define the parameters +# %% +# Define the parameters and initialize the model MAX_TOKENS = 32 DEVICE = torch.device("cuda:0") @@ -17,13 +29,21 @@ ).eval() tokenizer = AutoTokenizer.from_pretrained(llama_path) + +# %% +# Tokenize a sample input prompt and get pytorch model outputs prompt = "What is dynamic programming?" -model_inputs = tokenizer(base_prompt, return_tensors="pt") +model_inputs = tokenizer(prompt, return_tensors="pt") input_ids = model_inputs.input_ids -# Auto-regressive generation loop for greedy search using PyTorch model +# Auto-regressive generation loop for greedy decoding using PyTorch model +# We use a custom generate function which is very similar to the huggingface one. pyt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) +# %% +# Compilation with `Torch-TensorRT` using dynamo backend and generate TensorRT outputs +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + # Export the llama2 model into an ExportedProgram which is input of TRT compilation llama2_ep = export_llm(model, input_ids, max_seq_len=64) trt_model = torch_tensorrt.dynamo.compile( @@ -36,12 +56,15 @@ disable_tf32=True, ) -# Auto-regressive generation loop for greedy decoding +# Auto-regressive generation loop for greedy decoding using TensorRT model +# We use a custom generate function which is very similar to the huggingface one. # Move inputs to GPU input_ids = input_ids.to(DEVICE) trt_gen_tokens = generate(trt_model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) -# Decode the sentence +# %% +# Decode the output sentences of PyTorch and TensorRT +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ print("=============================") print( "Pytorch model generated text: ", @@ -58,3 +81,10 @@ clean_up_tokenization_spaces=False, )[0], ) + +# %% +# The output sentences should look like +# ============================= +# Pytorch model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my +# ============================= +# TensorRT model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. 
I'm not sure if I'll ever be able to walk with my diff --git a/py/torch_tensorrt/dynamo/lowering/passes/remove_assert_scalar.py b/py/torch_tensorrt/dynamo/lowering/passes/remove_assert_scalar.py index 5a82f7e711..ee468145f6 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/remove_assert_scalar.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/remove_assert_scalar.py @@ -12,9 +12,7 @@ def remove_assert_scalar(gm: torch.fx.GraphModule) -> torch.fx.GraphModule: """Remove assert_scalar ops in the graph""" count = 0 for node in gm.graph.nodes: - # node.target = "detach" in torch.compile workflow if node.target == torch.ops.aten._assert_scalar.default: - # Detach node has only one input gm.graph.erase_node(node) count += 1 diff --git a/tools/perf/hf_models/benchmark_gpt2.sh b/tools/perf/hf_models/benchmark_gpt2.sh deleted file mode 100644 index 83d08ab5d1..0000000000 --- a/tools/perf/hf_models/benchmark_gpt2.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -# Usage : bash run_hf_model.sh - -# GPT2 model torch backend -bash run_hf_model.sh 1 "torch" "gpt2" 128 256 "fp16" 1 -bash run_hf_model.sh 16 "torch" "gpt2" 128 256 "fp16" 1 -bash run_hf_model.sh 64 "torch" "gpt2" 128 256 "fp16" 1 -bash run_hf_model.sh 256 "torch" "gpt2" 128 256 "fp16" 1 - -# GPT2 model dynamo backend -bash run_hf_model.sh 1 "dynamo" "gpt2" 128 256 "fp16" 1 -bash run_hf_model.sh 16 "dynamo" "gpt2" 128 256 "fp16" 1 -bash run_hf_model.sh 64 "dynamo" "gpt2" 128 256 "fp16" 1 -bash run_hf_model.sh 256 "dynamo" "gpt2" 128 256 "fp16" 1 - -# GPT2 model inductor backend -bash run_hf_model.sh 1 "inductor" "gpt2" 128 256 "fp16" 1 -bash run_hf_model.sh 16 "inductor" "gpt2" 128 256 "fp16" 1 -bash run_hf_model.sh 64 "inductor" "gpt2" 128 256 "fp16" 1 -bash run_hf_model.sh 256 "inductor" "gpt2" 128 256 "fp16" 1 - diff --git a/tools/perf/hf_models/benchmark_llama2.sh b/tools/perf/hf_models/benchmark_llama2.sh deleted file mode 100644 index 9138c63818..0000000000 --- a/tools/perf/hf_models/benchmark_llama2.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/bin/bash -# Usage : bash run_hf_model.sh - -# "meta-llama/Llama-2-7b-chat-hf" model torch backend -# isl, osl = 128, 256 -bash run_hf_model.sh 1 "torch" "meta-llama/Llama-2-7b-chat-hf" 128 256 "fp16" 1 -bash run_hf_model.sh 16 "torch" "meta-llama/Llama-2-7b-chat-hf" 128 256 "fp16" 1 -bash run_hf_model.sh 64 "torch" "meta-llama/Llama-2-7b-chat-hf" 128 256 "fp16" 1 -bash run_hf_model.sh 256 "torch" "meta-llama/Llama-2-7b-chat-hf" 128 256 "fp16" 1 -# isl, osl = 128, 2176 -bash run_hf_model.sh 1 "torch" "meta-llama/Llama-2-7b-chat-hf" 128 2176 "fp16" 1 -bash run_hf_model.sh 16 "torch" "meta-llama/Llama-2-7b-chat-hf" 128 2176 "fp16" 1 -bash run_hf_model.sh 64 "torch" "meta-llama/Llama-2-7b-chat-hf" 128 2176 "fp16" 1 -bash run_hf_model.sh 256 "torch" "meta-llama/Llama-2-7b-chat-hf" 128 2176 "fp16" 1 -# isl, osl = 2048, 2176 -bash run_hf_model.sh 1 "torch" "meta-llama/Llama-2-7b-chat-hf" 2048 2176 "fp16" 1 -bash run_hf_model.sh 16 "torch" "meta-llama/Llama-2-7b-chat-hf" 2048 2176 "fp16" 1 -bash run_hf_model.sh 64 "torch" "meta-llama/Llama-2-7b-chat-hf" 2048 2176 "fp16" 1 -bash run_hf_model.sh 256 "torch" "meta-llama/Llama-2-7b-chat-hf" 2048 2176 "fp16" 1 -# isl, osl = 2048, 4096 -bash run_hf_model.sh 1 "torch" "meta-llama/Llama-2-7b-chat-hf" 2048 4096 "fp16" 1 -bash run_hf_model.sh 16 "torch" "meta-llama/Llama-2-7b-chat-hf" 2048 4096 "fp16" 1 -bash run_hf_model.sh 64 "torch" "meta-llama/Llama-2-7b-chat-hf" 2048 4096 "fp16" 1 -bash run_hf_model.sh 256 "torch" "meta-llama/Llama-2-7b-chat-hf" 2048 4096 "fp16" 1 
- -# "meta-llama/Llama-2-7b-chat-hf" model dynamo backend -# isl, osl = 128, 256 -bash run_hf_model.sh 1 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 128 256 "fp16" 1 -bash run_hf_model.sh 16 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 128 256 "fp16" 1 -bash run_hf_model.sh 64 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 128 256 "fp16" 1 -bash run_hf_model.sh 256 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 128 256 "fp16" 1 -# isl, osl = 128, 2176 -bash run_hf_model.sh 1 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 128 2176 "fp16" 1 -bash run_hf_model.sh 16 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 128 2176 "fp16" 1 -bash run_hf_model.sh 64 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 128 2176 "fp16" 1 -bash run_hf_model.sh 256 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 128 2176 "fp16" 1 -# isl, osl = 2048, 2176 -bash run_hf_model.sh 1 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 2048 2176 "fp16" 1 -bash run_hf_model.sh 16 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 2048 2176 "fp16" 1 -bash run_hf_model.sh 64 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 2048 2176 "fp16" 1 -bash run_hf_model.sh 256 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 2048 2176 "fp16" 1 -# isl, osl = 2048, 4096 -bash run_hf_model.sh 1 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 2048 4096 "fp16" 1 -bash run_hf_model.sh 16 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 2048 4096 "fp16" 1 -bash run_hf_model.sh 64 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 2048 4096 "fp16" 1 -bash run_hf_model.sh 256 "dynamo" "meta-llama/Llama-2-7b-chat-hf" 2048 4096 "fp16" 1 - -# "meta-llama/Llama-2-7b-chat-hf" model inductor backend -# isl, osl = 128, 256 -bash run_hf_model.sh 1 "inductor" "meta-llama/Llama-2-7b-chat-hf" 128 256 "fp16" 1 -bash run_hf_model.sh 16 "inductor" "meta-llama/Llama-2-7b-chat-hf" 128 256 "fp16" 1 -bash run_hf_model.sh 64 "inductor" "meta-llama/Llama-2-7b-chat-hf" 128 256 "fp16" 1 -bash run_hf_model.sh 256 "inductor" "meta-llama/Llama-2-7b-chat-hf" 128 256 "fp16" 1 -# isl, osl = 128, 2176 -bash run_hf_model.sh 1 "inductor" "meta-llama/Llama-2-7b-chat-hf" 128 2176 "fp16" 1 -bash run_hf_model.sh 16 "inductor" "meta-llama/Llama-2-7b-chat-hf" 128 2176 "fp16" 1 -bash run_hf_model.sh 64 "inductor" "meta-llama/Llama-2-7b-chat-hf" 128 2176 "fp16" 1 -bash run_hf_model.sh 256 "inductor" "meta-llama/Llama-2-7b-chat-hf" 128 2176 "fp16" 1 -# isl, osl = 2048, 2176 -bash run_hf_model.sh 1 "inductor" "meta-llama/Llama-2-7b-chat-hf" 2048 2176 "fp16" 1 -bash run_hf_model.sh 16 "inductor" "meta-llama/Llama-2-7b-chat-hf" 2048 2176 "fp16" 1 -bash run_hf_model.sh 64 "inductor" "meta-llama/Llama-2-7b-chat-hf" 2048 2176 "fp16" 1 -bash run_hf_model.sh 256 "inductor" "meta-llama/Llama-2-7b-chat-hf" 2048 2176 "fp16" 1 -# isl, osl = 2048, 4096 -bash run_hf_model.sh 1 "inductor" "meta-llama/Llama-2-7b-chat-hf" 2048 4096 "fp16" 1 -bash run_hf_model.sh 16 "inductor" "meta-llama/Llama-2-7b-chat-hf" 2048 4096 "fp16" 1 -bash run_hf_model.sh 64 "inductor" "meta-llama/Llama-2-7b-chat-hf" 2048 4096 "fp16" 1 -bash run_hf_model.sh 256 "inductor" "meta-llama/Llama-2-7b-chat-hf" 2048 4096 "fp16" 1 \ No newline at end of file diff --git a/tools/perf/hf_models/benchmark_llama3.1_instruct.sh b/tools/perf/hf_models/benchmark_llama3.1_instruct.sh deleted file mode 100644 index d6f0bca8ab..0000000000 --- a/tools/perf/hf_models/benchmark_llama3.1_instruct.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/bin/bash -# Usage : bash run_hf_model.sh - -# "meta-llama/Meta-Llama-3.1-8B-Instruct" model torch backend -# isl, osl = 128, 256 -bash run_hf_model.sh 1 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 256 "fp16" 1 
-bash run_hf_model.sh 16 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 256 "fp16" 1 -bash run_hf_model.sh 64 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 256 "fp16" 1 -bash run_hf_model.sh 256 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 256 "fp16" 1 -# isl, osl = 128, 2176 -bash run_hf_model.sh 1 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 2176 "fp16" 1 -bash run_hf_model.sh 16 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 2176 "fp16" 1 -bash run_hf_model.sh 64 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 2176 "fp16" 1 -bash run_hf_model.sh 256 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 2176 "fp16" 1 -# isl, osl = 2048, 2176 -bash run_hf_model.sh 1 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 2176 "fp16" 1 -bash run_hf_model.sh 16 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 2176 "fp16" 1 -bash run_hf_model.sh 64 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 2176 "fp16" 1 -bash run_hf_model.sh 256 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 2176 "fp16" 1 -# isl, osl = 2048, 4096 -bash run_hf_model.sh 1 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 4096 "fp16" 1 -bash run_hf_model.sh 16 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 4096 "fp16" 1 -bash run_hf_model.sh 64 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 4096 "fp16" 1 -bash run_hf_model.sh 256 "torch" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 4096 "fp16" 1 - -# "meta-llama/Meta-Llama-3.1-8B-Instruct" model dynamo backend -# isl, osl = 128, 256 -bash run_hf_model.sh 1 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 256 "fp16" 1 -bash run_hf_model.sh 16 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 256 "fp16" 1 -bash run_hf_model.sh 64 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 256 "fp16" 1 -bash run_hf_model.sh 256 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 256 "fp16" 1 -# isl, osl = 128, 2176 -bash run_hf_model.sh 1 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 2176 "fp16" 1 -bash run_hf_model.sh 16 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 2176 "fp16" 1 -bash run_hf_model.sh 64 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 2176 "fp16" 1 -bash run_hf_model.sh 256 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 2176 "fp16" 1 -# isl, osl = 2048, 2176 -bash run_hf_model.sh 1 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 2176 "fp16" 1 -bash run_hf_model.sh 16 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 2176 "fp16" 1 -bash run_hf_model.sh 64 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 2176 "fp16" 1 -bash run_hf_model.sh 256 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 2176 "fp16" 1 -# isl, osl = 2048, 4096 -bash run_hf_model.sh 1 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 4096 "fp16" 1 -bash run_hf_model.sh 16 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 4096 "fp16" 1 -bash run_hf_model.sh 64 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 4096 "fp16" 1 -bash run_hf_model.sh 256 "dynamo" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 4096 "fp16" 1 - -# "meta-llama/Meta-Llama-3.1-8B-Instruct" model inductor backend -# isl, osl = 128, 256 -bash run_hf_model.sh 1 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 256 "fp16" 1 -bash run_hf_model.sh 16 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 256 "fp16" 1 -bash run_hf_model.sh 64 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 256 "fp16" 1 -bash run_hf_model.sh 256 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 256 "fp16" 1 -# isl, 
osl = 128, 2176 -bash run_hf_model.sh 1 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 2176 "fp16" 1 -bash run_hf_model.sh 16 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 2176 "fp16" 1 -bash run_hf_model.sh 64 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 2176 "fp16" 1 -bash run_hf_model.sh 256 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 128 2176 "fp16" 1 -# isl, osl = 2048, 2176 -bash run_hf_model.sh 1 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 2176 "fp16" 1 -bash run_hf_model.sh 16 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 2176 "fp16" 1 -bash run_hf_model.sh 64 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 2176 "fp16" 1 -bash run_hf_model.sh 256 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 2176 "fp16" 1 -# isl, osl = 2048, 4096 -bash run_hf_model.sh 1 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 4096 "fp16" 1 -bash run_hf_model.sh 16 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 4096 "fp16" 1 -bash run_hf_model.sh 64 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 4096 "fp16" 1 -bash run_hf_model.sh 256 "inductor" "meta-llama/Meta-Llama-3.1-8B-Instruct" 2048 4096 "fp16" 1 \ No newline at end of file diff --git a/tools/perf/hf_models/benchmark_llama3.sh b/tools/perf/hf_models/benchmark_llama3.sh deleted file mode 100644 index a0c48c3c51..0000000000 --- a/tools/perf/hf_models/benchmark_llama3.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/bin/bash -# Usage : bash run_hf_model.sh - -# "meta-llama/Meta-Llama-3-8B" model torch backend -# isl, osl = 128, 256 -bash run_hf_model.sh 1 "torch" "meta-llama/Meta-Llama-3-8B" 128 256 "fp16" 1 -bash run_hf_model.sh 16 "torch" "meta-llama/Meta-Llama-3-8B" 128 256 "fp16" 1 -bash run_hf_model.sh 64 "torch" "meta-llama/Meta-Llama-3-8B" 128 256 "fp16" 1 -bash run_hf_model.sh 256 "torch" "meta-llama/Meta-Llama-3-8B" 128 256 "fp16" 1 -# isl, osl = 128, 2176 -bash run_hf_model.sh 1 "torch" "meta-llama/Meta-Llama-3-8B" 128 2176 "fp16" 1 -bash run_hf_model.sh 16 "torch" "meta-llama/Meta-Llama-3-8B" 128 2176 "fp16" 1 -bash run_hf_model.sh 64 "torch" "meta-llama/Meta-Llama-3-8B" 128 2176 "fp16" 1 -bash run_hf_model.sh 256 "torch" "meta-llama/Meta-Llama-3-8B" 128 2176 "fp16" 1 -# isl, osl = 2048, 2176 -bash run_hf_model.sh 1 "torch" "meta-llama/Meta-Llama-3-8B" 2048 2176 "fp16" 1 -bash run_hf_model.sh 16 "torch" "meta-llama/Meta-Llama-3-8B" 2048 2176 "fp16" 1 -bash run_hf_model.sh 64 "torch" "meta-llama/Meta-Llama-3-8B" 2048 2176 "fp16" 1 -bash run_hf_model.sh 256 "torch" "meta-llama/Meta-Llama-3-8B" 2048 2176 "fp16" 1 -# isl, osl = 2048, 4096 -bash run_hf_model.sh 1 "torch" "meta-llama/Meta-Llama-3-8B" 2048 4096 "fp16" 1 -bash run_hf_model.sh 16 "torch" "meta-llama/Meta-Llama-3-8B" 2048 4096 "fp16" 1 -bash run_hf_model.sh 64 "torch" "meta-llama/Meta-Llama-3-8B" 2048 4096 "fp16" 1 -bash run_hf_model.sh 256 "torch" "meta-llama/Meta-Llama-3-8B" 2048 4096 "fp16" 1 - -# "meta-llama/Meta-Llama-3-8B" model dynamo backend -# isl, osl = 128, 256 -bash run_hf_model.sh 1 "dynamo" "meta-llama/Meta-Llama-3-8B" 128 256 "fp16" 1 -bash run_hf_model.sh 16 "dynamo" "meta-llama/Meta-Llama-3-8B" 128 256 "fp16" 1 -bash run_hf_model.sh 64 "dynamo" "meta-llama/Meta-Llama-3-8B" 128 256 "fp16" 1 -bash run_hf_model.sh 256 "dynamo" "meta-llama/Meta-Llama-3-8B" 128 256 "fp16" 1 -# isl, osl = 128, 2176 -bash run_hf_model.sh 1 "dynamo" "meta-llama/Meta-Llama-3-8B" 128 2176 "fp16" 1 -bash run_hf_model.sh 16 "dynamo" "meta-llama/Meta-Llama-3-8B" 128 2176 "fp16" 1 -bash run_hf_model.sh 64 
"dynamo" "meta-llama/Meta-Llama-3-8B" 128 2176 "fp16" 1 -bash run_hf_model.sh 256 "dynamo" "meta-llama/Meta-Llama-3-8B" 128 2176 "fp16" 1 -# isl, osl = 2048, 2176 -bash run_hf_model.sh 1 "dynamo" "meta-llama/Meta-Llama-3-8B" 2048 2176 "fp16" 1 -bash run_hf_model.sh 16 "dynamo" "meta-llama/Meta-Llama-3-8B" 2048 2176 "fp16" 1 -bash run_hf_model.sh 64 "dynamo" "meta-llama/Meta-Llama-3-8B" 2048 2176 "fp16" 1 -bash run_hf_model.sh 256 "dynamo" "meta-llama/Meta-Llama-3-8B" 2048 2176 "fp16" 1 -# isl, osl = 2048, 4096 -bash run_hf_model.sh 1 "dynamo" "meta-llama/Meta-Llama-3-8B" 2048 4096 "fp16" 1 -bash run_hf_model.sh 16 "dynamo" "meta-llama/Meta-Llama-3-8B" 2048 4096 "fp16" 1 -bash run_hf_model.sh 64 "dynamo" "meta-llama/Meta-Llama-3-8B" 2048 4096 "fp16" 1 -bash run_hf_model.sh 256 "dynamo" "meta-llama/Meta-Llama-3-8B" 2048 4096 "fp16" 1 - -# "meta-llama/Meta-Llama-3-8B" model inductor backend -# isl, osl = 128, 256 -bash run_hf_model.sh 1 "inductor" "meta-llama/Meta-Llama-3-8B" 128 256 "fp16" 1 -bash run_hf_model.sh 16 "inductor" "meta-llama/Meta-Llama-3-8B" 128 256 "fp16" 1 -bash run_hf_model.sh 64 "inductor" "meta-llama/Meta-Llama-3-8B" 128 256 "fp16" 1 -bash run_hf_model.sh 256 "inductor" "meta-llama/Meta-Llama-3-8B" 128 256 "fp16" 1 -# isl, osl = 128, 2176 -bash run_hf_model.sh 1 "inductor" "meta-llama/Meta-Llama-3-8B" 128 2176 "fp16" 1 -bash run_hf_model.sh 16 "inductor" "meta-llama/Meta-Llama-3-8B" 128 2176 "fp16" 1 -bash run_hf_model.sh 64 "inductor" "meta-llama/Meta-Llama-3-8B" 128 2176 "fp16" 1 -bash run_hf_model.sh 256 "inductor" "meta-llama/Meta-Llama-3-8B" 128 2176 "fp16" 1 -# isl, osl = 2048, 2176 -bash run_hf_model.sh 1 "inductor" "meta-llama/Meta-Llama-3-8B" 2048 2176 "fp16" 1 -bash run_hf_model.sh 16 "inductor" "meta-llama/Meta-Llama-3-8B" 2048 2176 "fp16" 1 -bash run_hf_model.sh 64 "inductor" "meta-llama/Meta-Llama-3-8B" 2048 2176 "fp16" 1 -bash run_hf_model.sh 256 "inductor" "meta-llama/Meta-Llama-3-8B" 2048 2176 "fp16" 1 -# isl, osl = 2048, 4096 -bash run_hf_model.sh 1 "inductor" "meta-llama/Meta-Llama-3-8B" 2048 4096 "fp16" 1 -bash run_hf_model.sh 16 "inductor" "meta-llama/Meta-Llama-3-8B" 2048 4096 "fp16" 1 -bash run_hf_model.sh 64 "inductor" "meta-llama/Meta-Llama-3-8B" 2048 4096 "fp16" 1 -bash run_hf_model.sh 256 "inductor" "meta-llama/Meta-Llama-3-8B" 2048 4096 "fp16" 1 \ No newline at end of file diff --git a/tools/perf/hf_models/benchmark_mistral7b.sh b/tools/perf/hf_models/benchmark_mistral7b.sh deleted file mode 100644 index 7d1e71cba4..0000000000 --- a/tools/perf/hf_models/benchmark_mistral7b.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/bin/bash -# Usage : bash run_hf_model.sh - -# "mistralai/Mistral-7B-Instruct-v0.3" model torch backend -# isl, osl = 128, 256 -bash run_hf_model.sh 1 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 128 256 "fp16" 1 -bash run_hf_model.sh 16 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 128 256 "fp16" 1 -bash run_hf_model.sh 64 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 128 256 "fp16" 1 -bash run_hf_model.sh 256 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 128 256 "fp16" 1 -# isl, osl = 128, 2176 -bash run_hf_model.sh 1 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 128 2176 "fp16" 1 -bash run_hf_model.sh 16 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 128 2176 "fp16" 1 -bash run_hf_model.sh 64 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 128 2176 "fp16" 1 -bash run_hf_model.sh 256 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 128 2176 "fp16" 1 -# isl, osl = 2048, 2176 -bash run_hf_model.sh 1 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 2048 2176 
"fp16" 1 -bash run_hf_model.sh 16 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 2048 2176 "fp16" 1 -bash run_hf_model.sh 64 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 2048 2176 "fp16" 1 -bash run_hf_model.sh 256 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 2048 2176 "fp16" 1 -# isl, osl = 2048, 4096 -bash run_hf_model.sh 1 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 2048 4096 "fp16" 1 -bash run_hf_model.sh 16 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 2048 4096 "fp16" 1 -bash run_hf_model.sh 64 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 2048 4096 "fp16" 1 -bash run_hf_model.sh 256 "torch" "mistralai/Mistral-7B-Instruct-v0.3" 2048 4096 "fp16" 1 - -# "mistralai/Mistral-7B-Instruct-v0.3" model dynamo backend -# isl, osl = 128, 256 -bash run_hf_model.sh 1 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 128 256 "fp16" 1 -bash run_hf_model.sh 16 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 128 256 "fp16" 1 -bash run_hf_model.sh 64 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 128 256 "fp16" 1 -bash run_hf_model.sh 256 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 128 256 "fp16" 1 -# isl, osl = 128, 2176 -bash run_hf_model.sh 1 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 128 2176 "fp16" 1 -bash run_hf_model.sh 16 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 128 2176 "fp16" 1 -bash run_hf_model.sh 64 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 128 2176 "fp16" 1 -bash run_hf_model.sh 256 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 128 2176 "fp16" 1 -# isl, osl = 2048, 2176 -bash run_hf_model.sh 1 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 2048 2176 "fp16" 1 -bash run_hf_model.sh 16 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 2048 2176 "fp16" 1 -bash run_hf_model.sh 64 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 2048 2176 "fp16" 1 -bash run_hf_model.sh 256 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 2048 2176 "fp16" 1 -# isl, osl = 2048, 4096 -bash run_hf_model.sh 1 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 2048 4096 "fp16" 1 -bash run_hf_model.sh 16 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 2048 4096 "fp16" 1 -bash run_hf_model.sh 64 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 2048 4096 "fp16" 1 -bash run_hf_model.sh 256 "dynamo" "mistralai/Mistral-7B-Instruct-v0.3" 2048 4096 "fp16" 1 - -# "mistralai/Mistral-7B-Instruct-v0.3" model inductor backend -# isl, osl = 128, 256 -bash run_hf_model.sh 1 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 128 256 "fp16" 1 -bash run_hf_model.sh 16 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 128 256 "fp16" 1 -bash run_hf_model.sh 64 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 128 256 "fp16" 1 -bash run_hf_model.sh 256 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 128 256 "fp16" 1 -# isl, osl = 128, 2176 -bash run_hf_model.sh 1 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 128 2176 "fp16" 1 -bash run_hf_model.sh 16 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 128 2176 "fp16" 1 -bash run_hf_model.sh 64 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 128 2176 "fp16" 1 -bash run_hf_model.sh 256 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 128 2176 "fp16" 1 -# isl, osl = 2048, 2176 -bash run_hf_model.sh 1 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 2048 2176 "fp16" 1 -bash run_hf_model.sh 16 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 2048 2176 "fp16" 1 -bash run_hf_model.sh 64 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 2048 2176 "fp16" 1 -bash run_hf_model.sh 256 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 2048 2176 "fp16" 1 -# isl, osl = 2048, 4096 -bash run_hf_model.sh 1 "inductor" 
"mistralai/Mistral-7B-Instruct-v0.3" 2048 4096 "fp16" 1 -bash run_hf_model.sh 16 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 2048 4096 "fp16" 1 -bash run_hf_model.sh 64 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 2048 4096 "fp16" 1 -bash run_hf_model.sh 256 "inductor" "mistralai/Mistral-7B-Instruct-v0.3" 2048 4096 "fp16" 1 \ No newline at end of file diff --git a/tools/perf/stage1.sh b/tools/perf/stage1.sh index 3cdf292a4a..412396ee0b 100644 --- a/tools/perf/stage1.sh +++ b/tools/perf/stage1.sh @@ -5,7 +5,7 @@ iterations=1 backends=("dynamo" "inductor") batch_sizes=(1 16) hf_token="" -image_name="gitlab-master.nvidia.com:5005/torch-tensorrt/tensorrt/torch_tensorrt:latest" +image_name="" # Stage 1 : GPT2 experiment models=("gpt2") @@ -29,7 +29,7 @@ rm -rf ~/.cache/huggingface/hub/ # Stage 2 : non-GPT2 experiments isl=(128 128) -osl=(256 2176) +osl=(256 2176) models=("meta-llama/Meta-Llama-3.1-8B-Instruct" "meta-llama/Llama-2-7b-chat-hf" "mistralai/Mistral-7B-Instruct-v0.3") backends=("dynamo" "inductor") for model in ${models[@]} diff --git a/tools/perf/stage2.sh b/tools/perf/stage2.sh index 8ede056ff2..9411cd9d09 100644 --- a/tools/perf/stage2.sh +++ b/tools/perf/stage2.sh @@ -5,11 +5,11 @@ iterations=1 backends=("dynamo" "inductor") batch_sizes=(1 16) hf_token="" -image_name="gitlab-master.nvidia.com:5005/torch-tensorrt/tensorrt/torch_tensorrt:latest" +image_name="" # Stage 2 : slower non-GPT2 experiments isl=(2048) -osl=(2176) +osl=(2176) models=("meta-llama/Meta-Llama-3.1-8B-Instruct" "meta-llama/Llama-2-7b-chat-hf" "mistralai/Mistral-7B-Instruct-v0.3") backends=("dynamo" "inductor") for model in ${models[@]} From 0ab0dbf62582da96c2308cd9aba207c1a747f3e5 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Wed, 21 Aug 2024 14:12:47 -0700 Subject: [PATCH 39/55] feat: add fp32 accumulation option for matmul layer --- examples/dynamo/torch_export_gpt2.py | 35 ++++++++----- py/torch_tensorrt/dynamo/_compiler.py | 14 ++++- py/torch_tensorrt/dynamo/_defaults.py | 2 + py/torch_tensorrt/dynamo/_settings.py | 6 +++ .../dynamo/conversion/_TRTInterpreter.py | 13 ++--- .../lowering/passes/_aten_lowering_pass.py | 11 ++-- .../lowering/passes/accumulate_fp32_matmul.py | 51 +++++++++++++++++++ 7 files changed, 108 insertions(+), 24 deletions(-) create mode 100644 py/torch_tensorrt/dynamo/lowering/passes/accumulate_fp32_matmul.py diff --git a/examples/dynamo/torch_export_gpt2.py b/examples/dynamo/torch_export_gpt2.py index a26305e4a3..f30c8dc3e7 100644 --- a/examples/dynamo/torch_export_gpt2.py +++ b/examples/dynamo/torch_export_gpt2.py @@ -25,12 +25,16 @@ # CPU is used here so that GPU memory is reserved for TRT compilation. 
with torch.no_grad(): tokenizer = AutoTokenizer.from_pretrained("gpt2") - model = AutoModelForCausalLM.from_pretrained( - "gpt2", - pad_token_id=tokenizer.eos_token_id, - use_cache=False, - attn_implementation="eager", - ).eval() + model = ( + AutoModelForCausalLM.from_pretrained( + "gpt2", + pad_token_id=tokenizer.eos_token_id, + use_cache=False, + attn_implementation="eager", + ) + .eval() + .half() + ) # %% # Tokenize a sample input prompt and get pytorch model outputs @@ -49,14 +53,17 @@ # Export the GPT2 model into an ExportedProgram which is input of TRT compilation gpt2_ep = export_llm(model, input_ids, max_seq_len=1024) -trt_model = torch_tensorrt.dynamo.compile( - gpt2_ep, - inputs=[input_ids], - enabled_precisions={torch.float32}, - truncate_double=True, - device=DEVICE, - disable_tf32=True, -) +with torch_tensorrt.logging.debug(): + trt_model = torch_tensorrt.dynamo.compile( + gpt2_ep, + inputs=[input_ids], + enabled_precisions={torch.float16}, + truncate_double=True, + device=DEVICE, + disable_tf32=True, + use_strong_types=False, + use_fp32_acc=True, + ) # Auto-regressive generation loop for greedy decoding using TensorRT model # We use a custom generate function which is very similar to the huggingface one. diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index d2eb95f354..8d874c69e1 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -82,6 +82,8 @@ def compile( hardware_compatible: bool = _defaults.HARDWARE_COMPATIBLE, timing_cache_path: str = _defaults.TIMING_CACHE_PATH, lazy_engine_init: bool = _defaults.LAZY_ENGINE_INIT, + use_strong_types: bool = _defaults.USE_STRONG_TYPES, + use_fp32_acc: bool = _defaults.USE_FP32_ACC, **kwargs: Any, ) -> torch.fx.GraphModule: """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT @@ -147,6 +149,8 @@ def compile( hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer) timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation lazy_engine_init (bool): Defer setting up engines until the compilation of all engines is complete. Can allow larger models with multiple graph breaks to compile but can lead to oversubscription of GPU memory at runtime. + use_strong_types (bool): Enable strong typing in TensorRT compilation + use_fp32_acc (bool): This option inserts cast to FP32 nodes around matmul layers and TensorRT ensures the accumulation of matmul happens in FP32. Use this only when FP16 precision is configured in enabled_precisions. 
**kwargs: Any, Returns: torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT @@ -221,7 +225,7 @@ def compile( logger.debug("Input graph: " + str(gm.graph)) # Apply lowering on the graph module - gm = post_lowering(gm) + gm = post_lowering(gm, use_fp32_acc=use_fp32_acc) logger.debug("Lowered Input graph: " + str(gm.graph)) compilation_options = { @@ -257,6 +261,8 @@ def compile( "hardware_compatible": hardware_compatible, "timing_cache_path": timing_cache_path, "lazy_engine_init": lazy_engine_init, + "use_strong_types": use_strong_types, + "use_fp32_acc": use_fp32_acc, } settings = CompilationSettings(**compilation_options) @@ -493,6 +499,8 @@ def convert_exported_program_to_serialized_trt_engine( calibrator: object = None, allow_shape_tensors: bool = False, timing_cache_path: str = _defaults.TIMING_CACHE_PATH, + use_strong_types: bool = _defaults.USE_STRONG_TYPES, + use_fp32_acc: bool = _defaults.USE_FP32_ACC, **kwargs: Any, ) -> bytes: """Convert an ExportedProgram to a serialized TensorRT engine @@ -551,6 +559,8 @@ def convert_exported_program_to_serialized_trt_engine( calibrator (Union(torch_tensorrt._C.IInt8Calibrator, tensorrt.IInt8Calibrator)): Calibrator object which will provide data to the PTQ system for INT8 Calibration allow_shape_tensors: (Experimental) Allow aten::size to output shape tensors using IShapeLayer in TensorRT timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation + use_strong_types (bool): Enable strong typing in TensorRT compilation + use_fp32_acc (bool): This option inserts cast to FP32 nodes around matmul layers and TensorRT ensures the accumulation of matmul happens in FP32. Use this only when FP16 precision is configured in enabled_precisions. 
Returns: bytes: Serialized TensorRT engine, can either be saved to a file or deserialized via TensorRT APIs """ @@ -624,6 +634,8 @@ def convert_exported_program_to_serialized_trt_engine( "dla_local_dram_size": dla_local_dram_size, "dla_global_dram_size": dla_global_dram_size, "timing_cache_path": timing_cache_path, + "use_strong_types": use_strong_types, + "use_fp32_acc": use_fp32_acc, } exported_program = pre_export_lowering(exported_program) diff --git a/py/torch_tensorrt/dynamo/_defaults.py b/py/torch_tensorrt/dynamo/_defaults.py index 2696e26936..5917c9e6dd 100644 --- a/py/torch_tensorrt/dynamo/_defaults.py +++ b/py/torch_tensorrt/dynamo/_defaults.py @@ -33,6 +33,8 @@ SUPPORTED_KERNEL_PRECISIONS = {dtype.f32, dtype.f16, dtype.bf16, dtype.i8, dtype.f8} TIMING_CACHE_PATH = os.path.join(tempfile.gettempdir(), "timing_cache.bin") LAZY_ENGINE_INIT = False +USE_STRONG_TYPES = False +USE_FP32_ACC = False def default_device() -> Device: diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py index 4a9792d3dc..7617462322 100644 --- a/py/torch_tensorrt/dynamo/_settings.py +++ b/py/torch_tensorrt/dynamo/_settings.py @@ -28,7 +28,9 @@ TIMING_CACHE_PATH, TRUNCATE_DOUBLE, USE_FAST_PARTITIONER, + USE_FP32_ACC, USE_PYTHON_RUNTIME, + USE_STRONG_TYPES, VERSION_COMPATIBLE, WORKSPACE_SIZE, default_device, @@ -74,6 +76,8 @@ class CompilationSettings: output to a file if a string path is specified hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer) timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation + use_strong_types (bool): Enable strong typing in TensorRT compilation + use_fp32_acc (bool): This option inserts cast to FP32 nodes around matmul layers and TensorRT ensures the accumulation of matmul happens in FP32. Use this only when FP16 precision is configured in enabled_precisions. 
""" enabled_precisions: Set[dtype] = field(default_factory=lambda: ENABLED_PRECISIONS) @@ -106,3 +110,5 @@ class CompilationSettings: hardware_compatible: bool = HARDWARE_COMPATIBLE timing_cache_path: str = TIMING_CACHE_PATH lazy_engine_init: bool = LAZY_ENGINE_INIT + use_strong_types: bool = USE_STRONG_TYPES + use_fp32_acc: bool = USE_FP32_ACC diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index 17437ceb6e..0526d6d9ae 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -6,6 +6,7 @@ from typing import Any, Callable, Dict, List, NamedTuple, Optional, Sequence, Set, Tuple import numpy as np +import tensorrt as trt import torch import torch.fx from torch.fx.node import _get_qualified_name @@ -30,7 +31,6 @@ from torch_tensorrt.fx.observer import Observer from torch_tensorrt.logging import TRT_LOGGER -import tensorrt as trt from packaging import version _LOGGER: logging.Logger = logging.getLogger(__name__) @@ -66,10 +66,11 @@ def __init__( self.builder = trt.Builder(self.logger) flag = 0 - - # It is deprecated to not use this flag - EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) - flag |= EXPLICIT_BATCH + if compilation_settings.use_strong_types: + STRONGLY_TYPED = 1 << (int)( + trt.NetworkDefinitionCreationFlag.STRONGLY_TYPED + ) + flag |= STRONGLY_TYPED self.ctx = ConversionContext( self.builder.create_network(flag), compilation_settings @@ -103,8 +104,8 @@ def __init__( self._itensor_to_tensor_meta: Dict[trt.tensorrt.ITensor, TensorMetadata] = ( dict() ) - self.compilation_settings = compilation_settings + self.compilation_settings = compilation_settings # Data types for TRT Module output Tensors self.output_dtypes = ( [dtype._from(o) for o in output_dtypes] if output_dtypes else None diff --git a/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py b/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py index b7c65f1880..b6435c0d8c 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py @@ -1,8 +1,9 @@ import logging -from typing import Callable, Optional, Sequence, Union +from typing import Any, Callable, Optional, Sequence, Union import torch +from .accumulate_fp32_matmul import accumulate_fp32_matmul from .constant_folding import constant_fold from .fuse_prims_broadcast import fuse_prims_broadcast from .lower_linear import lower_linear @@ -90,12 +91,16 @@ def _remove_lowering_pass(*, index: int) -> None: return -def post_lowering(gm: torch.fx.GraphModule) -> torch.fx.GraphModule: +def post_lowering(gm: torch.fx.GraphModule, **kwargs: Any) -> torch.fx.GraphModule: """Applies the lowering passes to a graph module after torch.export/ torch.compile and their decompositions, returns the modified GraphModule""" logging.debug( f"Invoking DynamoPassManager and applying lowering passes: {ATEN_POST_LOWERING_PASSES}" ) - return ATEN_POST_LOWERING_PASSES(gm) + gm = ATEN_POST_LOWERING_PASSES(gm) + if kwargs.get("use_fp32_acc", False): + gm = accumulate_fp32_matmul(gm) + + return gm def pre_export_lowering(ep: torch.export.ExportedProgram) -> torch.fx.GraphModule: diff --git a/py/torch_tensorrt/dynamo/lowering/passes/accumulate_fp32_matmul.py b/py/torch_tensorrt/dynamo/lowering/passes/accumulate_fp32_matmul.py new file mode 100644 index 0000000000..a80d12630c --- /dev/null +++ 
b/py/torch_tensorrt/dynamo/lowering/passes/accumulate_fp32_matmul.py @@ -0,0 +1,51 @@ +import logging + +import torch +from torch_tensorrt.dynamo.lowering.passes.pass_utils import ( + clean_up_graph_after_modifications, +) + +logger = logging.getLogger(__name__) + + +def accumulate_fp32_matmul(gm: torch.fx.GraphModule) -> torch.fx.GraphModule: + """Replace a matmul layer with fp32 accumulation nodes""" + matmul_targets = [torch.ops.aten.mm.default, torch.ops.aten.bmm.default] + matmul_nodes = [node for node in gm.graph.nodes if node.target in matmul_targets] + + for matmul_node in matmul_nodes: + # Prior to the matmul node, insert a cast to the 32-bit float32 node + node_inputs = matmul_node.all_input_nodes + + # Upcast only mm nodes in addmm and leave the bias + if matmul_node.target == torch.ops.aten.addmm.default: + node_inputs = node_inputs[1:] + + for node_input in node_inputs: + with gm.graph.inserting_before(matmul_node): + node_32bit = gm.graph.call_function( + torch.ops.aten._to_copy.default, + args=(node_input,), + kwargs={"dtype": torch.float32}, + ) + + # Replace the input to matmul node with new 32-bit cast node + matmul_node.replace_input_with(node_input, node_32bit) + + # Add a cast back to original precision + with gm.graph.inserting_after(matmul_node): + node_orig_precision = gm.graph.call_function( + torch.ops.aten._to_copy.default, + args=(matmul_node,), + kwargs={"dtype": torch.float16}, + ) + matmul_node.replace_all_uses_with(node_orig_precision, propagate_meta=False) + # This is a hack. replace_all_uses_with isn't working here. It complains node_orig_precision is already being used before created. + node_orig_precision.replace_input_with( + node_orig_precision.all_input_nodes[0], matmul_node + ) + + gm = clean_up_graph_after_modifications(gm) + logger.debug(f"Graph after changing matmuls to use FP32 accumulation:\n{gm.graph}") + + return gm From 3c815f86b3ae53a27f150dfc7b42cdd4ebffd80b Mon Sep 17 00:00:00 2001 From: Chengzhe Xu Date: Wed, 28 Aug 2024 20:40:23 +0000 Subject: [PATCH 40/55] chore: updates --- .../dynamo/conversion/impl/elementwise/base.py | 6 ++---- py/torch_tensorrt/dynamo/conversion/impl/full.py | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/elementwise/base.py b/py/torch_tensorrt/dynamo/conversion/impl/elementwise/base.py index e9e80593e9..ca605c3189 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/elementwise/base.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/elementwise/base.py @@ -13,13 +13,11 @@ broadcast_to_same_shape, cast_trt_tensor, get_trt_tensor, -) -from torch_tensorrt.fx.converters.converter_utils import ( broadcast, has_dynamic_shape, set_layer_name, ) -from torch_tensorrt.fx.types import TRTElementWiseOp, TRTTensor +from torch_tensorrt.dynamo.types import TRTElementWiseOp, TRTTensor def get_python_op_from_trt_elementwise_op( @@ -152,7 +150,7 @@ def convert_binary_elementwise( if has_dynamic_shape(lhs_val.shape) or has_dynamic_shape(rhs_val.shape): lhs_val, rhs_val = broadcast( - ctx.net, lhs_val, rhs_val, f"{name}_broadcast_lhs", f"{name}_broadcast_rhs" + ctx, lhs_val, rhs_val, f"{name}_broadcast_lhs", f"{name}_broadcast_rhs" ) else: lhs_val, rhs_val = broadcast_to_same_shape( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/full.py b/py/torch_tensorrt/dynamo/conversion/impl/full.py index 34a2af564f..8a977b04b3 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/full.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/full.py @@ -68,7 +68,7 @@ def full( if 
type(fill_value) in (int, float): if isinstance(fill_value, float): output = cast_trt_tensor( - ctx, output, trt.float32, name + "_casted", target, source_ir + ctx, output, output_dtype, name + "_casted", target, source_ir ) output = impl.elementwise.add( ctx, target, source_ir, name + "_add", output, fill_value From 5617c0a10f40b69433386071f53bfadcbaa22ca8 Mon Sep 17 00:00:00 2001 From: "Zewen (Evan) Li" Date: Fri, 23 Aug 2024 22:01:20 -0700 Subject: [PATCH 41/55] chore: Bump TRT version to 10.3.0.26 (#3071) --- MODULE.bazel | 12 ++++++------ README.md | 4 ++-- core/conversion/converters/impl/batch_norm.cpp | 13 ++++++++++++- dev_dep_versions.yml | 2 +- docker/README.md | 4 ++-- packaging/pre_build_script_windows.sh | 2 +- packaging/smoke_test_script.sh | 2 +- py/ci/Dockerfile.ci | 12 ++++++------ pyproject.toml | 8 ++++---- .../conversion/converters/test_instance_norm.cpp | 2 +- tests/py/ts/api/test_classes.py | 8 +++----- toolchains/ci_workspaces/MODULE.bazel.tmpl | 12 ++++++------ toolchains/legacy/WORKSPACE.win.release.tmpl | 2 +- .../legacy/WORKSPACE.x86_64.release.rhel.tmpl | 6 +++--- 14 files changed, 49 insertions(+), 40 deletions(-) diff --git a/MODULE.bazel b/MODULE.bazel index 958ea92f1b..add7821fcb 100644 --- a/MODULE.bazel +++ b/MODULE.bazel @@ -79,20 +79,20 @@ http_archive( http_archive( name = "tensorrt", build_file = "@//third_party/tensorrt/archive:BUILD", - sha256 = "606436ed219c72d1a25a889b2b0ae5cb5a68499dd6f944da4cabb3c34c067d55", - strip_prefix = "TensorRT-10.1.0.27", + sha256 = "adff1cd5abe5d87013806172351e58fd024e5bf0fc61d49ef4b84cd38ed99081", + strip_prefix = "TensorRT-10.3.0.26", urls = [ - "https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.1.0/tars/TensorRT-10.1.0.27.Linux.x86_64-gnu.cuda-12.4.tar.gz", + "https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.3.0/tars/TensorRT-10.3.0.26.Linux.x86_64-gnu.cuda-12.5.tar.gz", ], ) http_archive( name = "tensorrt_win", build_file = "@//third_party/tensorrt/archive:BUILD", - sha256 = "2eb98008944945377eb328871a308704e95bf3bb295fc548784c6da41a70bbed", - strip_prefix = "TensorRT-10.1.0.27", + sha256 = "2bb4bcb79e8c33575816d874b0512ea28c302af1c06ee6d224da71aa182f75e0", + strip_prefix = "TensorRT-10.3.0.26", urls = [ - "https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.1.0/zip/TensorRT-10.1.0.27.Windows.win10.cuda-12.4.zip", + "https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.3.0/zip/TensorRT-10.3.0.26.Windows.win10.cuda-12.5.zip", ], ) diff --git a/README.md b/README.md index a39ccefd33..03062bf7f7 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ Torch-TensorRT [![Documentation](https://img.shields.io/badge/docs-master-brightgreen)](https://nvidia.github.io/Torch-TensorRT/) [![pytorch](https://img.shields.io/badge/PyTorch-2.4-green)](https://www.python.org/downloads/release/python-31013/) [![cuda](https://img.shields.io/badge/CUDA-12.4-green)](https://developer.nvidia.com/cuda-downloads) -[![trt](https://img.shields.io/badge/TensorRT-10.1.0-green)](https://github.com/nvidia/tensorrt-llm) +[![trt](https://img.shields.io/badge/TensorRT-10.3.0-green)](https://github.com/nvidia/tensorrt-llm) [![license](https://img.shields.io/badge/license-BSD--3--Clause-blue)](./LICENSE) [![linux_tests](https://github.com/pytorch/TensorRT/actions/workflows/build-test-linux.yml/badge.svg)](https://github.com/pytorch/TensorRT/actions/workflows/build-test-linux.yml) 
[![windows_tests](https://github.com/pytorch/TensorRT/actions/workflows/build-test-windows.yml/badge.svg)](https://github.com/pytorch/TensorRT/actions/workflows/build-test-windows.yml) @@ -119,7 +119,7 @@ These are the following dependencies used to verify the testcases. Torch-TensorR - Bazel 6.3.2 - Libtorch 2.5.0.dev (latest nightly) (built with CUDA 12.4) - CUDA 12.4 -- TensorRT 10.1.0.27 +- TensorRT 10.3.0.26 ## Deprecation Policy diff --git a/core/conversion/converters/impl/batch_norm.cpp b/core/conversion/converters/impl/batch_norm.cpp index 02535ffa66..07cf445f50 100644 --- a/core/conversion/converters/impl/batch_norm.cpp +++ b/core/conversion/converters/impl/batch_norm.cpp @@ -123,7 +123,7 @@ auto batch_norm_registrations TORCHTRT_UNUSED = // track_running_stats=True LOG_DEBUG("Args[3] running_mean : " << args[3].isIValue() << " / " << args[3].IValue()->isNone()); LOG_DEBUG("Args[4] running_var : " << args[4].isIValue() << " / " << args[4].IValue()->isNone()); - LOG_DEBUG("use_input_stats, momemtum, cudnn_enabled disregarded"); + LOG_DEBUG("use_input_stats, momemtum are disregarded"); LOG_DEBUG("ctx->input_is_dynamic : " << ctx->input_is_dynamic); // Expand spatial dims from 1D to 2D if needed @@ -154,6 +154,17 @@ auto batch_norm_registrations TORCHTRT_UNUSED = return true; } + auto cudnn_enabled = static_cast(args[8].unwrapToBool(false)); + if (!cudnn_enabled) { + LOG_DEBUG( + "cuDNN is not enabled, skipping instance_norm conversion. \ + Since TRT 10.0, cuDNN is loaded as a dynamic dependency, \ + so for some functionalities, users need to install correct \ + cuDNN version by themselves. Please see our support matrix \ + here: https://docs.nvidia.com/deeplearning/tensorrt/support-matrix/index.html."); + return false; + } + const int relu = 0; const float alpha = 0; LOG_DEBUG("Set parameter `relu` and `alpha` to 0"); diff --git a/dev_dep_versions.yml b/dev_dep_versions.yml index 4cc94e2f77..3b23c49da3 100644 --- a/dev_dep_versions.yml +++ b/dev_dep_versions.yml @@ -1,2 +1,2 @@ __cuda_version__: "12.4" -__tensorrt_version__: "10.1.0" +__tensorrt_version__: "10.3.0" diff --git a/docker/README.md b/docker/README.md index 7f69b7c789..3d44f45b74 100644 --- a/docker/README.md +++ b/docker/README.md @@ -17,14 +17,14 @@ Note: By default the container uses the `pre-cxx11-abi` version of Torch + Torch ### Instructions -- The example below uses TensorRT 10.1.0.27 +- The example below uses TensorRT 10.3.0.26 - See dependencies for a list of current default dependencies. > From root of Torch-TensorRT repo Build: ``` -DOCKER_BUILDKIT=1 docker build --build-arg TENSORRT_VERSION=10.1.0 -f docker/Dockerfile -t torch_tensorrt:latest . +DOCKER_BUILDKIT=1 docker build --build-arg TENSORRT_VERSION=10.3.0 -f docker/Dockerfile -t torch_tensorrt:latest . 
``` Run: diff --git a/packaging/pre_build_script_windows.sh b/packaging/pre_build_script_windows.sh index 8906179dcd..86ac5e3055 100644 --- a/packaging/pre_build_script_windows.sh +++ b/packaging/pre_build_script_windows.sh @@ -8,7 +8,7 @@ pip install tensorrt==${TRT_VERSION} tensorrt-${CU_VERSION::4}-bindings==${TRT_V choco install bazelisk -y -#curl -Lo TensorRT.zip https://developer.download.nvidia.com/compute/machine-learning/tensorrt/10.0.1/zip/TensorRT-10.0.1.6.Windows10.win10.cuda-12.4.zip +#curl -Lo TensorRT.zip https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.3.0/zip/TensorRT-10.3.0.26.Windows.win10.cuda-12.5.zip #unzip -o TensorRT.zip -d C:/ TORCH_TORCHVISION=$(grep "^torch" py/requirements.txt) INDEX_URL=https://download.pytorch.org/whl/${CHANNEL}/${CU_VERSION} diff --git a/packaging/smoke_test_script.sh b/packaging/smoke_test_script.sh index ea187eb5d1..19d9d717a4 100644 --- a/packaging/smoke_test_script.sh +++ b/packaging/smoke_test_script.sh @@ -2,5 +2,5 @@ # The issue was smoke test installs the built torch_tensorrt wheel file and checks `import torch_tensorrt; print(torch_tensorrt.__version__)` # Since tensorrt cannot be pip installable in CI, the smoke test will fail. # One way we tried to handle it is manually install tensorrt wheel while by extracting from the tarball. -# However, the TensorRT-10.1.0.27/lib path doesn't seem to show up in LD_LIBRARY_PATH even if we explicitly set it. +# However, the TensorRT-10.3.0.26/lib path doesn't seem to show up in LD_LIBRARY_PATH even if we explicitly set it. # TODO: Implement a custom smoke_test script to verify torch_tensorrt installation. \ No newline at end of file diff --git a/py/ci/Dockerfile.ci b/py/ci/Dockerfile.ci index 82a9dbdb7c..eddf12cefb 100644 --- a/py/ci/Dockerfile.ci +++ b/py/ci/Dockerfile.ci @@ -3,13 +3,13 @@ FROM pytorch/manylinux-builder:cuda12.4 RUN yum install -y ninja-build # download TensorRT tarball -RUN wget -q https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.1.0/tars/TensorRT-10.1.0.27.Linux.x86_64-gnu.cuda-12.4.tar.gz \ -&& gunzip TensorRT-10.1.0.27.Linux.x86_64-gnu.cuda-12.4.tar.gz \ -&& tar -xvf TensorRT-10.1.0.27.Linux.x86_64-gnu.cuda-12.4.tar \ -&& rm TensorRT-10.1.0.27.Linux.x86_64-gnu.cuda-12.4.tar +RUN wget -q https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.3.0/tars/TensorRT-10.3.0.26.Linux.x86_64-gnu.cuda-12.5.tar.gz \ +&& gunzip TensorRT-10.3.0.26.Linux.x86_64-gnu.cuda-12.5.tar.gz \ +&& tar -xvf TensorRT-10.3.0.26.Linux.x86_64-gnu.cuda-12.5.tar \ +&& rm TensorRT-10.3.0.26.Linux.x86_64-gnu.cuda-12.5.tar -ENV TENSORRT_DIR=/TensorRT-10.1.0.27 -ENV TENSORRT_VERSION=10.1.0 +ENV TENSORRT_DIR=/TensorRT-10.3.0.26 +ENV TENSORRT_VERSION=10.3.0 RUN wget https://github.com/bazelbuild/bazelisk/releases/download/v1.17.0/bazelisk-linux-amd64 \ && mv bazelisk-linux-amd64 /usr/bin/bazel \ diff --git a/pyproject.toml b/pyproject.toml index a84724f968..f6230c8a74 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ requires = [ "cffi>=1.15.1", "typing-extensions>=4.7.0", "future>=0.18.3", - "tensorrt==10.1.0", + "tensorrt-cu12==10.3.0", "torch >=2.5.0.dev,<2.6.0", "pybind11==2.6.2", "numpy", @@ -55,9 +55,9 @@ keywords = [ ] dependencies = [ "torch >=2.5.0.dev,<2.6.0", - "tensorrt==10.1.0", - "tensorrt-cu12_bindings==10.1.0", - "tensorrt-cu12_libs==10.1.0", + "tensorrt-cu12==10.3.0", + "tensorrt-cu12-bindings==10.3.0", + "tensorrt-cu12-libs==10.3.0", "packaging>=23", "numpy", "typing-extensions>=4.7.0", diff --git 
a/tests/core/conversion/converters/test_instance_norm.cpp b/tests/core/conversion/converters/test_instance_norm.cpp index 336d050fd1..b94145e4e3 100644 --- a/tests/core/conversion/converters/test_instance_norm.cpp +++ b/tests/core/conversion/converters/test_instance_norm.cpp @@ -18,7 +18,7 @@ constexpr auto graph = R"IR( %running_mean.1 : Tensor?, %running_var.1 : Tensor?, %use_input_stats.1 : bool): - %cudnn_enabled.1 : bool = prim::Constant[value=1]() + %cudnn_enabled.1 : bool = prim::Constant[value=0]() %momentum.1 : float = prim::Constant[value=0.10000000000000001]() %eps.1 : float = prim::Constant[value=1.0000000000000001e-05]() %4 : Tensor = aten::instance_norm(%input.1, diff --git a/tests/py/ts/api/test_classes.py b/tests/py/ts/api/test_classes.py index 2a152cdec7..ee94e01740 100644 --- a/tests/py/ts/api/test_classes.py +++ b/tests/py/ts/api/test_classes.py @@ -309,10 +309,8 @@ def test_get_layer_info(self): """ { "Layers": [ - "reshape_before_%26 : Tensor = aten::matmul(%x.1, %25)", - "%26 : Tensor = aten::matmul(%x.1, %25) + [Freeze Tensor %27 : Tensor = trt::const(%10) ] + (Unnamed Layer* 4) [Shuffle] + unsqueeze_node_after_[Freeze Tensor %27 : Tensor = trt::const(%10) ] + (Unnamed Layer* 4) [Shuffle]_(Unnamed Layer* 4) [Shuffle]_output + %28 : Tensor = aten::add(%27, %26, %24)", - "%31 : Tensor = aten::matmul(%28, %30) + [Freeze Tensor %32 : Tensor = trt::const(%12) ] + (Unnamed Layer* 10) [Shuffle] + unsqueeze_node_after_[Freeze Tensor %32 : Tensor = trt::const(%12) ] + (Unnamed Layer* 10) [Shuffle]_(Unnamed Layer* 10) [Shuffle]_output + %33 : Tensor = aten::add(%32, %31, %29)", - "copied_squeeze_after_%33 : Tensor = aten::add(%32, %31, %29)" + "%26 : Tensor = aten::matmul(%x.1, %25)_myl0_0", + "%31 : Tensor = aten::matmul(%28, %30)_myl0_1" ], "Bindings": [ "input_0", @@ -326,7 +324,7 @@ def test_get_layer_info(self): trt_mod = TestTorchTensorRTModule._get_trt_mod() trt_json = json.loads(trt_mod.get_layer_info()) [self.assertTrue(k in trt_json.keys()) for k in ["Layers", "Bindings"]] - self.assertTrue(len(trt_json["Layers"]) == 4) + self.assertTrue(len(trt_json["Layers"]) == 2) self.assertTrue(len(trt_json["Bindings"]) == 2) diff --git a/toolchains/ci_workspaces/MODULE.bazel.tmpl b/toolchains/ci_workspaces/MODULE.bazel.tmpl index b897d35f56..49ad6f473a 100644 --- a/toolchains/ci_workspaces/MODULE.bazel.tmpl +++ b/toolchains/ci_workspaces/MODULE.bazel.tmpl @@ -67,20 +67,20 @@ http_archive( http_archive( name = "tensorrt", build_file = "@//third_party/tensorrt/archive:BUILD", - sha256 = "606436ed219c72d1a25a889b2b0ae5cb5a68499dd6f944da4cabb3c34c067d55", - strip_prefix = "TensorRT-10.1.0.27", + sha256 = "adff1cd5abe5d87013806172351e58fd024e5bf0fc61d49ef4b84cd38ed99081", + strip_prefix = "TensorRT-10.3.0.26", urls = [ - "https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.1.0/tars/TensorRT-10.1.0.27.Linux.x86_64-gnu.cuda-12.4.tar.gz", + "https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.3.0/tars/TensorRT-10.3.0.26.Linux.x86_64-gnu.cuda-12.5.tar.gz", ], ) http_archive( name = "tensorrt_win", build_file = "@//third_party/tensorrt/archive:BUILD", - sha256 = "2eb98008944945377eb328871a308704e95bf3bb295fc548784c6da41a70bbed", - strip_prefix = "TensorRT-10.1.0.27", + sha256 = "2bb4bcb79e8c33575816d874b0512ea28c302af1c06ee6d224da71aa182f75e0", + strip_prefix = "TensorRT-10.3.0.26", urls = [ - "https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.1.0/zip/TensorRT-10.1.0.27.Windows.win10.cuda-12.4.zip", + 
"https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.3.0/zip/TensorRT-10.3.0.26.Windows.win10.cuda-12.5.zip", ], ) diff --git a/toolchains/legacy/WORKSPACE.win.release.tmpl b/toolchains/legacy/WORKSPACE.win.release.tmpl index ce3df15602..58fce5cf54 100644 --- a/toolchains/legacy/WORKSPACE.win.release.tmpl +++ b/toolchains/legacy/WORKSPACE.win.release.tmpl @@ -63,7 +63,7 @@ http_archive( new_local_repository( name = "tensorrt_win", - path = "C:/TensorRT-10.1.0.27", + path = "C:/TensorRT-10.3.0.26", build_file = "@//third_party/tensorrt/local:BUILD" ) diff --git a/toolchains/legacy/WORKSPACE.x86_64.release.rhel.tmpl b/toolchains/legacy/WORKSPACE.x86_64.release.rhel.tmpl index 5b18a48139..97b3a8c566 100644 --- a/toolchains/legacy/WORKSPACE.x86_64.release.rhel.tmpl +++ b/toolchains/legacy/WORKSPACE.x86_64.release.rhel.tmpl @@ -71,10 +71,10 @@ http_archive( http_archive( name = "tensorrt", build_file = "@//third_party/tensorrt/archive:BUILD", - sha256 = "606436ed219c72d1a25a889b2b0ae5cb5a68499dd6f944da4cabb3c34c067d55", - strip_prefix = "TensorRT-10.1.0.27", + sha256 = "adff1cd5abe5d87013806172351e58fd024e5bf0fc61d49ef4b84cd38ed99081", + strip_prefix = "TensorRT-10.3.0.26", urls = [ - "https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.1.0/tars/TensorRT-10.1.0.27.Linux.x86_64-gnu.cuda-12.4.tar.gz", + "https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.3.0/tars/TensorRT-10.3.0.26.Linux.x86_64-gnu.cuda-12.5.tar.gz", ], ) From 213526e52320ba13aac63aa894bacc546899ba52 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Fri, 30 Aug 2024 15:47:46 -0700 Subject: [PATCH 42/55] chore: updates --- examples/dynamo/torch_export_gpt2.py | 4 ++-- examples/dynamo/torch_export_llama2.py | 2 ++ py/requirements.txt | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/dynamo/torch_export_gpt2.py b/examples/dynamo/torch_export_gpt2.py index f30c8dc3e7..531fa68dd4 100644 --- a/examples/dynamo/torch_export_gpt2.py +++ b/examples/dynamo/torch_export_gpt2.py @@ -57,11 +57,11 @@ trt_model = torch_tensorrt.dynamo.compile( gpt2_ep, inputs=[input_ids], - enabled_precisions={torch.float16}, + enabled_precisions={torch.float32}, truncate_double=True, device=DEVICE, disable_tf32=True, - use_strong_types=False, + use_strong_types=True, use_fp32_acc=True, ) diff --git a/examples/dynamo/torch_export_llama2.py b/examples/dynamo/torch_export_llama2.py index 195944688b..6fe725f395 100644 --- a/examples/dynamo/torch_export_llama2.py +++ b/examples/dynamo/torch_export_llama2.py @@ -54,6 +54,8 @@ truncate_double=True, device=DEVICE, disable_tf32=True, + use_strong_types=True, + use_fp32_acc=True, ) # Auto-regressive generation loop for greedy decoding using TensorRT model diff --git a/py/requirements.txt b/py/requirements.txt index 288d766fa6..5bf38d29c3 100644 --- a/py/requirements.txt +++ b/py/requirements.txt @@ -6,4 +6,4 @@ torch>=2.5.0.dev,<2.6.0 torchvision>=0.20.0.dev,<0.21.0 --extra-index-url https://pypi.ngc.nvidia.com pyyaml -tensorrt==10.1.0 +tensorrt==10.3.0 From c193593ed586af7acb26df7b718c176709e6b1bb Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Fri, 30 Aug 2024 16:41:13 -0700 Subject: [PATCH 43/55] chore : updates --- examples/dynamo/torch_export_llama2.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/dynamo/torch_export_llama2.py b/examples/dynamo/torch_export_llama2.py index 6fe725f395..15655efda4 100644 --- a/examples/dynamo/torch_export_llama2.py +++ 
b/examples/dynamo/torch_export_llama2.py @@ -24,9 +24,13 @@ # CPU is used here so that GPU memory is reserved for TRT compilation. llama_path = "meta-llama/Llama-2-7b-chat-hf" with torch.no_grad(): - model = AutoModelForCausalLM.from_pretrained( - llama_path, use_cache=False, attn_implementation="eager" - ).eval() + model = ( + AutoModelForCausalLM.from_pretrained( + llama_path, use_cache=False, attn_implementation="eager" + ) + .half() + .eval() + ) tokenizer = AutoTokenizer.from_pretrained(llama_path) From 0de0b16517d48b3cc800aa1eee173c9e9d613362 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 24 Sep 2024 11:54:15 -0700 Subject: [PATCH 44/55] chore: updates --- .../dynamo/lowering/passes/accumulate_fp32_matmul.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/py/torch_tensorrt/dynamo/lowering/passes/accumulate_fp32_matmul.py b/py/torch_tensorrt/dynamo/lowering/passes/accumulate_fp32_matmul.py index a80d12630c..e600dc416f 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/accumulate_fp32_matmul.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/accumulate_fp32_matmul.py @@ -10,17 +10,17 @@ def accumulate_fp32_matmul(gm: torch.fx.GraphModule) -> torch.fx.GraphModule: """Replace a matmul layer with fp32 accumulation nodes""" - matmul_targets = [torch.ops.aten.mm.default, torch.ops.aten.bmm.default] + matmul_targets = [ + torch.ops.aten.mm.default, + torch.ops.aten.bmm.default, + torch.ops.aten.addmm.default, + ] matmul_nodes = [node for node in gm.graph.nodes if node.target in matmul_targets] for matmul_node in matmul_nodes: # Prior to the matmul node, insert a cast to the 32-bit float32 node node_inputs = matmul_node.all_input_nodes - # Upcast only mm nodes in addmm and leave the bias - if matmul_node.target == torch.ops.aten.addmm.default: - node_inputs = node_inputs[1:] - for node_input in node_inputs: with gm.graph.inserting_before(matmul_node): node_32bit = gm.graph.call_function( From 71e33cbe395aee88be8f3ffc5de58aa7e766210f Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 26 Sep 2024 11:24:12 -0700 Subject: [PATCH 45/55] chore: updates --- examples/dynamo/torch_export_gpt2.py | 18 ++++++++---- py/torch_tensorrt/dynamo/_compiler.py | 28 ++++++++++++++++++- py/torch_tensorrt/dynamo/_defaults.py | 2 ++ py/torch_tensorrt/dynamo/_settings.py | 6 ++++ .../dynamo/conversion/_TRTInterpreter.py | 1 - 5 files changed, 47 insertions(+), 8 deletions(-) diff --git a/examples/dynamo/torch_export_gpt2.py b/examples/dynamo/torch_export_gpt2.py index a26305e4a3..8137575375 100644 --- a/examples/dynamo/torch_export_gpt2.py +++ b/examples/dynamo/torch_export_gpt2.py @@ -25,12 +25,16 @@ # CPU is used here so that GPU memory is reserved for TRT compilation. 
with torch.no_grad(): tokenizer = AutoTokenizer.from_pretrained("gpt2") - model = AutoModelForCausalLM.from_pretrained( - "gpt2", - pad_token_id=tokenizer.eos_token_id, - use_cache=False, - attn_implementation="eager", - ).eval() + model = ( + AutoModelForCausalLM.from_pretrained( + "gpt2", + pad_token_id=tokenizer.eos_token_id, + use_cache=False, + attn_implementation="eager", + ) + .eval() + .half() + ) # %% # Tokenize a sample input prompt and get pytorch model outputs @@ -56,6 +60,8 @@ truncate_double=True, device=DEVICE, disable_tf32=True, + use_strong_types=True, + use_fp32_acc=True, ) # Auto-regressive generation loop for greedy decoding using TensorRT model diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 97aa2ec443..b1d92eddc3 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -88,6 +88,8 @@ def compile( engine_cache_dir: str = _defaults.ENGINE_CACHE_DIR, engine_cache_size: int = _defaults.ENGINE_CACHE_SIZE, custom_engine_cache: Optional[BaseEngineCache] = _defaults.CUSTOM_ENGINE_CACHE, + use_strong_types: bool = _defaults.USE_STRONG_TYPES, + use_fp32_acc: bool = _defaults.USE_FP32_ACC, **kwargs: Any, ) -> torch.fx.GraphModule: """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT @@ -158,6 +160,8 @@ def compile( engine_cache_dir (Optional[str]): Directory to store the cached TRT engines engine_cache_size (Optional[int]): Maximum hard-disk space (bytes) to use for the engine cache, default is 1GB. If the cache exceeds this size, the oldest engines will be removed by default custom_engine_cache (Optional[BaseEngineCache]): Engine cache instance to use for saving and loading engines. Users can provide their own engine cache by inheriting from BaseEngineCache. If used, engine_cache_dir and engine_cache_size will be ignored. + use_strong_types (bool): Enable strong typing in TensorRT compilation + use_fp32_acc (bool): This option inserts cast to FP32 nodes around matmul layers and TensorRT ensures the accumulation of matmul happens in FP32. Use this only when FP16 precision is configured in enabled_precisions. **kwargs: Any, Returns: torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT @@ -197,6 +201,20 @@ def compile( "\nThis feature is unimplemented in Torch-TRT Dynamo currently." ) + if use_strong_types: + if len(enabled_precisions) != 1 or not any( + x in enabled_precisions for x in {torch.float32, dtype.f32} + ): + raise AssertionError( + f"When use_strong_types is enabled, only torch.float32 is allowed in the enabled_precisions but found {enabled_precisions}" + ) + + if use_fp32_acc: + logger.debug( + "FP32 accumulation for matmul layers is enabled. This option should only be enabled if the model already has FP16 weights and has no effect if it has FP32 weights. \ + This flag inserts casts around matmul layers and ensures TensorRT which executes the matmul layers in FP16 with FP32 accumulation." 
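Reviewer note: the debug message above is the gist of the feature. Purely as an illustration (not the lowering pass itself), the numeric pattern requested from TensorRT corresponds to the following eager-mode sketch, assuming a CUDA device with FP16 support:

```python
import torch

# Illustration of the pattern the lowering pass emits around each matmul:
# upcast the FP16 operands to FP32, multiply (and accumulate) in FP32,
# then cast the product back to FP16.
a = torch.randn(128, 128, dtype=torch.float16, device="cuda")
b = torch.randn(128, 128, dtype=torch.float16, device="cuda")

out_fp16 = a @ b                               # accumulation precision is backend-dependent
out_fp32_acc = (a.float() @ b.float()).half()  # what use_fp32_acc requests from TensorRT

print(torch.max(torch.abs(out_fp16.float() - out_fp32_acc.float())))
```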
+ ) + # Aliasing inputs to arg_inputs for better understanding if not arg_inputs and not inputs: raise AssertionError("'arg_inputs' and 'inputs' should not both be None.") @@ -232,7 +250,7 @@ def compile( logger.debug("Input graph: " + str(gm.graph)) # Apply lowering on the graph module - gm = post_lowering(gm) + gm = post_lowering(gm, use_fp32_acc=use_fp32_acc) logger.debug("Lowered Input graph: " + str(gm.graph)) engine_cache = None @@ -281,6 +299,8 @@ def compile( "lazy_engine_init": lazy_engine_init, "cache_built_engines": cache_built_engines, "reuse_cached_engines": reuse_cached_engines, + "use_strong_types": use_strong_types, + "use_fp32_acc": use_fp32_acc, } settings = CompilationSettings(**compilation_options) @@ -520,6 +540,8 @@ def convert_exported_program_to_serialized_trt_engine( calibrator: object = None, allow_shape_tensors: bool = False, timing_cache_path: str = _defaults.TIMING_CACHE_PATH, + use_strong_types: bool = _defaults.USE_STRONG_TYPES, + use_fp32_acc: bool = _defaults.USE_FP32_ACC, **kwargs: Any, ) -> bytes: """Convert an ExportedProgram to a serialized TensorRT engine @@ -578,6 +600,8 @@ def convert_exported_program_to_serialized_trt_engine( calibrator (Union(torch_tensorrt._C.IInt8Calibrator, tensorrt.IInt8Calibrator)): Calibrator object which will provide data to the PTQ system for INT8 Calibration allow_shape_tensors: (Experimental) Allow aten::size to output shape tensors using IShapeLayer in TensorRT timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation + use_strong_types (bool): Enable strong typing in TensorRT compilation + use_fp32_acc (bool): This option inserts cast to FP32 nodes around matmul layers and TensorRT ensures the accumulation of matmul happens in FP32. Use this only when FP16 precision is configured in enabled_precisions. 
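Reviewer note: a minimal usage sketch for this entry point with the two new options might look as follows. `model` and `example_input` are placeholders, the `inputs=` keyword is an assumption based on the dynamo frontend at this point in the series, and `use_strong_types` is the flag name before the later rename in this stack:

```python
import torch
import torch_tensorrt

# Placeholders: an FP16 module and a representative input tensor on the GPU.
model = model.half().eval().cuda()
ep = torch.export.export(model, (example_input,))

serialized_engine = torch_tensorrt.dynamo.convert_exported_program_to_serialized_trt_engine(
    ep,
    inputs=[example_input],                # keyword name is an assumption
    enabled_precisions={torch.float32},    # strong typing only accepts FP32 here
    use_strong_types=True,
    use_fp32_acc=True,
)

with open("trt.engine", "wb") as f:
    f.write(serialized_engine)
```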
Returns: bytes: Serialized TensorRT engine, can either be saved to a file or deserialized via TensorRT APIs """ @@ -651,6 +675,8 @@ def convert_exported_program_to_serialized_trt_engine( "dla_local_dram_size": dla_local_dram_size, "dla_global_dram_size": dla_global_dram_size, "timing_cache_path": timing_cache_path, + "use_strong_types": use_strong_types, + "use_fp32_acc": use_fp32_acc, } exported_program = pre_export_lowering(exported_program) diff --git a/py/torch_tensorrt/dynamo/_defaults.py b/py/torch_tensorrt/dynamo/_defaults.py index 68e446dab5..788829e757 100644 --- a/py/torch_tensorrt/dynamo/_defaults.py +++ b/py/torch_tensorrt/dynamo/_defaults.py @@ -40,6 +40,8 @@ ENGINE_CACHE_DIR = os.path.join(tempfile.gettempdir(), "torch_tensorrt_engine_cache") ENGINE_CACHE_SIZE = 1073741824 CUSTOM_ENGINE_CACHE = None +USE_STRONG_TYPES = False +USE_FP32_ACC = False def default_device() -> Device: diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py index f8886fbd67..b042c65240 100644 --- a/py/torch_tensorrt/dynamo/_settings.py +++ b/py/torch_tensorrt/dynamo/_settings.py @@ -30,7 +30,9 @@ TIMING_CACHE_PATH, TRUNCATE_DOUBLE, USE_FAST_PARTITIONER, + USE_FP32_ACC, USE_PYTHON_RUNTIME, + USE_STRONG_TYPES, VERSION_COMPATIBLE, WORKSPACE_SIZE, default_device, @@ -78,6 +80,8 @@ class CompilationSettings: timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation cache_built_engines (bool): Whether to save the compiled TRT engines to storage reuse_cached_engines (bool): Whether to load the compiled TRT engines from storage + use_strong_types (bool): Enable strong typing in TensorRT compilation + use_fp32_acc (bool): This option inserts cast to FP32 nodes around matmul layers and TensorRT ensures the accumulation of matmul happens in FP32. Use this only when FP16 precision is configured in enabled_precisions. 
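Reviewer note: for context, "strong typing" maps to the `STRONGLY_TYPED` network-definition flag that the interpreter sets earlier in this series, which lets TensorRT derive layer precisions from tensor dtypes instead of builder precision flags. A standalone sketch, assuming a local TensorRT 10.x install:

```python
import tensorrt as trt

# Same bit-flag construction as in _TRTInterpreter.py.
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
flag = 1 << int(trt.NetworkDefinitionCreationFlag.STRONGLY_TYPED)
network = builder.create_network(flag)

# Converters can query the flag, as the activation converter does later in this stack.
assert network.get_flag(trt.NetworkDefinitionCreationFlag.STRONGLY_TYPED)
```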
""" enabled_precisions: Set[dtype] = field(default_factory=lambda: ENABLED_PRECISIONS) @@ -112,6 +116,8 @@ class CompilationSettings: lazy_engine_init: bool = LAZY_ENGINE_INIT cache_built_engines: bool = CACHE_BUILT_ENGINES reuse_cached_engines: bool = REUSE_CACHED_ENGINES + use_strong_types: bool = USE_STRONG_TYPES + use_fp32_acc: bool = USE_FP32_ACC _SETTINGS_TO_BE_ENGINE_INVARIANT = ( diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index 39de405b40..231ca51306 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -124,7 +124,6 @@ def __init__( dict() ) - self.compilation_settings = compilation_settings # Data types for TRT Module output Tensors self.output_dtypes = ( [dtype._from(o) for o in output_dtypes] if output_dtypes else None From 4257b1e58b69d9cf6c7de2fd89439f569ab37a6b Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 30 Sep 2024 16:07:12 -0700 Subject: [PATCH 46/55] chore: updates --- docsrc/index.rst | 2 + examples/dynamo/README.rst | 31 ++++++++++---- examples/dynamo/requirements.txt | 4 +- py/torch_tensorrt/dynamo/_compiler.py | 2 +- py/torch_tensorrt/dynamo/backend/backends.py | 2 +- .../lowering/passes/accumulate_fp32_matmul.py | 2 - .../lowering/test_aten_lowering_passes.py | 42 +++++++++++++++++-- tests/py/dynamo/testing_utilities.py | 6 ++- 8 files changed, 74 insertions(+), 17 deletions(-) diff --git a/docsrc/index.rst b/docsrc/index.rst index 757acc2011..0bef2f0664 100644 --- a/docsrc/index.rst +++ b/docsrc/index.rst @@ -118,6 +118,8 @@ Tutorials tutorials/_rendered_examples/distributed_inference/data_parallel_gpt2 tutorials/_rendered_examples/distributed_inference/data_parallel_stable_diffusion tutorials/_rendered_examples/dynamo/mutable_torchtrt_module_example + tutorials/_rendered_examples/dynamo/torch_export_gpt2 + tutorials/_rendered_examples/dynamo/torch_export_llama2 Python API Documentation ------------------------ diff --git a/examples/dynamo/README.rst b/examples/dynamo/README.rst index ff3563cffe..6be2aa6515 100644 --- a/examples/dynamo/README.rst +++ b/examples/dynamo/README.rst @@ -1,15 +1,24 @@ .. _torch_compile: -Dynamo / ``torch.compile`` ----------------------------- +Torch-TensorRT Examples +==================================== -Torch-TensorRT provides a backend for the new ``torch.compile`` API released in PyTorch 2.0. In the following examples we describe -a number of ways you can leverage this backend to accelerate inference. +Please refer to the following examples which demonstrate the usage of different features of Torch-TensorRT. We also provide +examples of Torch-TensorRT compilation of select computer vision and language models. -* :ref:`torch_compile_resnet`: Compiling a ResNet model using the Torch Compile Frontend for ``torch_tensorrt.compile`` -* :ref:`torch_compile_transformer`: Compiling a Transformer model using ``torch.compile`` +Dependencies +------------------------------------ + +Please install the following external depencies (assuming you already have `torch_tensorrt` installed) + +.. 
code-block:: python + + pip install -r requirements.txt + + +Compiler Features +------------------------------------ * :ref:`torch_compile_advanced_usage`: Advanced usage including making a custom backend to use directly with the ``torch.compile`` API -* :ref:`torch_compile_stable_diffusion`: Compiling a Stable Diffusion model using ``torch.compile`` * :ref:`torch_export_cudagraphs`: Using the Cudagraphs integration with `ir="dynamo"` * :ref:`custom_kernel_plugins`: Creating a plugin to use a custom kernel inside TensorRT engines * :ref:`refit_engine_example`: Refitting a compiled TensorRT Graph Module with updated weights @@ -17,3 +26,11 @@ a number of ways you can leverage this backend to accelerate inference. * :ref:`vgg16_fp8_ptq`: Compiling a VGG16 model with FP8 and PTQ using ``torch.compile`` * :ref:`engine_caching_example`: Utilizing engine caching to speed up compilation times * :ref:`engine_caching_bert_example`: Demonstrating engine caching on BERT + +Model Zoo +------------------------------------ +* :ref:`torch_compile_resnet`: Compiling a ResNet model using the Torch Compile Frontend for ``torch_tensorrt.compile`` +* :ref:`torch_compile_transformer`: Compiling a Transformer model using ``torch.compile`` +* :ref:`torch_compile_stable_diffusion`: Compiling a Stable Diffusion model using ``torch.compile`` +* :ref:`_torch_export_gpt2`: Compiling a GPT2 model using AOT workflow (`ir=dynamo`) +* :ref:`_torch_export_llama2`: Compiling a Llama2 model using AOT workflow (`ir=dynamo`) \ No newline at end of file diff --git a/examples/dynamo/requirements.txt b/examples/dynamo/requirements.txt index 6e53935186..59a802918c 100644 --- a/examples/dynamo/requirements.txt +++ b/examples/dynamo/requirements.txt @@ -1,4 +1,4 @@ cupy==13.1.0 -torch>=2.4.0.dev20240503+cu121 -torch-tensorrt>=2.4.0.dev20240503+cu121 triton==2.3.0 +diffusers==0.30.3 +transformers==4.44.2 \ No newline at end of file diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index b1d92eddc3..8062f01e84 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -212,7 +212,7 @@ def compile( if use_fp32_acc: logger.debug( "FP32 accumulation for matmul layers is enabled. This option should only be enabled if the model already has FP16 weights and has no effect if it has FP32 weights. \ - This flag inserts casts around matmul layers and ensures TensorRT which executes the matmul layers in FP16 with FP32 accumulation." + This flag inserts casts around matmul layers and ensures TensorRT executes the matmul layers in FP16 with FP32 accumulation." 
) # Aliasing inputs to arg_inputs for better understanding diff --git a/py/torch_tensorrt/dynamo/backend/backends.py b/py/torch_tensorrt/dynamo/backend/backends.py index 605d963a50..aa8766fdae 100644 --- a/py/torch_tensorrt/dynamo/backend/backends.py +++ b/py/torch_tensorrt/dynamo/backend/backends.py @@ -100,7 +100,7 @@ def _pretraced_backend( logger.debug("Post-AOT Autograd graph:\n" + str(gm.graph)) - gm = post_lowering(gm) + gm = post_lowering(gm, use_fp32_acc=settings.use_fp32_acc) logger.debug("Lowered Input graph:\n " + str(gm.graph)) diff --git a/py/torch_tensorrt/dynamo/lowering/passes/accumulate_fp32_matmul.py b/py/torch_tensorrt/dynamo/lowering/passes/accumulate_fp32_matmul.py index e600dc416f..d69249088c 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/accumulate_fp32_matmul.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/accumulate_fp32_matmul.py @@ -16,7 +16,6 @@ def accumulate_fp32_matmul(gm: torch.fx.GraphModule) -> torch.fx.GraphModule: torch.ops.aten.addmm.default, ] matmul_nodes = [node for node in gm.graph.nodes if node.target in matmul_targets] - for matmul_node in matmul_nodes: # Prior to the matmul node, insert a cast to the 32-bit float32 node node_inputs = matmul_node.all_input_nodes @@ -47,5 +46,4 @@ def accumulate_fp32_matmul(gm: torch.fx.GraphModule) -> torch.fx.GraphModule: gm = clean_up_graph_after_modifications(gm) logger.debug(f"Graph after changing matmuls to use FP32 accumulation:\n{gm.graph}") - return gm diff --git a/tests/py/dynamo/lowering/test_aten_lowering_passes.py b/tests/py/dynamo/lowering/test_aten_lowering_passes.py index ad3fc8fa79..86cd5e3699 100644 --- a/tests/py/dynamo/lowering/test_aten_lowering_passes.py +++ b/tests/py/dynamo/lowering/test_aten_lowering_passes.py @@ -466,9 +466,6 @@ def forward(self, input, weight, bias): ) torch._dynamo.reset() - @unittest.skip( - "This test has threshold failures. 
This is tracked at https://github.com/pytorch/TensorRT/issues/2715", - ) def test_lower_linear_batch(self): class Linear(torch.nn.Module): def forward(self, input, weight, bias): @@ -575,5 +572,44 @@ def forward(self, input): torch._dynamo.reset() +class TestFP32Accumulation(TestCase): + def test_fp32_acc(self): + class FP32Acc(torch.nn.Module): + def forward(self, input, weight): + out = torch.ops.aten.mm.default(input, weight) + return out + + inputs = [ + torch.rand((3, 4)).cuda(), + torch.rand((4, 5)).cuda(), + ] + + fx_graph = torch.fx.symbolic_trace(FP32Acc()) + expected_ops = {torch.ops.aten._to_copy.default, torch.ops.aten.mm.default} + unexpected_ops = {} + + unexpected_ops_seen, expected_ops_unseen = lower_graph_testing( + fx_graph, + inputs, + expected_ops=expected_ops, + unexpected_ops=unexpected_ops, + min_block_size=1, + use_fp32_acc=True, + ) + + self.assertEqual( + len(unexpected_ops_seen), + 0, + f"The following unexpected ops were encountered: {unexpected_ops_seen}", + ) + + self.assertEqual( + len(expected_ops_unseen), + 0, + f"The following expected ops were not encountered: {expected_ops_unseen}", + ) + torch._dynamo.reset() + + if __name__ == "__main__": run_tests() diff --git a/tests/py/dynamo/testing_utilities.py b/tests/py/dynamo/testing_utilities.py index 9e33aec53a..c0126fad24 100644 --- a/tests/py/dynamo/testing_utilities.py +++ b/tests/py/dynamo/testing_utilities.py @@ -24,6 +24,7 @@ def fx_dynamo_testing_backend( min_block_size: int = 3, torch_executed_ops: Sequence[str] = set(), use_fast_partitioner: bool = True, + use_fp32_acc: bool = False, ): """Helper Dynamo backend exclusively for testing""" custom_backend = partial( @@ -50,7 +51,7 @@ def fx_dynamo_testing_backend( decompositions=get_decompositions(), ) - gm = post_lowering(gm) + gm = post_lowering(gm, use_fp32_acc=use_fp32_acc) trt_compiled = custom_backend( gm, @@ -153,6 +154,7 @@ def lower_graph_testing( torch_executed_ops: Sequence[str] = set(), testing_partitioning: bool = False, use_fast_partitioner: bool = True, + use_fp32_acc: bool = False, ): """Helper function to assist with graph lowering for testing of Dynamo compile @@ -165,6 +167,7 @@ def lower_graph_testing( torch_executed_ops: Sequence of operations to run in Torch, regardless of converter coverage testing_partitioning: Whether partitioning is being tested (to analyze only TRT-supported ops) use_fast_partitioner: Whether to use the fast or global partitioner + use_fp32_acc: This option inserts cast to FP32 nodes around matmul layers and TensorRT ensures the accumulation of matmul happens in FP32. Use this only when FP16 precision is configured in enabled_precisions. 
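Reviewer note: the pass can also be exercised directly on a traced GraphModule for inspection, outside of the test harness. A minimal sketch assuming a working torch_tensorrt installation, with `TinyMM` as a made-up module:

```python
import torch
from torch_tensorrt.dynamo.lowering.passes.accumulate_fp32_matmul import (
    accumulate_fp32_matmul,
)


class TinyMM(torch.nn.Module):
    def forward(self, a, b):
        return torch.ops.aten.mm.default(a, b)


gm = torch.fx.symbolic_trace(TinyMM())
gm = accumulate_fp32_matmul(gm)

# The printed graph should show _to_copy(dtype=float32) on both mm inputs and a
# _to_copy(dtype=float16) cast on the mm output.
print(gm.graph)
```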
Returns: If testing_partitioning: List[torch.fx.GraphModule], Set, Set: List of partitioned graph outputs, unexpected ops seen, expected ops unseen @@ -179,6 +182,7 @@ def lower_graph_testing( min_block_size=min_block_size, torch_executed_ops=torch_executed_ops, use_fast_partitioner=use_fast_partitioner, + use_fp32_acc=use_fp32_acc, ) # Invoke compilation From 619a39a175cff815eec0110dbd858d07737aea00 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 1 Oct 2024 14:17:56 -0700 Subject: [PATCH 47/55] chore: updates --- examples/dynamo/README.rst | 2 +- examples/dynamo/torch_export_gpt2.py | 8 ++++++-- examples/dynamo/torch_export_llama2.py | 3 ++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/examples/dynamo/README.rst b/examples/dynamo/README.rst index 6be2aa6515..83655628bc 100644 --- a/examples/dynamo/README.rst +++ b/examples/dynamo/README.rst @@ -9,7 +9,7 @@ examples of Torch-TensorRT compilation of select computer vision and language mo Dependencies ------------------------------------ -Please install the following external depencies (assuming you already have `torch_tensorrt` installed) +Please install the following external dependencies (assuming you already have correct `torch`, `torch_tensorrt` and `tensorrt` libraries installed (`dependencies `_)) .. code-block:: python diff --git a/examples/dynamo/torch_export_gpt2.py b/examples/dynamo/torch_export_gpt2.py index 8137575375..db4bd8fc68 100644 --- a/examples/dynamo/torch_export_gpt2.py +++ b/examples/dynamo/torch_export_gpt2.py @@ -87,6 +87,10 @@ # %% # The output sentences should look like # ============================= -# Pytorch model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my +# Pytorch model generated text: What is parallel programming ? + +# The parallel programming paradigm is a set of programming languages that are designed to be used in parallel. The main difference between parallel programming and parallel programming is that # ============================= -# TensorRT model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my +# TensorRT model generated text: What is parallel programming ? + +# The parallel programming paradigm is a set of programming languages that are designed to be used in parallel. 
The main difference between parallel programming and parallel programming is that diff --git a/examples/dynamo/torch_export_llama2.py b/examples/dynamo/torch_export_llama2.py index 195944688b..e1e3c9d042 100644 --- a/examples/dynamo/torch_export_llama2.py +++ b/examples/dynamo/torch_export_llama2.py @@ -50,10 +50,11 @@ llama2_ep, inputs=[input_ids], enabled_precisions={torch.float32}, - min_block_size=1, truncate_double=True, device=DEVICE, disable_tf32=True, + use_strong_types=True, + use_fp32_acc=True, ) # Auto-regressive generation loop for greedy decoding using TensorRT model From 8c0b9c618bbd2e6b6e37629ca8ea0104fba58b7d Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 7 Oct 2024 11:12:52 -0700 Subject: [PATCH 48/55] chore: trunc_fiv fix --- .../dynamo/conversion/impl/elementwise/ops.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/elementwise/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/elementwise/ops.py index 3f8d9667b3..6fc77ead6d 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/elementwise/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/elementwise/ops.py @@ -18,14 +18,11 @@ from torch_tensorrt.dynamo.conversion.impl.elementwise.base import ( convert_binary_elementwise, ) -from torch_tensorrt.dynamo.conversion.impl.shape import get_shape_with_dynamic_shape from torch_tensorrt.dynamo.conversion.impl.unary import atan, sign from torch_tensorrt.dynamo.conversion.impl.unary.base import convert_unary from torch_tensorrt.fx.converters.converter_utils import broadcast from torch_tensorrt.fx.types import TRTTensor -import tensorrt as trt - def trunc_div( ctx: ConversionContext, @@ -69,10 +66,12 @@ def trunc_div( prod_output, ) + # TODO: This casting causes output divergence for llama2 in FP16. + # @apbose to investigate why this is needed and suggest alternatives. # cast the sign_output back to int32 for trunc div # This is required for scatter_reduce_.two(reduce='mean' where trunc_div casts it to float32 and TRTInterpreter expects int32) - if (isinstance(sign_output, TRTTensor)) and (sign_output.dtype == trt.float32): - sign_output = cast_trt_tensor(ctx, sign_output, trt.int32, name) + # if (isinstance(sign_output, TRTTensor)) and (sign_output.dtype == trt.float32): + # sign_output = cast_trt_tensor(ctx, sign_output, trt.int32, name) # Convert constant input into ITensor for UnaryOperation if not isinstance(input, trt.tensorrt.ITensor): From b6261f90508c0b66b6a7a938487b71966923395d Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 7 Oct 2024 11:19:38 -0700 Subject: [PATCH 49/55] chore: update result --- examples/dynamo/torch_export_llama2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/dynamo/torch_export_llama2.py b/examples/dynamo/torch_export_llama2.py index e1e3c9d042..4e1f86eb76 100644 --- a/examples/dynamo/torch_export_llama2.py +++ b/examples/dynamo/torch_export_llama2.py @@ -86,6 +86,6 @@ # %% # The output sentences should look like # ============================= -# Pytorch model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. 
I'm not sure if I'll ever be able to walk with my +# Pytorch model generated text: Dynamic programming is an algorithmic technique used to solve complex problems by breaking them down into smaller subproblems, solving each subproblem only once, and # ============================= -# TensorRT model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my +# TensorRT model generated text: Dynamic programming is an algorithmic technique used to solve complex problems by breaking them down into smaller subproblems, solving each subproblem only once, and From ebdfe8f6a3f822e23debf0eaa7a56fbec58f94e0 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 7 Oct 2024 13:51:08 -0700 Subject: [PATCH 50/55] fix: add model.half() for llama2 --- examples/dynamo/torch_export_llama2.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/dynamo/torch_export_llama2.py b/examples/dynamo/torch_export_llama2.py index 4e1f86eb76..68e3813b30 100644 --- a/examples/dynamo/torch_export_llama2.py +++ b/examples/dynamo/torch_export_llama2.py @@ -24,9 +24,13 @@ # CPU is used here so that GPU memory is reserved for TRT compilation. llama_path = "meta-llama/Llama-2-7b-chat-hf" with torch.no_grad(): - model = AutoModelForCausalLM.from_pretrained( - llama_path, use_cache=False, attn_implementation="eager" - ).eval() + model = ( + AutoModelForCausalLM.from_pretrained( + llama_path, use_cache=False, attn_implementation="eager" + ) + .eval() + .half() + ) tokenizer = AutoTokenizer.from_pretrained(llama_path) From 61ec94869293e0e54cb547df0b04e6b2362d055d Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 8 Oct 2024 09:54:50 -0700 Subject: [PATCH 51/55] chore: address review comments --- examples/dynamo/torch_export_gpt2.py | 2 +- examples/dynamo/torch_export_llama2.py | 2 +- py/torch_tensorrt/dynamo/_compiler.py | 16 ++++++++-------- py/torch_tensorrt/dynamo/_defaults.py | 2 +- py/torch_tensorrt/dynamo/_settings.py | 6 +++--- .../dynamo/conversion/_TRTInterpreter.py | 2 +- .../dynamo/conversion/impl/activation/base.py | 6 +++++- .../dynamo/conversion/impl/elementwise/ops.py | 7 ------- 8 files changed, 20 insertions(+), 23 deletions(-) diff --git a/examples/dynamo/torch_export_gpt2.py b/examples/dynamo/torch_export_gpt2.py index db4bd8fc68..20310f9fcf 100644 --- a/examples/dynamo/torch_export_gpt2.py +++ b/examples/dynamo/torch_export_gpt2.py @@ -60,7 +60,7 @@ truncate_double=True, device=DEVICE, disable_tf32=True, - use_strong_types=True, + use_strong_typing=True, use_fp32_acc=True, ) diff --git a/examples/dynamo/torch_export_llama2.py b/examples/dynamo/torch_export_llama2.py index 68e3813b30..b93522092f 100644 --- a/examples/dynamo/torch_export_llama2.py +++ b/examples/dynamo/torch_export_llama2.py @@ -57,7 +57,7 @@ truncate_double=True, device=DEVICE, disable_tf32=True, - use_strong_types=True, + use_strong_typing=True, use_fp32_acc=True, ) diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 8062f01e84..4045c38f50 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -88,7 +88,7 @@ def compile( engine_cache_dir: str = _defaults.ENGINE_CACHE_DIR, engine_cache_size: int = _defaults.ENGINE_CACHE_SIZE, custom_engine_cache: Optional[BaseEngineCache] = _defaults.CUSTOM_ENGINE_CACHE, - use_strong_types: bool = _defaults.USE_STRONG_TYPES, + use_strong_typing: bool = _defaults.USE_STRONG_TYPING, 
use_fp32_acc: bool = _defaults.USE_FP32_ACC, **kwargs: Any, ) -> torch.fx.GraphModule: @@ -160,7 +160,7 @@ def compile( engine_cache_dir (Optional[str]): Directory to store the cached TRT engines engine_cache_size (Optional[int]): Maximum hard-disk space (bytes) to use for the engine cache, default is 1GB. If the cache exceeds this size, the oldest engines will be removed by default custom_engine_cache (Optional[BaseEngineCache]): Engine cache instance to use for saving and loading engines. Users can provide their own engine cache by inheriting from BaseEngineCache. If used, engine_cache_dir and engine_cache_size will be ignored. - use_strong_types (bool): Enable strong typing in TensorRT compilation + use_strong_typing (bool): Enable strong typing in TensorRT compilation use_fp32_acc (bool): This option inserts cast to FP32 nodes around matmul layers and TensorRT ensures the accumulation of matmul happens in FP32. Use this only when FP16 precision is configured in enabled_precisions. **kwargs: Any, Returns: @@ -201,12 +201,12 @@ def compile( "\nThis feature is unimplemented in Torch-TRT Dynamo currently." ) - if use_strong_types: + if use_strong_typing: if len(enabled_precisions) != 1 or not any( x in enabled_precisions for x in {torch.float32, dtype.f32} ): raise AssertionError( - f"When use_strong_types is enabled, only torch.float32 is allowed in the enabled_precisions but found {enabled_precisions}" + f"When use_strong_typing is enabled, only torch.float32 is allowed in the enabled_precisions but found {enabled_precisions}" ) if use_fp32_acc: @@ -299,7 +299,7 @@ def compile( "lazy_engine_init": lazy_engine_init, "cache_built_engines": cache_built_engines, "reuse_cached_engines": reuse_cached_engines, - "use_strong_types": use_strong_types, + "use_strong_typing": use_strong_typing, "use_fp32_acc": use_fp32_acc, } @@ -540,7 +540,7 @@ def convert_exported_program_to_serialized_trt_engine( calibrator: object = None, allow_shape_tensors: bool = False, timing_cache_path: str = _defaults.TIMING_CACHE_PATH, - use_strong_types: bool = _defaults.USE_STRONG_TYPES, + use_strong_typing: bool = _defaults.USE_STRONG_TYPING, use_fp32_acc: bool = _defaults.USE_FP32_ACC, **kwargs: Any, ) -> bytes: @@ -600,7 +600,7 @@ def convert_exported_program_to_serialized_trt_engine( calibrator (Union(torch_tensorrt._C.IInt8Calibrator, tensorrt.IInt8Calibrator)): Calibrator object which will provide data to the PTQ system for INT8 Calibration allow_shape_tensors: (Experimental) Allow aten::size to output shape tensors using IShapeLayer in TensorRT timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation - use_strong_types (bool): Enable strong typing in TensorRT compilation + use_strong_typing (bool): Enable strong typing in TensorRT compilation use_fp32_acc (bool): This option inserts cast to FP32 nodes around matmul layers and TensorRT ensures the accumulation of matmul happens in FP32. Use this only when FP16 precision is configured in enabled_precisions. 
Returns: bytes: Serialized TensorRT engine, can either be saved to a file or deserialized via TensorRT APIs @@ -675,7 +675,7 @@ def convert_exported_program_to_serialized_trt_engine( "dla_local_dram_size": dla_local_dram_size, "dla_global_dram_size": dla_global_dram_size, "timing_cache_path": timing_cache_path, - "use_strong_types": use_strong_types, + "use_strong_typing": use_strong_typing, "use_fp32_acc": use_fp32_acc, } diff --git a/py/torch_tensorrt/dynamo/_defaults.py b/py/torch_tensorrt/dynamo/_defaults.py index 788829e757..77a04e3e03 100644 --- a/py/torch_tensorrt/dynamo/_defaults.py +++ b/py/torch_tensorrt/dynamo/_defaults.py @@ -40,7 +40,7 @@ ENGINE_CACHE_DIR = os.path.join(tempfile.gettempdir(), "torch_tensorrt_engine_cache") ENGINE_CACHE_SIZE = 1073741824 CUSTOM_ENGINE_CACHE = None -USE_STRONG_TYPES = False +USE_STRONG_TYPING = False USE_FP32_ACC = False diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py index b042c65240..d5dccdafba 100644 --- a/py/torch_tensorrt/dynamo/_settings.py +++ b/py/torch_tensorrt/dynamo/_settings.py @@ -32,7 +32,7 @@ USE_FAST_PARTITIONER, USE_FP32_ACC, USE_PYTHON_RUNTIME, - USE_STRONG_TYPES, + USE_STRONG_TYPING, VERSION_COMPATIBLE, WORKSPACE_SIZE, default_device, @@ -80,7 +80,7 @@ class CompilationSettings: timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation cache_built_engines (bool): Whether to save the compiled TRT engines to storage reuse_cached_engines (bool): Whether to load the compiled TRT engines from storage - use_strong_types (bool): Enable strong typing in TensorRT compilation + use_strong_typing (bool): Enable strong typing in TensorRT compilation use_fp32_acc (bool): This option inserts cast to FP32 nodes around matmul layers and TensorRT ensures the accumulation of matmul happens in FP32. Use this only when FP16 precision is configured in enabled_precisions. 
""" @@ -116,7 +116,7 @@ class CompilationSettings: lazy_engine_init: bool = LAZY_ENGINE_INIT cache_built_engines: bool = CACHE_BUILT_ENGINES reuse_cached_engines: bool = REUSE_CACHED_ENGINES - use_strong_types: bool = USE_STRONG_TYPES + use_strong_typing: bool = USE_STRONG_TYPING use_fp32_acc: bool = USE_FP32_ACC diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index 231ca51306..9f775ef701 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -80,7 +80,7 @@ def __init__( self.builder = trt.Builder(self.logger) flag = 0 - if compilation_settings.use_strong_types: + if compilation_settings.use_strong_typing: STRONGLY_TYPED = 1 << (int)( trt.NetworkDefinitionCreationFlag.STRONGLY_TYPED ) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/activation/base.py b/py/torch_tensorrt/dynamo/conversion/impl/activation/base.py index f726a1c500..db257b9c4e 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/activation/base.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/activation/base.py @@ -37,7 +37,11 @@ def convert_activation( layer.beta = beta set_layer_name(layer, target, name, source_ir) - if input_val.dynamic_range is not None and dyn_range_fn is not None: + if ( + not ctx.net.get_flag(trt.NetworkDefinitionCreationFlag.STRONGLY_TYPED) + and input_val.dynamic_range is not None + and dyn_range_fn is not None + ): dyn_range = dyn_range_fn(input_val.dynamic_range) mark_as_int8_layer(layer, dyn_range) return layer.get_output(0) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/elementwise/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/elementwise/ops.py index 6fc77ead6d..348c71fd87 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/elementwise/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/elementwise/ops.py @@ -66,13 +66,6 @@ def trunc_div( prod_output, ) - # TODO: This casting causes output divergence for llama2 in FP16. - # @apbose to investigate why this is needed and suggest alternatives. - # cast the sign_output back to int32 for trunc div - # This is required for scatter_reduce_.two(reduce='mean' where trunc_div casts it to float32 and TRTInterpreter expects int32) - # if (isinstance(sign_output, TRTTensor)) and (sign_output.dtype == trt.float32): - # sign_output = cast_trt_tensor(ctx, sign_output, trt.int32, name) - # Convert constant input into ITensor for UnaryOperation if not isinstance(input, trt.tensorrt.ITensor): input = get_trt_tensor(ctx, input, f"{name}_input") From dd27a547fae41d7d3c7b8ca5659f768f08d77308 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 8 Oct 2024 12:40:27 -0700 Subject: [PATCH 52/55] chore: address review comments --- docsrc/index.rst | 2 ++ examples/dynamo/torch_export_gpt2.py | 6 +++++- examples/dynamo/torch_export_llama2.py | 6 +++++- py/torch_tensorrt/dynamo/_compiler.py | 16 ++++++++-------- py/torch_tensorrt/dynamo/_defaults.py | 2 +- py/torch_tensorrt/dynamo/_settings.py | 6 +++--- .../dynamo/conversion/_TRTInterpreter.py | 2 +- 7 files changed, 25 insertions(+), 15 deletions(-) diff --git a/docsrc/index.rst b/docsrc/index.rst index 0bef2f0664..82600dce98 100644 --- a/docsrc/index.rst +++ b/docsrc/index.rst @@ -37,6 +37,7 @@ User Guide * :ref:`saving_models` * :ref:`runtime` * :ref:`using_dla` +* :ref:`mixed_precision` .. 
toctree:: :caption: User Guide @@ -48,6 +49,7 @@ User Guide user_guide/saving_models user_guide/runtime user_guide/using_dla + user_guide/mixed_precision tutorials/_rendered_examples/dynamo/torch_compile_advanced_usage tutorials/_rendered_examples/dynamo/vgg16_ptq tutorials/_rendered_examples/dynamo/engine_caching_example diff --git a/examples/dynamo/torch_export_gpt2.py b/examples/dynamo/torch_export_gpt2.py index 20310f9fcf..f9229e420c 100644 --- a/examples/dynamo/torch_export_gpt2.py +++ b/examples/dynamo/torch_export_gpt2.py @@ -52,6 +52,10 @@ # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # Export the GPT2 model into an ExportedProgram which is input of TRT compilation +# To compile the model in FP16, we do the following +# 1) Cast the model to FP16 via model.half() +# 2) Enable use_explicit_typing=True. Certain layers are explicitly casted to FP32 within the pytorch model and this flag respects this behavior during TRT compilation +# 3) Enable use_fp32_acc=True. This ensures all the matmuls are accumulated in FP32 precision (similar to PyTorch) gpt2_ep = export_llm(model, input_ids, max_seq_len=1024) trt_model = torch_tensorrt.dynamo.compile( gpt2_ep, @@ -60,7 +64,7 @@ truncate_double=True, device=DEVICE, disable_tf32=True, - use_strong_typing=True, + use_explicit_typing=True, use_fp32_acc=True, ) diff --git a/examples/dynamo/torch_export_llama2.py b/examples/dynamo/torch_export_llama2.py index b93522092f..11a0c93276 100644 --- a/examples/dynamo/torch_export_llama2.py +++ b/examples/dynamo/torch_export_llama2.py @@ -49,6 +49,10 @@ # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # Export the llama2 model into an ExportedProgram which is input of TRT compilation +# To compile the model in FP16, we do the following +# 1) Cast the model to FP16 via model.half() +# 2) Enable use_explicit_typing=True. Certain layers are explicitly casted to FP32 within the pytorch model and this flag respects this behavior during TRT compilation +# 3) Enable use_fp32_acc=True. This ensures all the matmuls are accumulated in FP32 precision (similar to PyTorch) llama2_ep = export_llm(model, input_ids, max_seq_len=64) trt_model = torch_tensorrt.dynamo.compile( llama2_ep, @@ -57,7 +61,7 @@ truncate_double=True, device=DEVICE, disable_tf32=True, - use_strong_typing=True, + use_explicit_typing=True, use_fp32_acc=True, ) diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 4045c38f50..fc7d1a0bc8 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -88,7 +88,7 @@ def compile( engine_cache_dir: str = _defaults.ENGINE_CACHE_DIR, engine_cache_size: int = _defaults.ENGINE_CACHE_SIZE, custom_engine_cache: Optional[BaseEngineCache] = _defaults.CUSTOM_ENGINE_CACHE, - use_strong_typing: bool = _defaults.USE_STRONG_TYPING, + use_explicit_typing: bool = _defaults.USE_EXPLICIT_TYPING, use_fp32_acc: bool = _defaults.USE_FP32_ACC, **kwargs: Any, ) -> torch.fx.GraphModule: @@ -160,7 +160,7 @@ def compile( engine_cache_dir (Optional[str]): Directory to store the cached TRT engines engine_cache_size (Optional[int]): Maximum hard-disk space (bytes) to use for the engine cache, default is 1GB. If the cache exceeds this size, the oldest engines will be removed by default custom_engine_cache (Optional[BaseEngineCache]): Engine cache instance to use for saving and loading engines. Users can provide their own engine cache by inheriting from BaseEngineCache. If used, engine_cache_dir and engine_cache_size will be ignored. 
- use_strong_typing (bool): Enable strong typing in TensorRT compilation + use_explicit_typing (bool): This flag enables strong typing in TensorRT compilation which respects the precisions set in the Pytorch model. This is useful when users have mixed precision graphs. use_fp32_acc (bool): This option inserts cast to FP32 nodes around matmul layers and TensorRT ensures the accumulation of matmul happens in FP32. Use this only when FP16 precision is configured in enabled_precisions. **kwargs: Any, Returns: @@ -201,12 +201,12 @@ def compile( "\nThis feature is unimplemented in Torch-TRT Dynamo currently." ) - if use_strong_typing: + if use_explicit_typing: if len(enabled_precisions) != 1 or not any( x in enabled_precisions for x in {torch.float32, dtype.f32} ): raise AssertionError( - f"When use_strong_typing is enabled, only torch.float32 is allowed in the enabled_precisions but found {enabled_precisions}" + f"When use_explicit_typing is enabled, only torch.float32 is allowed in the enabled_precisions but found {enabled_precisions}" ) if use_fp32_acc: @@ -299,7 +299,7 @@ def compile( "lazy_engine_init": lazy_engine_init, "cache_built_engines": cache_built_engines, "reuse_cached_engines": reuse_cached_engines, - "use_strong_typing": use_strong_typing, + "use_explicit_typing": use_explicit_typing, "use_fp32_acc": use_fp32_acc, } @@ -540,7 +540,7 @@ def convert_exported_program_to_serialized_trt_engine( calibrator: object = None, allow_shape_tensors: bool = False, timing_cache_path: str = _defaults.TIMING_CACHE_PATH, - use_strong_typing: bool = _defaults.USE_STRONG_TYPING, + use_explicit_typing: bool = _defaults.USE_EXPLICIT_TYPING, use_fp32_acc: bool = _defaults.USE_FP32_ACC, **kwargs: Any, ) -> bytes: @@ -600,7 +600,7 @@ def convert_exported_program_to_serialized_trt_engine( calibrator (Union(torch_tensorrt._C.IInt8Calibrator, tensorrt.IInt8Calibrator)): Calibrator object which will provide data to the PTQ system for INT8 Calibration allow_shape_tensors: (Experimental) Allow aten::size to output shape tensors using IShapeLayer in TensorRT timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation - use_strong_typing (bool): Enable strong typing in TensorRT compilation + use_explicit_typing (bool): This flag enables strong typing in TensorRT compilation which respects the precisions set in the Pytorch model. This is useful when users have mixed precision graphs. use_fp32_acc (bool): This option inserts cast to FP32 nodes around matmul layers and TensorRT ensures the accumulation of matmul happens in FP32. Use this only when FP16 precision is configured in enabled_precisions. 
Returns: bytes: Serialized TensorRT engine, can either be saved to a file or deserialized via TensorRT APIs @@ -675,7 +675,7 @@ def convert_exported_program_to_serialized_trt_engine( "dla_local_dram_size": dla_local_dram_size, "dla_global_dram_size": dla_global_dram_size, "timing_cache_path": timing_cache_path, - "use_strong_typing": use_strong_typing, + "use_explicit_typing": use_explicit_typing, "use_fp32_acc": use_fp32_acc, } diff --git a/py/torch_tensorrt/dynamo/_defaults.py b/py/torch_tensorrt/dynamo/_defaults.py index 77a04e3e03..de99df71e0 100644 --- a/py/torch_tensorrt/dynamo/_defaults.py +++ b/py/torch_tensorrt/dynamo/_defaults.py @@ -40,7 +40,7 @@ ENGINE_CACHE_DIR = os.path.join(tempfile.gettempdir(), "torch_tensorrt_engine_cache") ENGINE_CACHE_SIZE = 1073741824 CUSTOM_ENGINE_CACHE = None -USE_STRONG_TYPING = False +USE_EXPLICIT_TYPING = False USE_FP32_ACC = False diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py index d5dccdafba..98865c683e 100644 --- a/py/torch_tensorrt/dynamo/_settings.py +++ b/py/torch_tensorrt/dynamo/_settings.py @@ -29,10 +29,10 @@ SPARSE_WEIGHTS, TIMING_CACHE_PATH, TRUNCATE_DOUBLE, + USE_EXPLICIT_TYPING, USE_FAST_PARTITIONER, USE_FP32_ACC, USE_PYTHON_RUNTIME, - USE_STRONG_TYPING, VERSION_COMPATIBLE, WORKSPACE_SIZE, default_device, @@ -80,7 +80,7 @@ class CompilationSettings: timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation cache_built_engines (bool): Whether to save the compiled TRT engines to storage reuse_cached_engines (bool): Whether to load the compiled TRT engines from storage - use_strong_typing (bool): Enable strong typing in TensorRT compilation + use_explicit_typing (bool): This flag enables strong typing in TensorRT compilation which respects the precisions set in the Pytorch model. This is useful when users have mixed precision graphs. use_fp32_acc (bool): This option inserts cast to FP32 nodes around matmul layers and TensorRT ensures the accumulation of matmul happens in FP32. Use this only when FP16 precision is configured in enabled_precisions. """ @@ -116,7 +116,7 @@ class CompilationSettings: lazy_engine_init: bool = LAZY_ENGINE_INIT cache_built_engines: bool = CACHE_BUILT_ENGINES reuse_cached_engines: bool = REUSE_CACHED_ENGINES - use_strong_typing: bool = USE_STRONG_TYPING + use_explicit_typing: bool = USE_EXPLICIT_TYPING use_fp32_acc: bool = USE_FP32_ACC diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index 9f775ef701..19d80e70b1 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -80,7 +80,7 @@ def __init__( self.builder = trt.Builder(self.logger) flag = 0 - if compilation_settings.use_strong_typing: + if compilation_settings.use_explicit_typing: STRONGLY_TYPED = 1 << (int)( trt.NetworkDefinitionCreationFlag.STRONGLY_TYPED ) From b2e5244e7c271472ae6c6abf9460c2a30d42343c Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 8 Oct 2024 12:41:35 -0700 Subject: [PATCH 53/55] chore: add docs --- docsrc/user_guide/mixed_precision.rst | 74 +++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 docsrc/user_guide/mixed_precision.rst diff --git a/docsrc/user_guide/mixed_precision.rst b/docsrc/user_guide/mixed_precision.rst new file mode 100644 index 0000000000..eca8be9f0b --- /dev/null +++ b/docsrc/user_guide/mixed_precision.rst @@ -0,0 +1,74 @@ +..
_mixed_precision: + +Compile Mixed Precision models with Torch-TensorRT +==================================== +.. currentmodule:: torch_tensorrt.dynamo + +.. automodule:: torch_tensorrt.dynamo + :members: + :undoc-members: + :show-inheritance: + +Consider the following Pytorch model which explicitly casts intermediate layer to run in FP16. + +.. code-block:: python + + class MyModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear1 = torch.nn.Linear(10,10) + self.linear2 = torch.nn.Linear(10,30).half() + self.linear3 = torch.nn.Linear(30,40) + + def forward(self, x): + x = self.linear1(x) + x = x.to(torch.float16) + x = self.linear2(x) + x = x.to(torch.float32) + x = self.linear3(x) + return x + + +If we compile the above model using Torch-TensorRT, layer profiling logs indicate that all the layers are +run in FP32 which is not the desired outcome. + +.. code-block:: python + + inputs = [torch.randn((1, 10), dtype=torch.float32).cuda()] + mod = MyModule().eval().cuda() + ep = torch.export.export(mod, tuple(inputs)) + with torch_tensorrt.logging.debug(): + trt_gm = torch_tensorrt.dynamo.compile(ep, + inputs=inputs, + debug=True) + + # Debug log info + # Layers: + # Name: __myl_MulSum_myl0_0, LayerType: kgen, Inputs: [ { Name: __mye116_dconst, Dimensions: [10,10], Format/Datatype: Float }, { Name: x, Dimensions: [10,1], Format/Datatype: Float }], Outputs: [ { Name: __myln_k_arg__bb1_2, Dimensions: [1,10], Format/Datatype: Float }], TacticName: __myl_MulSum_0xfa6c1858aea1b13b03f90165d7149ec6, StreamId: 0, Metadata: + # Name: __myl_AddResMulSum_myl0_1, LayerType: kgen, Inputs: [ { Name: __mye131_dconst, Dimensions: [10,30], Format/Datatype: Float }, { Name: __myln_k_arg__bb1_2, Dimensions: [1,10], Format/Datatype: Float }, { Name: linear1/addmm_constant_0 _ linear1/addmm_add_broadcast_to_same_shape_lhs_broadcast_constantFloat, Dimensions: [1,10], Format/Datatype: Float }], Outputs: [ { Name: __myln_k_arg__bb1_3, Dimensions: [1,30], Format/Datatype: Float }], TacticName: __myl_AddResMulSum_0xb3915d7ebfe48be45b6d49083479e12f, StreamId: 0, Metadata: + # Name: __myl_AddResMulSumAdd_myl0_2, LayerType: kgen, Inputs: [ { Name: __mye146_dconst, Dimensions: [30,40], Format/Datatype: Float }, { Name: linear3/addmm_2_constant_0 _ linear3/addmm_2_add_broadcast_to_same_shape_lhs_broadcast_constantFloat, Dimensions: [1,40], Format/Datatype: Float }, { Name: __myln_k_arg__bb1_3, Dimensions: [1,30], Format/Datatype: Float }, { Name: linear2/addmm_1_constant_0 _ linear2/addmm_1_add_broadcast_to_same_shape_lhs_broadcast_constantFloat, Dimensions: [1,30], Format/Datatype: Float }], Outputs: [ { Name: output0, Dimensions: [1,40], Format/Datatype: Float }], TacticName: __myl_AddResMulSumAdd_0xcdd0085ad25f5f45ac5fafb72acbffd6, StreamId: 0, Metadata: + + +In order to respect the types specified by the user in the model (eg: in this case, ``linear2`` layer to run in FP16), users can enable +the compilation setting ``use_explicit_typing=True``. Compiling with this option results in the following TensorRT logs + +.. note:: If you enable ``use_explicit_typing=True``, only torch.float32 is supported in the enabled_precisions. + +.. 
code-block:: python + + inputs = [torch.randn((1, 10), dtype=torch.float32).cuda()] + mod = MyModule().eval().cuda() + ep = torch.export.export(mod, tuple(inputs)) + with torch_tensorrt.logging.debug(): + trt_gm = torch_tensorrt.dynamo.compile(ep, + inputs=inputs, + use_explicit_typing=True, + debug=True) + + # Debug log info + # Layers: + # Name: __myl_MulSumAddCas_myl0_0, LayerType: kgen, Inputs: [ { Name: linear1/addmm_constant_0 _ linear1/addmm_add_broadcast_to_same_shape_lhs_broadcast_constantFloat, Dimensions: [1,10], Format/Datatype: Float }, { Name: __mye112_dconst, Dimensions: [10,10], Format/Datatype: Float }, { Name: x, Dimensions: [10,1], Format/Datatype: Float }], Outputs: [ { Name: __myln_k_arg__bb1_2, Dimensions: [1,10], Format/Datatype: Half }], TacticName: __myl_MulSumAddCas_0xacf8f5dd9be2f3e7bb09cdddeac6c936, StreamId: 0, Metadata: + # Name: __myl_ResMulSumAddCas_myl0_1, LayerType: kgen, Inputs: [ { Name: __mye127_dconst, Dimensions: [10,30], Format/Datatype: Half }, { Name: linear2/addmm_1_constant_0 _ linear2/addmm_1_add_broadcast_to_same_shape_lhs_broadcast_constantHalf, Dimensions: [1,30], Format/Datatype: Half }, { Name: __myln_k_arg__bb1_2, Dimensions: [1,10], Format/Datatype: Half }], Outputs: [ { Name: __myln_k_arg__bb1_3, Dimensions: [1,30], Format/Datatype: Float }], TacticName: __myl_ResMulSumAddCas_0x5a3b318b5a1c97b7d5110c0291481337, StreamId: 0, Metadata: + # Name: __myl_ResMulSumAdd_myl0_2, LayerType: kgen, Inputs: [ { Name: __mye142_dconst, Dimensions: [30,40], Format/Datatype: Float }, { Name: linear3/addmm_2_constant_0 _ linear3/addmm_2_add_broadcast_to_same_shape_lhs_broadcast_constantFloat, Dimensions: [1,40], Format/Datatype: Float }, { Name: __myln_k_arg__bb1_3, Dimensions: [1,30], Format/Datatype: Float }], Outputs: [ { Name: output0, Dimensions: [1,40], Format/Datatype: Float }], TacticName: __myl_ResMulSumAdd_0x3fad91127c640fd6db771aa9cde67db0, StreamId: 0, Metadata: + +Now the ``linear2`` layer runs in FP16 as shown in the above logs. \ No newline at end of file From 7ddd6379a4f4ed10632709f7a0f600a85e8e2219 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 8 Oct 2024 12:53:18 -0700 Subject: [PATCH 54/55] chore: updates --- docsrc/user_guide/mixed_precision.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docsrc/user_guide/mixed_precision.rst b/docsrc/user_guide/mixed_precision.rst index eca8be9f0b..dca0b033e6 100644 --- a/docsrc/user_guide/mixed_precision.rst +++ b/docsrc/user_guide/mixed_precision.rst @@ -30,7 +30,7 @@ Consider the following Pytorch model which explicitly casts intermediate layer t If we compile the above model using Torch-TensorRT, layer profiling logs indicate that all the layers are -run in FP32 which is not the desired outcome. +run in FP32. This is because TensorRT picks the kernels for layers which result in the best performance. ..
code-block:: python From 4529717dd565a7e70c30981771d516e607272f40 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 10 Oct 2024 11:54:19 -0700 Subject: [PATCH 55/55] chore: sign bug fix --- py/torch_tensorrt/dynamo/conversion/impl/unary/ops.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/unary/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/unary/ops.py index a4f0c2bc6c..c900c51b8f 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/unary/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/unary/ops.py @@ -478,10 +478,6 @@ def sign( name: str, input_val: TRTTensor, ) -> TRTTensor: - if (isinstance(input_val, TRTTensor)) and ( - input_val.dtype == trt.int8 or input_val.dtype == trt.int32 - ): - input_val = cast_trt_tensor(ctx, input_val, trt.float32, name) return convert_unary( ctx, target, source_ir, name, trt.UnaryOperation.SIGN, input_val
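
For reference, a minimal sketch tying this series together is shown below. It extends the ``MyModule`` example from ``docsrc/user_guide/mixed_precision.rst`` by adding the ``enabled_precisions`` constraint enforced in ``_compiler.py`` and the ``use_fp32_acc`` option. The module name and argument values are illustrative only and assume the flags behave as described in the docstrings above:

    import torch
    import torch_tensorrt

    # Hypothetical toy module: linear2 runs in FP16, the rest stays in FP32.
    class MixedPrecisionModule(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear1 = torch.nn.Linear(10, 10)
            self.linear2 = torch.nn.Linear(10, 30).half()
            self.linear3 = torch.nn.Linear(30, 40)

        def forward(self, x):
            x = self.linear1(x)
            x = self.linear2(x.to(torch.float16))
            return self.linear3(x.to(torch.float32))

    inputs = [torch.randn((1, 10), dtype=torch.float32).cuda()]
    model = MixedPrecisionModule().eval().cuda()
    ep = torch.export.export(model, tuple(inputs))

    trt_model = torch_tensorrt.dynamo.compile(
        ep,
        inputs=inputs,
        # Only torch.float32 is allowed in enabled_precisions when explicit typing is on;
        # the FP16 regions come from the casts authored inside the model itself.
        enabled_precisions={torch.float32},
        use_explicit_typing=True,  # build a strongly typed network that respects the model's casts
        use_fp32_acc=True,  # accumulate matmuls in FP32, matching PyTorch numerics
    )

    out = trt_model(*inputs)  # expected shape: (1, 40), matching the Outputs entry in the logs above

Compared with letting TensorRT choose kernel precisions on its own, explicit typing moves the precision decision into the PyTorch model, which is why the GPT-2 and Llama-2 examples in this series cast the model with ``model.half()`` before export.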