organized directories, cleaned up conversion tools, need testing
Ishaan-Datta committed Sep 5, 2024
1 parent 47e47fc commit 005c532
Showing 116 changed files with 407 additions and 711 deletions.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
88 changes: 56 additions & 32 deletions Conversion Tools/Onnx_TensorRT.py
@@ -1,38 +1,11 @@
import sys
import onnx
import tensorrt as trt
import pycuda.driver as cuda

def convert_onnx_to_engine(onnx_filename, engine_filename = None, max_batch_size = 32, max_workspace_size = 1 << 30, fp16_mode = True):
logger = trt.Logger(trt.Logger.WARNING)
with trt.Builder(logger) as builder, builder.create_network() as network, trt.OnnxParser(network, logger) as parser:
builder.max_workspace_size = max_workspace_size
builder.fp16_mode = fp16_mode
builder.max_batch_size = max_batch_size

print("Parsing ONNX file.")
with open(onnx_filename, 'rb') as model:
if not parser.parse(model.read()):
for error in range(parser.num_errors):
print(parser.get_error(error))

print("Completed parsing of ONNX file.")

print("Building TensorRT engine. This may take a few minutes.")
engine = builder.build_cuda_engine(network)

if engine_filename:
with open(engine_filename, 'wb') as f:
f.write(engine.serialize())

print("Completed building Engine.")
return engine, logger

model_name = input("Enter the name of the ONNX model file (without .onnx extension): ")
engine_name = input("Enter the name of the TensorRT engine file (without .engine extension): ")
precision = input("Enter the precision (FP32/FP16): ") # INT8 later
batch_size = int(input("Enter the maximum batch size: "))
import pycuda.autoinit
import numpy as np

def get_max_memory():
cuda.init()
total, free = cuda.mem_get_info()
max_mem = free * 0.95

@@ -41,7 +14,58 @@ def get_max_memory():
print(f"Max memory to use: {max_mem / (1024**2)} MB")
return max_mem

convert_onnx_to_engine(f"{model_name}.onnx", f"{engine_name}.engine", batch_size, get_max_memory(), precision == "FP16")
# precision can be FP32, FP16, or INT8.
# The batch size is the maximum number of samples that can be processed in a single inference.
# get_max_memory() calculates the maximum memory the TensorRT engine may use.
# convert_onnx_to_engine() converts the ONNX model to a TensorRT engine, built with the
# specified precision and batch size, and saves it to a file.
def convert_onnx_to_trt(model_path="./model.onnx", output_path="model_trt.trt", FP16_mode = True, batch_size=1, input_shape=(1, 3, 224, 224)):
print("Loading the ONNX model")
onnx_model = onnx.load(model_path)

# # Simplify the ONNX model (optional)
# graph = gs.import_onnx(onnx_model)
# graph.toposort()
# onnx_model = gs.export_onnx(graph)
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(TRT_LOGGER)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, TRT_LOGGER)

with open(model_path, 'rb') as model_file:
print("Parsing ONNX model")
if not parser.parse(model_file.read()):
for error in range(parser.num_errors):
print(parser.get_error(error))
return

# config.max_workspace_size = 1 << 30 # Adjust as needed
# builder.max_workspace_size = get_max_memory()
# builder.fp16_mode = fp16_mode
# builder.max_batch_size = batch_size
    config = builder.create_builder_config()
    if FP16_mode:
        config.set_flag(trt.BuilderFlag.FP16)  # fp16_mode is not an attribute of the builder config
    # the explicit-batch network takes its batch size from the input shapes,
    # so max_batch_size is not set here
    config.max_workspace_size = int(get_max_memory())

print("Building TensorRT engine. This may take a few minutes.")
    engine = builder.build_engine(network, config)
    if engine is None:
        print("Failed to build the TensorRT engine")
        return

# Serialize the TensorRT engine to a file
with open(output_path, 'wb') as f:
f.write(engine.serialize())

print("Engine built successfully")
print(f"Converted TensorRT engine saved at {output_path}")
return engine, TRT_LOGGER


if __name__ == "__main__":
print("Usage: python3 Onnx_TensorRT.py <model_path> <output_path> FP16_mode batch_size input_shape")
print("Example: python3 Onnx_TensorRT.py ./model.onnx ./model_trt.trt True 1 (1, 3, 224, 224)")

    if len(sys.argv) < 2:
        convert_onnx_to_trt()
    else:
        # pass only the arguments that were supplied so the remaining
        # parameters keep their defaults instead of being filled with None
        convert_onnx_to_trt(*sys.argv[1:6])

# improvements
# Use Dynamic Shapes: If your model supports it, using dynamic input shapes can improve the inference speed. This allows TensorRT to optimize the execution plan based on the actual input shapes at runtime.
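# A rough sketch (not part of this commit) of how dynamic shapes could be wired into
# convert_onnx_to_trt() via an optimization profile; the tensor name "input" and the
# min/opt/max shapes below are assumptions, not values taken from the repository:
#
# profile = builder.create_optimization_profile()
# profile.set_shape("input", (1, 3, 224, 224), (8, 3, 224, 224), (32, 3, 224, 224))
# config.add_optimization_profile(profile)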
127 changes: 99 additions & 28 deletions Conversion Tools/PyTorch_Onnx.py
@@ -1,32 +1,103 @@
import sys

import torch
import torchvision
import torch.onnx

import onnx
import onnxruntime as ort  # used by loadOnnxModel() and predictOnnx() below

import numpy as np
import pycuda.driver as cuda

OPSET_VERS = 13

# network is the model
# network.eval()
# torch_out_load = network(example_data)
# mct.torch_to_onnx(network, example_data, "torch_model.onnx")
# ort_session = mct.loadOnnxModel("torch_model.onnx")
# ort_predictions = mct.predictOnnx(example_data.numpy(), session=ort_session)
# ort_predictions[0][0] = 3
# mct.checkPredictionConsistency(torch_out_load.detach().numpy(), ort_predictions)
# mct.checkConfidenceConsistency(torch_out_load.detach().numpy(), ort_predictions)

# given an array of test inputs and a path to onnx model or a session returns the predictions
def predictOnnx(x_test,session=None,dest_path=""):
if session is None and dest_path == "":
raise ValueError("No model or path provided, please specifiy one of them.")
if session is None:
session = loadOnnxModel(dest_path)

results_ort = session.run([out.name for out in session.get_outputs()], {session.get_inputs()[0].name: x_test})
return np.array(results_ort[0])

# given the predictions from the original model and the converted model, check if they are consistent
# shape of predictions_original and converted_results should be the same
# only checks for the predicted class (aka the argmax)
# takes in two 2D arrays: first dimension is the number of samples, second dimension is the number of classes and values correspond to confidence
def checkPredictionConsistency(predictions_original, converted_results):
for n in range(predictions_original.shape[0]):
if np.argmax(predictions_original[n]) != np.argmax(converted_results[n]):
print(f"Original: {np.argmax(predictions_original[n])}, ONNX: {np.argmax(converted_results[n])}")
print(f"{predictions_original[n]}, \n{converted_results[n]}")
print("=====================================")
raise ValueError("Predictions are not consistent")

print("All predictions are consistent")

# given the predictions from the original model and the converted model, check if they are consistent
# shape of predictions_original and converted_results should be the same
# only checks for the difference in confidence
# takes in two 2D arrays: first dimension is the number of samples, second dimension is the number of classes and values correspond to confidence
# tolerance: the maximum difference in confidence that is allowed
def checkConfidenceConsistency(predictions_original, converted_results, tolerance=1e-5):
np.testing.assert_allclose(predictions_original, converted_results,atol=tolerance)
# for n in range(predictions_original.shape[0]):
# if not np.allclose(predictions_original[n], converted_results[n], atol=tolerance):
# print(f"Original: \t {predictions_original[n]}, \nONNX: \t{converted_results[n]}")
# print("=====================================")
# return

print("All confidence percentages are consistent")

def convert_pytorch_to_onnx(model_path="./model.pt", output_path="./model.onnx", input_shape=(1,3,224,224), constant_folding=False):
print("Loading the PyTorch model")
    model = torch.load(model_path)
    model.eval()
    model.cuda()  # the dummy input below lives on the GPU, so the model must too
    # traced_model = torch.jit.trace(model, torch.randn(input_shape))

    input_data = torch.randn(input_shape).cuda()

print("Exporting model to ONNX format")
torch.onnx.export(model, input_data, output_path,
verbose=True,
opset_version=OPSET_VERS,
export_params=True,
do_constant_folding=constant_folding,
input_names = ['input'],
output_names = ['output'],
dynamic_axes={'input' : {0 : 'batch_size'},
'output' : {0 : 'batch_size'}}
)

model = onnx.load(output_path)
onnx.checker.check_model(model)
print("Model converted successfully")
print(model.graph)

print(f"Converted ONNX model saved at {output_path}")
# return loadOnnxModel(output_path)

def loadOnnxModel(path, providers=["CUDAExecutionProvider"]):
return ort.InferenceSession(path,providers=providers)
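# A possible end-to-end consistency check built from the helpers above (a sketch only;
# it assumes a model saved at ./model.pt and a NumPy batch x_test matching the model's
# input shape):
#
# convert_pytorch_to_onnx("./model.pt", "./model.onnx")
# session = loadOnnxModel("./model.onnx")
# onnx_preds = predictOnnx(x_test, session=session)
# model = torch.load("./model.pt").eval().cuda()
# torch_preds = model(torch.from_numpy(x_test).cuda()).detach().cpu().numpy()
# checkPredictionConsistency(torch_preds, onnx_preds)
# checkConfidenceConsistency(torch_preds, onnx_preds)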

if __name__ == "__main__":
if len(sys.argv) != 2:
print("Usage: python convert_to_trt.py <model_path>")
sys.exit(1)

model_path = sys.argv[1]
convert_pytorch_to_onnx(model_path)

model_path = input("Enter the full/absolute path to the model: ")
model_name = input("Enter the name of the model: ")
input_dimensions = (224, 224)
model = torch.load(model_path)

model.eval()
x = torch.randn(1, 3, input_dimensions[0], input_dimensions[1])
traced_model = torch.jit.trace(model, x)

torch.onnx.export(traced_model,
x,
f"{model_name}.onnx",
export_params=True,
opset_version=10,
do_constant_folding=True,
input_names = ['input'],
output_names = ['output'],
dynamic_axes={'input' : {0 : 'batch_size'},
'output' : {0 : 'batch_size'}})

model = onnx.load(f"{model_name}.onnx")
print("Model converted successfully")
print(model.graph)

# improvements:
# Model Pruning: Pruning removes weights or neurons that contribute little to the output, shrinking the model and speeding up conversion and inference.
# Quantization: Quantization reduces the numerical precision of the model's weights, which can significantly cut both the memory requirement and the computational cost of the model.
# Set the Appropriate Opset Version: The ONNX opset version corresponds to the set of operators and their versions supported. Newer opset versions can include optimizations that were not available in earlier ones. You can set the opset version with the opset_version parameter in torch.onnx.export(); the latest version as of ONNX 1.8.0 is 13.
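# A loose sketch of what the pruning and quantization suggestions above could look like
# in PyTorch before export; the layer name "fc" and the 30% sparsity are illustrative
# assumptions, not taken from the repository's models:
#
# import torch.nn.utils.prune as prune
# prune.l1_unstructured(model.fc, name="weight", amount=0.3)  # prune 30% of the smallest weights
# quantized = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)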
30 changes: 0 additions & 30 deletions Conversion Tools/PyTorch_TensorFlow.py

This file was deleted.

43 changes: 43 additions & 0 deletions Conversion Tools/PyTorch_TensorRT.py
@@ -0,0 +1,43 @@
import sys
import torch
from torch2trt import torch2trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np

def get_max_memory():
total, free = cuda.mem_get_info()
max_mem = free * 0.95

print(f"Total GPU memory: {total / (1024**2)} MB")
print(f"Free GPU memory: {free / (1024**2)} MB")
print(f"Max memory to use: {max_mem / (1024**2)} MB")
return max_mem

def convert_pt_to_trt(model_path='./model.pt', output_path='./model_trt.trt', FP16_mode=True, batch_size=1, input_shape=(1, 3, 224, 224)):
print("Loading the PyTorch model")
    model = torch.load(model_path)
    model.eval()
    model.cuda()  # torch2trt expects the model and the example input on the GPU

    input_data = torch.randn(input_shape).cuda()
    print("Building TensorRT engine. This may take a few minutes.")
    model_trt = torch2trt(model, [input_data], fp16_mode=FP16_mode, max_batch_size=batch_size, max_workspace_size=int(get_max_memory()))

with open(output_path, 'wb') as f:
f.write(model_trt.engine.serialize())

print("Engine built successfully")
print(f"Converted TensorRT engine saved at {output_path}")
return model_trt


if __name__ == "__main__":
print("Usage: python3 PyTorch_TensorRT.py <model_path> <output_path> FP16_mode batch_size input_shape")
print("Example: python3 PyTorch_TensorRT.py ./model.pt ./model_trt.trt True 1 (1, 3, 224, 224)")

    if len(sys.argv) < 2:
        convert_pt_to_trt()
    else:
        # pass only the arguments that were supplied so the remaining
        # parameters keep their defaults instead of being filled with None
        convert_pt_to_trt(*sys.argv[1:6])
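# A hedged sketch (not in this commit) of loading the serialized .trt file back for
# inference with the TensorRT runtime; the path and variable names are illustrative:
#
# import tensorrt as trt
# logger = trt.Logger(trt.Logger.WARNING)
# with open("./model_trt.trt", "rb") as f, trt.Runtime(logger) as runtime:
#     engine = runtime.deserialize_cuda_engine(f.read())
#     context = engine.create_execution_context()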
Empty file added Conversion Tools/README.md
Empty file.