diff --git a/examples/dynamo/engine_caching_bert_example.py b/examples/dynamo/engine_caching_bert_example.py index 428c414a06..989913bd31 100644 --- a/examples/dynamo/engine_caching_bert_example.py +++ b/examples/dynamo/engine_caching_bert_example.py @@ -52,7 +52,7 @@ def compile_bert(iterations=3): "truncate_double": True, "debug": False, "min_block_size": 1, - "make_refitable": True, + "make_refittable": True, "cache_built_engines": cache_built_engines, "reuse_cached_engines": reuse_cached_engines, "engine_cache_dir": "/tmp/torch_trt_bert_engine_cache", diff --git a/examples/dynamo/engine_caching_example.py b/examples/dynamo/engine_caching_example.py index 5154dc1e2c..28ff73aa72 100644 --- a/examples/dynamo/engine_caching_example.py +++ b/examples/dynamo/engine_caching_example.py @@ -63,7 +63,7 @@ def remove_timing_cache(path=TIMING_CACHE_PATH): # in a subsequent compilation, either as part of this session or a new session, the cache will # pull the built engine and **refit** the weights which can reduce compilation times by orders of magnitude. # As such, in order to insert a new engine into the cache (i.e. ``cache_built_engines=True``), -# the engine must be refitable (``make_refittable=True``). See :ref:`refit_engine_example` for more details. +# the engine must be refittable (``make_refittable=True``). See :ref:`refit_engine_example` for more details. def torch_compile(iterations=3): @@ -97,7 +97,7 @@ def torch_compile(iterations=3): "enabled_precisions": enabled_precisions, "debug": debug, "min_block_size": min_block_size, - "make_refitable": True, + "make_refittable": True, "cache_built_engines": cache_built_engines, "reuse_cached_engines": reuse_cached_engines, }, @@ -157,7 +157,7 @@ def dynamo_compile(iterations=3): enabled_precisions=enabled_precisions, debug=debug, min_block_size=min_block_size, - make_refitable=True, + make_refittable=True, cache_built_engines=cache_built_engines, reuse_cached_engines=reuse_cached_engines, engine_cache_size=1 << 30, # 1GB @@ -268,7 +268,7 @@ def torch_compile_my_cache(iterations=3): "enabled_precisions": enabled_precisions, "debug": debug, "min_block_size": min_block_size, - "make_refitable": True, + "make_refittable": True, "cache_built_engines": cache_built_engines, "reuse_cached_engines": reuse_cached_engines, "custom_engine_cache": engine_cache, diff --git a/examples/dynamo/mutable_torchtrt_module_example.py b/examples/dynamo/mutable_torchtrt_module_example.py index a10c0e17ae..b68c9a11ee 100644 --- a/examples/dynamo/mutable_torchtrt_module_example.py +++ b/examples/dynamo/mutable_torchtrt_module_example.py @@ -31,7 +31,7 @@ settings = { "use_python": False, "enabled_precisions": {torch.float32}, - "make_refitable": True, + "make_refittable": True, } model = models.resnet18(pretrained=True).eval().to("cuda") @@ -80,7 +80,7 @@ "use_python_runtime": True, "enabled_precisions": {torch.float16}, "debug": True, - "make_refitable": True, + "make_refittable": True, } model_id = "runwayml/stable-diffusion-v1-5" diff --git a/examples/dynamo/refit_engine_example.py b/examples/dynamo/refit_engine_example.py index 1feb033a3a..adf1057055 100644 --- a/examples/dynamo/refit_engine_example.py +++ b/examples/dynamo/refit_engine_example.py @@ -43,11 +43,11 @@ # %% -# Make a Refitable Compilation Program +# Make a refittable Compilation Program # --------------------------------------- # # The inital step is to compile a module and save it as with a normal. Note that there is an -# additional parameter `make_refitable` that is set to `True`. 
This parameter is used to +# additional parameter `make_refittable` that is set to `True`. This parameter is used to # indicate that the engine being built should support weight refitting later. Engines built without # these setttings will not be able to be refit. # @@ -69,7 +69,7 @@ debug=debug, min_block_size=min_block_size, torch_executed_ops=torch_executed_ops, - make_refitable=True, + make_refittable=True, ) # Output is a torch.fx.GraphModule # Save the graph module as an exported program diff --git a/py/torch_tensorrt/_Input.py b/py/torch_tensorrt/_Input.py index 72775944cb..126219ee8a 100644 --- a/py/torch_tensorrt/_Input.py +++ b/py/torch_tensorrt/_Input.py @@ -220,6 +220,34 @@ def __str__(self) -> str: def __repr__(self) -> str: return self.__str__() + @staticmethod + def equivalent_spec(a: Input, b: Input) -> bool: + if a.shape_mode != b.shape_mode: + return False + + if a.shape_mode == Input._ShapeMode.DYNAMIC: + assert isinstance(a.shape, dict) + assert isinstance(b.shape, dict) + checks = [ + a.shape["min_shape"] == b.shape["min_shape"], + a.shape["opt_shape"] == b.shape["opt_shape"], + a.shape["max_shape"] == b.shape["max_shape"], + a.dtype == b.dtype, + a.format == b.format, + a.low_tensor_domain_incl == b.low_tensor_domain_incl, + a.high_tensor_domain_excl == b.high_tensor_domain_excl, + ] + return all(checks) + else: + checks = [ + a.shape == b.shape, + a.dtype == b.dtype, + a.format == b.format, + a.low_tensor_domain_incl == b.low_tensor_domain_incl, + a.high_tensor_domain_excl == b.high_tensor_domain_excl, + ] + return all(checks) + @staticmethod def _supported_input_size_type(input_size: Any) -> bool: if isinstance(input_size, torch.Size): diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 2e6ff039b4..96e5f313ae 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -60,7 +60,7 @@ def compile( Set[Union[torch.dtype, dtype]], Tuple[Union[torch.dtype, dtype]] ] = _defaults.ENABLED_PRECISIONS, engine_capability: EngineCapability = _defaults.ENGINE_CAPABILITY, - make_refitable: bool = _defaults.MAKE_REFITABLE, + make_refittable: bool = _defaults.MAKE_REFITTABLE, debug: bool = _defaults.DEBUG, num_avg_timing_iters: int = _defaults.NUM_AVG_TIMING_ITERS, workspace_size: int = _defaults.WORKSPACE_SIZE, @@ -180,14 +180,14 @@ def compile( if "refit" in kwargs.keys(): warnings.warn( - "Refit is deprecated. Please use make_refitable=True if you want to enable refitting of the engine.", + "Refit is deprecated. Please use make_refittable=True if you want to enable refitting of the engine.", DeprecationWarning, stacklevel=2, ) - if make_refitable: - raise ValueError("Use flag make_refitable only. Flag refit is deprecated.") + if make_refittable: + raise ValueError("Use flag make_refittable only. 
Flag refit is deprecated.") else: - make_refitable = kwargs["refit"] + make_refittable = kwargs["refit"] engine_capability = EngineCapability._from(engine_capability) @@ -238,8 +238,8 @@ def compile( engine_cache = None if cache_built_engines or reuse_cached_engines: assert ( - make_refitable - ), "Engine caching requires make_refitable to be set to True" + make_refittable + ), "Engine caching requires make_refittable to be set to True" engine_cache = ( custom_engine_cache if custom_engine_cache is not None @@ -270,7 +270,7 @@ def compile( "require_full_compilation": require_full_compilation, "disable_tf32": disable_tf32, "sparse_weights": sparse_weights, - "make_refitable": make_refitable, + "make_refittable": make_refittable, "engine_capability": engine_capability, "dla_sram_size": dla_sram_size, "dla_local_dram_size": dla_local_dram_size, @@ -513,7 +513,7 @@ def convert_exported_program_to_serialized_trt_engine( require_full_compilation: bool = _defaults.REQUIRE_FULL_COMPILATION, disable_tf32: bool = _defaults.DISABLE_TF32, sparse_weights: bool = _defaults.SPARSE_WEIGHTS, - make_refitable: bool = _defaults.MAKE_REFITABLE, + make_refittable: bool = _defaults.MAKE_REFITTABLE, engine_capability: EngineCapability = _defaults.ENGINE_CAPABILITY, num_avg_timing_iters: int = _defaults.NUM_AVG_TIMING_ITERS, dla_sram_size: int = _defaults.DLA_SRAM_SIZE, @@ -600,7 +600,7 @@ def convert_exported_program_to_serialized_trt_engine( ) if "refit" in kwargs.keys(): warnings.warn( - "Refit is deprecated. Please use make_refitable=True if you want to enable refitting of the engine.", + "Refit is deprecated. Please use make_refittable=True if you want to enable refitting of the engine.", DeprecationWarning, stacklevel=2, ) @@ -646,7 +646,7 @@ def convert_exported_program_to_serialized_trt_engine( "require_full_compilation": require_full_compilation, "disable_tf32": disable_tf32, "sparse_weights": sparse_weights, - "make_refitable": make_refitable, + "make_refittable": make_refittable, "engine_capability": engine_capability, "num_avg_timing_iters": num_avg_timing_iters, "dla_sram_size": dla_sram_size, diff --git a/py/torch_tensorrt/dynamo/_defaults.py b/py/torch_tensorrt/dynamo/_defaults.py index 83e85cb3c7..aba457d5ee 100644 --- a/py/torch_tensorrt/dynamo/_defaults.py +++ b/py/torch_tensorrt/dynamo/_defaults.py @@ -26,7 +26,7 @@ USE_PYTHON_RUNTIME = False USE_FAST_PARTITIONER = True ENABLE_EXPERIMENTAL_DECOMPOSITIONS = False -MAKE_REFITABLE = False +MAKE_REFITTABLE = False REQUIRE_FULL_COMPILATION = False DRYRUN = False HARDWARE_COMPATIBLE = False diff --git a/py/torch_tensorrt/dynamo/_engine_cache.py b/py/torch_tensorrt/dynamo/_engine_cache.py index 7a33a81521..f166b489cb 100644 --- a/py/torch_tensorrt/dynamo/_engine_cache.py +++ b/py/torch_tensorrt/dynamo/_engine_cache.py @@ -1,17 +1,33 @@ import copy +import io import logging import os import pickle +import pickletools import shutil from abc import ABC, abstractmethod -from typing import Any, Dict, List, Optional, Tuple, cast +from typing import Any, Dict, List, Optional, Sequence, Tuple, cast import torch -from torch._inductor.codecache import FxGraphCachePickler +from torch._inductor.codecache import FxGraphCachePickler, sha256_hash from torch.fx.experimental.proxy_tensor import unset_fake_temporarily +from torch_tensorrt._Input import Input +from torch_tensorrt.dynamo._settings import ( + _SETTINGS_TO_BE_ENGINE_INVARIANT, + CompilationSettings, +) _LOGGER: logging.Logger = logging.getLogger(__name__) +UnpackedCacheHit = Tuple[ + bytes, + List[str], + 
List[str], + Sequence[Input], + CompilationSettings, + Optional[Dict[str, Any]], +] + class BaseEngineCache(ABC): @@ -24,7 +40,11 @@ def __init__( pass @staticmethod - def get_hash(gm: torch.fx.GraphModule) -> str: + def get_hash( + gm: torch.fx.GraphModule, + input_specs: Sequence[Input], + settings: CompilationSettings, + ) -> str: """Get the hash value of the GraphModule Args: @@ -39,7 +59,23 @@ def get_hash(gm: torch.fx.GraphModule) -> str: for name, param in new_gm.named_parameters(): param.data.zero_() - hash_val = cast(str, FxGraphCachePickler.get_hash(new_gm)) + graph_hash_val = cast(str, FxGraphCachePickler.get_hash(new_gm)) + + input_spec_strs = [str(i) for i in input_specs] + with io.BytesIO() as stream: + input_specs_data = pickle.dumps(input_spec_strs) + input_specs_data = pickletools.optimize(input_specs_data) + input_specs_hash = sha256_hash(input_specs_data) + + invariant_engine_specs = [ + str(getattr(settings, field)) for field in _SETTINGS_TO_BE_ENGINE_INVARIANT + ] + with io.BytesIO() as stream: + engine_specs_data = pickle.dumps(invariant_engine_specs) + engine_specs_data = pickletools.optimize(engine_specs_data) + engine_specs_hash = sha256_hash(engine_specs_data) + + hash_val: str = graph_hash_val + input_specs_hash + engine_specs_hash return hash_val @@ -48,6 +84,8 @@ def pack( serialized_engine: bytes, input_names: List[str], output_names: List[str], + input_specs: Sequence[Input], + compilation_settings: CompilationSettings, weight_name_map: Optional[Dict[Any, Any]], ) -> bytes: """Pack serialized engine, input names, output names, and weight map into a single blob @@ -56,40 +94,83 @@ def pack( serialized_engine (bytes): serialized TRT engine input_names (List[str]): input names of TRT engine output_names (List[str]): output names of TRT engine + input_specs (Sequence[Input]): input specs of TRT engine + compilation_settings (CompilationSettings): compilation settings of TRT engine weight_name_map (Optional[Dict[Any, Any]]): weight name map for refitting Returns: bytes: packed blob """ + + settings = copy.deepcopy(compilation_settings) return pickle.dumps( { "serialized_engine": bytes(serialized_engine), "input_names": input_names, "output_names": output_names, + "input_specs": input_specs, + "compilation_settings": settings, "weight_name_map": weight_name_map, } ) @staticmethod - def unpack( - packed_obj: bytes, - ) -> Tuple[bytes, List[str], List[str], Optional[Dict[Any, Any]]]: + def unpack(packed_obj: bytes) -> UnpackedCacheHit: """Unpack packed blob into serialized engine, input names, output names, and weight map Args: packed_obj (bytes): packed blob Returns: - Tuple[bytes, List[str], List[str], Optional[Dict[str, Any]]]: serialized engine, input names, output names, weight name map + Tuple[bytes, List[str], List[str], Sequence[Input], CompilationSettings, Optional[Dict[str, Any]]]: serialized engine, input names, output names, input specs, CompilationSettings, weight name map """ unpacked = pickle.loads(packed_obj) return ( unpacked["serialized_engine"], unpacked["input_names"], unpacked["output_names"], + unpacked["input_specs"], + unpacked["compilation_settings"], unpacked["weight_name_map"], ) + def insert( + self, hash: str, entry: UnpackedCacheHit, *args: Any, **kwargs: Any + ) -> None: + """ + Insert a cache entry into the engine cache. + + Args: + hash (str): The hash value of the GraphModule. + entry (Tuple[bytes, List[str], List[str], CompilationSettings, Optional[Dict[Any, Any]]]): The cache entry to be inserted. 
+ *args: Variable length argument list passed to ``save``. + **kwargs: Arbitrary keyword arguments passed to ``save``. + + Returns: + None + """ + packed_cache_info = BaseEngineCache.pack(*entry) + return self.save(hash, packed_cache_info, *args, **kwargs) + + def check(self, hash: str, *args: Any, **kwargs: Any) -> Optional[UnpackedCacheHit]: + """ + Check if a cache entry exists for the given hash. + + Args: + hash (str): The hash value of the GraphModule. + *args: Variable length argument list passed to ``load``. + **kwargs: Arbitrary keyword arguments passed to ``load``. + + Returns: + Optional[Tuple[bytes, List[str], List[str], CompilationSettings, Optional[Dict[Any, Any]]]]: The unpacked cache entry if found, None otherwise. + """ + packed_cache_info = self.load(hash, *args, **kwargs) + + if packed_cache_info: + return BaseEngineCache.unpack(packed_cache_info) + else: + return None + @abstractmethod def save(self, hash: str, blob: bytes, *args: Any, **kwargs: Any) -> None: """Store blob in cache @@ -203,11 +284,7 @@ def LRU() -> None: else: LRU() - def save( - self, - hash: str, - blob: bytes, - ) -> None: + def save(self, hash: str, blob: bytes, *args: Any, **kwargs: Any) -> None: blob_size = len(blob) if blob_size > self.total_engine_cache_size: _LOGGER.warning( @@ -244,7 +321,7 @@ def save( f"The size {blob_size} is still larger than the available cache size {self.available_engine_cache_size}." ) - def load(self, hash: str) -> Optional[bytes]: + def load(self, hash: str, *args: Any, **kwargs: Any) -> Optional[bytes]: directory = os.path.join(self.engine_cache_dir, hash) if os.path.exists(directory): blob_path = os.path.join(directory, "blob.bin") diff --git a/py/torch_tensorrt/dynamo/_refit.py b/py/torch_tensorrt/dynamo/_refit.py index 8b0d7c3e20..d6d0f8a504 100644 --- a/py/torch_tensorrt/dynamo/_refit.py +++ b/py/torch_tensorrt/dynamo/_refit.py @@ -239,7 +239,7 @@ def refit_module_weights( ) # Get the settings and check the setting to be uniform - settings: CompilationSettings = None + settings: Optional[CompilationSettings] = None if inline_module: # Obtain the settings @@ -254,7 +254,7 @@ def refit_module_weights( ] assert ( encoded_metadata != "" - ), "The engine provided is either not refittable or was built with a version of Torch-TensorRT that is too old, please recompile using the latest version with make_refitable=True" + ), "The engine provided is either not refittable or was built with a version of Torch-TensorRT that is too old, please recompile using the latest version with make_refittable=True" settings = TorchTensorRTModule.decode_metadata(encoded_metadata)["settings"] # Handle torch modules compiled_submodules_map = dict(compiled_submodules) @@ -269,8 +269,10 @@ def refit_module_weights( continue settings = submodule.settings + assert settings is not None + assert ( - settings.make_refitable + settings.make_refittable ), "Refitting is not enabled. Please recompile the engine with refit=True." 
if settings.debug: @@ -396,7 +398,7 @@ def refit_module_weights( if isinstance(compiled_submodule, PythonTorchTensorRTModule): engine = compiled_submodule.engine elif isinstance(compiled_submodule, TorchTensorRTModule): - engine_info = compiled_submodule.engine.__getstate__()[0] + engine_info = compiled_submodule.engine.__getstate__()[0] # type: ignore[index] engine = get_engine_from_encoded_engine( engine_info[ENGINE_IDX], runtime ) diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py index 063f6f3718..f8886fbd67 100644 --- a/py/torch_tensorrt/dynamo/_settings.py +++ b/py/torch_tensorrt/dynamo/_settings.py @@ -1,5 +1,5 @@ from dataclasses import dataclass, field -from typing import Collection, Optional, Set, Union +from typing import Collection, Optional, Set, Tuple, Union from torch.fx.node import Target from torch_tensorrt._Device import Device @@ -18,7 +18,7 @@ ENGINE_CAPABILITY, HARDWARE_COMPATIBLE, LAZY_ENGINE_INIT, - MAKE_REFITABLE, + MAKE_REFITTABLE, MAX_AUX_STREAMS, MIN_BLOCK_SIZE, NUM_AVG_TIMING_ITERS, @@ -98,7 +98,7 @@ class CompilationSettings: disable_tf32: bool = DISABLE_TF32 assume_dynamic_shape_support: bool = ASSUME_DYNAMIC_SHAPE_SUPPORT sparse_weights: bool = SPARSE_WEIGHTS - make_refitable: bool = MAKE_REFITABLE + make_refittable: bool = MAKE_REFITTABLE engine_capability: EngineCapability = field( default_factory=lambda: ENGINE_CAPABILITY ) @@ -112,3 +112,31 @@ class CompilationSettings: lazy_engine_init: bool = LAZY_ENGINE_INIT cache_built_engines: bool = CACHE_BUILT_ENGINES reuse_cached_engines: bool = REUSE_CACHED_ENGINES + + +_SETTINGS_TO_BE_ENGINE_INVARIANT = ( + "enabled_precisions", + "max_aux_streams", + "version_compatible", + "optimization_level", + "disable_tf32", + "sparse_weights", + "make_refittable", + "engine_capability", + "hardware_compatible", +) + + +def settings_are_compatible( + set_a: CompilationSettings, set_b: CompilationSettings +) -> Tuple[bool, Set[str]]: + incompatible_settings: Set[str] = set() + + for f in _SETTINGS_TO_BE_ENGINE_INVARIANT: + if getattr(set_a, f) != getattr(set_b, f): + incompatible_settings.add(f) + + if len(incompatible_settings) == 0: + return True, set() + else: + return False, incompatible_settings diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index 84fe345137..ff35bf39d7 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -27,7 +27,7 @@ from torch_tensorrt._Input import Input from torch_tensorrt.dynamo import _defaults from torch_tensorrt.dynamo._engine_cache import BaseEngineCache -from torch_tensorrt.dynamo._settings import CompilationSettings +from torch_tensorrt.dynamo._settings import CompilationSettings, settings_are_compatible from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext from torch_tensorrt.dynamo.conversion._ConverterRegistry import ( DYNAMO_CONVERTERS as CONVERTERS, @@ -126,7 +126,7 @@ def __init__( # Mapping of constants to shapes and dtypes self.const_mapping: Dict[str, Tuple[Sequence[int], str]] = {} - self.weight_name_map: Optional[dict[str, Any]] = None + self.weight_name_map: Optional[Dict[str, Any]] = None # Engine cache for storing and reusing TRT engines self.engine_cache = engine_cache @@ -282,7 +282,7 @@ def _populate_trt_builder_config( if self.compilation_settings.disable_tf32: builder_config.clear_flag(trt.BuilderFlag.TF32) - if 
self.compilation_settings.make_refitable:
+        if self.compilation_settings.make_refittable:
             builder_config.set_flag(trt.BuilderFlag.REFIT)
 
         if strict_type_constraints:
@@ -533,18 +533,42 @@ def run(
             self.compilation_settings.cache_built_engines
             or self.compilation_settings.reuse_cached_engines
         ):
-            hash_val = self.engine_cache.get_hash(self.module)
+            hash_val = self.engine_cache.get_hash(
+                self.module, self.input_specs, self.compilation_settings
+            )
 
             if self.compilation_settings.reuse_cached_engines:
                 # query the cached TRT engine
-                blob = self.engine_cache.load(hash_val)
-                if blob is not None:  # hit the cache
-                    serialized_engine, input_names, output_names, weight_name_map = (
-                        self.engine_cache.unpack(blob)
+                cached_data = self.engine_cache.check(hash_val)
+                if cached_data is not None:  # hit the cache
+                    (
+                        serialized_engine,
+                        self._input_names,
+                        self._output_names,
+                        cached_engine_input_specs,
+                        engine_compilation_settings,
+                        self.weight_name_map,
+                    ) = cached_data
+
+                    settings_compatible, incompatible_settings = (
+                        settings_are_compatible(
+                            self.compilation_settings, engine_compilation_settings
+                        )
                     )
-                    self._input_names = input_names
-                    self._output_names = output_names
-                    self.weight_name_map = weight_name_map
+                    assert (
+                        settings_compatible
+                    ), f"Attempted to refit a cached engine with incompatible settings: {incompatible_settings} (old settings: {engine_compilation_settings}, new settings: {self.compilation_settings})"
+
+                    for i, specs_match in enumerate(
+                        [
+                            Input.equivalent_spec(cached_spec, new_spec)
+                            for cached_spec, new_spec in zip(
+                                cached_engine_input_specs, self.input_specs
+                            )
+                        ]
+                    ):
+                        assert (
+                            specs_match
+                        ), f"Attempted to refit a cached engine built for a different input size (input: {i}, cached spec: {cached_engine_input_specs[i]}, new spec: {self.input_specs[i]})"
                     _LOGGER.info(
                         "Found the cached engine that corresponds to this graph. It is directly loaded."
) @@ -581,7 +605,7 @@ def run( self._construct_trt_network_def() - if self.compilation_settings.make_refitable: + if self.compilation_settings.make_refittable: self._save_weight_mapping() build_engine_start_time = datetime.now() @@ -612,13 +636,17 @@ def run( self.engine_cache is not None and self.compilation_settings.cache_built_engines ): - blob = self.engine_cache.pack( - serialized_engine, - self._input_names, - self._output_names, - self.weight_name_map, + self.engine_cache.insert( + hash_val, + ( + serialized_engine, + self._input_names, + self._output_names, + self.input_specs, + self.compilation_settings, + self.weight_name_map, + ), ) - self.engine_cache.save(hash_val, blob) with io.BytesIO() as engine_bytes: engine_bytes.write(serialized_engine) diff --git a/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py index 672a7e267d..9abd896d50 100644 --- a/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py @@ -65,7 +65,7 @@ def __init__( Union[torch.dtype, dtype] ] = _defaults.ENABLED_PRECISIONS, engine_capability: EngineCapability = _defaults.ENGINE_CAPABILITY, - make_refitable: bool = _defaults.MAKE_REFITABLE, + make_refittable: bool = _defaults.MAKE_REFITTABLE, debug: bool = _defaults.DEBUG, num_avg_timing_iters: int = _defaults.NUM_AVG_TIMING_ITERS, workspace_size: int = _defaults.WORKSPACE_SIZE, @@ -152,8 +152,8 @@ def __init__( device = to_torch_tensorrt_device(device) enabled_precisions = {dtype._from(p) for p in enabled_precisions} assert ( - make_refitable - ), "'make_refitable' has to be True for a MutableTorchTensorRTModule." + make_refittable + ), "'make_refittable' has to be True for a MutableTorchTensorRTModule." 
compilation_options = { "enabled_precisions": ( enabled_precisions @@ -180,7 +180,7 @@ def __init__( "require_full_compilation": require_full_compilation, "disable_tf32": disable_tf32, "sparse_weights": sparse_weights, - "make_refitable": make_refitable, + "make_refittable": make_refittable, "engine_capability": engine_capability, "dla_sram_size": dla_sram_size, "dla_local_dram_size": dla_local_dram_size, diff --git a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py index 63a932c353..7bf42da7f0 100644 --- a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py @@ -184,10 +184,6 @@ def setup_engine(self) -> None: def encode_metadata(self, metadata: Any) -> str: metadata = copy.deepcopy(metadata) - metadata["settings"].torch_executed_ops = { - f"torch.ops.{op.__str__()}" - for op in metadata["settings"].torch_executed_ops - } dumped_metadata = pickle.dumps(metadata) encoded_metadata = base64.b64encode(dumped_metadata).decode("utf-8") return encoded_metadata @@ -196,9 +192,6 @@ def encode_metadata(self, metadata: Any) -> str: def decode_metadata(encoded_metadata: bytes) -> Any: dumped_metadata = base64.b64decode(encoded_metadata.encode("utf-8")) metadata = pickle.loads(dumped_metadata) - metadata["settings"].torch_executed_ops = { - eval(op) for op in metadata["settings"].torch_executed_ops - } return metadata def get_extra_state(self) -> SerializedTorchTensorRTModuleFmt: @@ -240,7 +233,9 @@ def set_extra_state(self, state: SerializedTorchTensorRTModuleFmt) -> None: serialized_metadata = serialized_engine_info[SERIALIZED_METADATA_IDX] assert isinstance(serialized_metadata, bytes) - self.settings = TorchTensorRTModule.decode_metadata(serialized_metadata) + metadata = TorchTensorRTModule.decode_metadata(serialized_metadata) + self.settings = metadata["settings"] + self.weight_name_map = metadata["weight_name_map"] else: self.engine = None diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 2af7922cd1..ba39ca923b 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -449,7 +449,6 @@ def parse_dynamo_kwargs( Returns: CompilationSettings object with relevant kwargs """ - # Initialize an empty CompilationSettings object settings = CompilationSettings() @@ -500,11 +499,12 @@ def parse_dynamo_kwargs( # If cache_built_engines and reuse_cached_engines are True but custom_engine_cache is not provided, # then create a default disk engine cache + engine_cache = None if kwargs.get("cache_built_engines") or kwargs.get("reuse_cached_engines"): assert kwargs.get( - "make_refitable" - ), "Engine caching requires make_refitable to be set to True" + "make_refittable" + ), "Engine caching requires make_refittable to be set to True" if kwargs.get("custom_engine_cache") is not None: engine_cache = kwargs.get("custom_engine_cache") @@ -519,6 +519,9 @@ def parse_dynamo_kwargs( ) engine_cache = DiskEngineCache(engine_cache_dir, engine_cache_size) + if kwargs.get("torch_executed_ops"): + settings.torch_executed_ops = kwargs.get("torch_executed_ops") + logger.info("Compilation Settings: %s\n", settings) return settings, engine_cache diff --git a/tests/py/dynamo/models/test_engine_cache.py b/tests/py/dynamo/models/test_engine_cache.py index 770e057a36..367f68c1f6 100644 --- a/tests/py/dynamo/models/test_engine_cache.py +++ b/tests/py/dynamo/models/test_engine_cache.py @@ -9,8 +9,9 @@ import torch_tensorrt as torch_trt 
import torchvision.models as models from torch.testing._internal.common_utils import TestCase -from torch_tensorrt.dynamo._defaults import ENGINE_CACHE_DIR +from torch_tensorrt.dynamo._defaults import TIMING_CACHE_PATH from torch_tensorrt.dynamo._engine_cache import BaseEngineCache +from torch_tensorrt.dynamo._settings import CompilationSettings from torch_tensorrt.dynamo.utils import COSINE_THRESHOLD, cosine_similarity assertions = unittest.TestCase() @@ -25,6 +26,8 @@ def __init__( if not os.path.exists(self.engine_cache_dir): os.makedirs(self.engine_cache_dir, exist_ok=True) + self.hashes = {} + def save( self, hash: str, @@ -41,17 +44,141 @@ def save( with open(path, "wb") as f: f.write(blob) + self.hashes[hash] = 0 + def load(self, hash: str, prefix: str = "blob") -> Optional[bytes]: path = os.path.join(self.engine_cache_dir, f"{prefix}_{hash}.bin") if os.path.exists(path): with open(path, "rb") as f: blob = f.read() + self.hashes[hash] += 1 return blob return None +class TestHashFunction(TestCase): + + def test_reexport_is_equal(self): + pyt_model = models.resnet18(pretrained=True).eval().to("cuda") + example_inputs = (torch.randn((100, 3, 224, 224)).to("cuda"),) + batch = torch.export.Dim("batch", min=1, max=200) + + exp_program1 = torch.export.export( + pyt_model, args=example_inputs, dynamic_shapes={"x": {0: batch}} + ) + input_specs1 = ( + torch_trt.Input( + min_shape=(1, 3, 224, 224), + opt_shape=(100, 3, 224, 224), + max_shape=(200, 3, 224, 224), + ), + ) + settings1 = CompilationSettings( + make_refittable=True, cache_built_engines=True, reuse_cached_engines=True + ) + hash1 = BaseEngineCache.get_hash(exp_program1.module(), input_specs1, settings1) + + exp_program2 = torch.export.export( + pyt_model, args=example_inputs, dynamic_shapes={"x": {0: batch}} + ) + input_specs2 = ( + torch_trt.Input( + min_shape=(1, 3, 224, 224), + opt_shape=(100, 3, 224, 224), + max_shape=(200, 3, 224, 224), + ), + ) + settings2 = CompilationSettings( + make_refittable=True, cache_built_engines=True, reuse_cached_engines=True + ) + hash2 = BaseEngineCache.get_hash(exp_program2.module(), input_specs2, settings2) + + self.assertEqual(hash1, hash2) + + def test_input_shape_change_is_not_equal(self): + pyt_model = models.resnet18(pretrained=True).eval().to("cuda") + example_inputs = (torch.randn((100, 3, 224, 224)).to("cuda"),) + batch = torch.export.Dim("batch", min=1, max=200) + + exp_program1 = torch.export.export( + pyt_model, args=example_inputs, dynamic_shapes={"x": {0: batch}} + ) + input_specs1 = ( + torch_trt.Input( + min_shape=(1, 3, 224, 224), + opt_shape=(100, 3, 224, 224), + max_shape=(200, 3, 224, 224), + ), + ) + settings1 = CompilationSettings( + make_refittable=True, cache_built_engines=True, reuse_cached_engines=True + ) + hash1 = BaseEngineCache.get_hash(exp_program1.module(), input_specs1, settings1) + + exp_program2 = torch.export.export( + pyt_model, args=example_inputs, dynamic_shapes={"x": {0: batch}} + ) + input_specs2 = ( + torch_trt.Input( + min_shape=(1, 3, 300, 300), + opt_shape=(100, 3, 300, 300), + max_shape=(200, 3, 300, 300), + ), + ) + settings2 = CompilationSettings( + make_refittable=True, cache_built_engines=True, reuse_cached_engines=True + ) + hash2 = BaseEngineCache.get_hash(exp_program2.module(), input_specs2, settings2) + + self.assertNotEqual(hash1, hash2) + + def test_engine_settings_is_not_equal(self): + pyt_model = models.resnet18(pretrained=True).eval().to("cuda") + example_inputs = (torch.randn((100, 3, 224, 224)).to("cuda"),) + batch = 
torch.export.Dim("batch", min=1, max=200) + + exp_program1 = torch.export.export( + pyt_model, args=example_inputs, dynamic_shapes={"x": {0: batch}} + ) + input_specs1 = ( + torch_trt.Input( + min_shape=(1, 3, 224, 224), + opt_shape=(100, 3, 224, 224), + max_shape=(200, 3, 224, 224), + ), + ) + settings1 = CompilationSettings( + make_refittable=True, + cache_built_engines=True, + reuse_cached_engines=True, + enabled_precisions={torch.float32}, + ) + hash1 = BaseEngineCache.get_hash(exp_program1.module(), input_specs1, settings1) + + exp_program2 = torch.export.export( + pyt_model, args=example_inputs, dynamic_shapes={"x": {0: batch}} + ) + input_specs2 = ( + torch_trt.Input( + min_shape=(1, 3, 224, 224), + opt_shape=(100, 3, 224, 224), + max_shape=(200, 3, 224, 224), + ), + ) + settings2 = CompilationSettings( + make_refittable=True, + cache_built_engines=True, + reuse_cached_engines=True, + enabled_precisions={torch.float32, torch.float16}, + ) + hash2 = BaseEngineCache.get_hash(exp_program2.module(), input_specs2, settings2) + + self.assertNotEqual(hash1, hash2) + + class TestEngineCache(TestCase): + @pytest.mark.xfail def test_dynamo_compile_with_default_disk_engine_cache(self): model = models.resnet18(pretrained=True).eval().to("cuda") example_inputs = (torch.randn((100, 3, 224, 224)).to("cuda"),) @@ -61,10 +188,14 @@ def test_dynamo_compile_with_default_disk_engine_cache(self): model, args=example_inputs, dynamic_shapes={"x": {0: batch}} ) - engine_cache_dir = ENGINE_CACHE_DIR + engine_cache_dir = "/tmp/test_torch_dynamo_with_default_disk_engine_cache" if os.path.exists(engine_cache_dir): shutil.rmtree(engine_cache_dir) + def remove_timing_cache(path=TIMING_CACHE_PATH): + if os.path.exists(path): + os.remove(path) + # The 1st iteration is to measure the compilation time without engine caching # The 2nd and 3rd iterations are to measure the compilation time with engine caching. # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration. @@ -75,6 +206,8 @@ def test_dynamo_compile_with_default_disk_engine_cache(self): start = torch.cuda.Event(enable_timing=True) end = torch.cuda.Event(enable_timing=True) for i in range(3): + remove_timing_cache() + torch._dynamo.reset() if i == 0: cache_built_engines = False reuse_cached_engines = False @@ -82,6 +215,7 @@ def test_dynamo_compile_with_default_disk_engine_cache(self): cache_built_engines = True reuse_cached_engines = True + torch.cuda.synchronize() start.record() trt_gm = torch_trt.dynamo.compile( exp_program, @@ -90,12 +224,14 @@ def test_dynamo_compile_with_default_disk_engine_cache(self): enabled_precisions={torch.float}, debug=False, min_block_size=1, - make_refitable=True, + make_refittable=True, cache_built_engines=cache_built_engines, reuse_cached_engines=reuse_cached_engines, + engine_cache_dir=engine_cache_dir, ) end.record() torch.cuda.synchronize() + torch._dynamo.reset() times.append(start.elapsed_time(end)) results.append(trt_gm(*inputs)) @@ -119,7 +255,7 @@ def test_dynamo_compile_with_default_disk_engine_cache(self): def test_dynamo_compile_with_custom_engine_cache(self): model = models.resnet18(pretrained=True).eval().to("cuda") - engine_cache_dir = "/tmp/your_dir" + engine_cache_dir = "/tmp/test_torch_dynamo_with_custom_engine_cache" if os.path.exists(engine_cache_dir): shutil.rmtree(engine_cache_dir) @@ -138,9 +274,6 @@ def test_dynamo_compile_with_custom_engine_cache(self): # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine. 
inputs = [torch.rand((128, 3, 224, 224)).to("cuda")] results = [] - times = [] - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) for i in range(3): if i == 0: cache_built_engines = False @@ -149,7 +282,6 @@ def test_dynamo_compile_with_custom_engine_cache(self): cache_built_engines = True reuse_cached_engines = True - start.record() trt_gm = torch_trt.dynamo.compile( exp_program, tuple(inputs), @@ -157,14 +289,11 @@ def test_dynamo_compile_with_custom_engine_cache(self): enabled_precisions={torch.float}, debug=False, min_block_size=1, - make_refitable=True, + make_refittable=True, cache_built_engines=cache_built_engines, reuse_cached_engines=reuse_cached_engines, custom_engine_cache=custom_engine_cache, ) - end.record() - torch.cuda.synchronize() - times.append(start.elapsed_time(end)) results.append(trt_gm(*inputs)) cos_sim = cosine_similarity(results[0], results[1]) @@ -179,11 +308,47 @@ def test_dynamo_compile_with_custom_engine_cache(self): msg=f"results[1] doesn't match with results[2]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) - assertions.assertTrue( - times[0] > times[2], - msg=f"Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", - ) + [ + assertions.assertTrue( + count == 1, + f"cache was not hit exactly once for entry ({h}, hit: {count})", + ) + for h, count in custom_engine_cache.hashes.items() + ] + + def test_dynamo_compile_change_input_shape(self): + """Runs compilation 3 times, the cache should miss each time""" + model = models.resnet18(pretrained=True).eval().to("cuda") + # Mark the dim0 of inputs as dynamic + + engine_cache_dir = "/tmp/test_torch_dynamo_with_custom_engine_cache" + if os.path.exists(engine_cache_dir): + shutil.rmtree(engine_cache_dir) + + custom_engine_cache = MyEngineCache(engine_cache_dir) + + for i in range(3): + inputs = (torch.rand((4 * (i + 1), 3, 224, 224)).to("cuda"),) + trt_gm = torch_trt.dynamo.compile( + torch.export.export(model, args=inputs), + inputs=inputs, + use_python_runtime=False, + enabled_precisions={torch.float}, + debug=False, + min_block_size=1, + make_refittable=True, + cache_built_engines=True, + reuse_cached_engines=True, + ) + + [ + assertions.assertTrue( + count == 0, f"Unintended cache hit for entry ({h}, hit: {count})" + ) + for h, count in custom_engine_cache.hashes.items() + ] + @pytest.mark.xfail def test_torch_compile_with_default_disk_engine_cache(self): # Custom Engine Cache model = models.resnet18(pretrained=True).eval().to("cuda") @@ -192,6 +357,10 @@ def test_torch_compile_with_default_disk_engine_cache(self): if os.path.exists(engine_cache_dir): shutil.rmtree(engine_cache_dir) + def remove_timing_cache(path=TIMING_CACHE_PATH): + if os.path.exists(path): + os.remove(path) + # The 1st iteration is to measure the compilation time without engine caching # The 2nd and 3rd iterations are to measure the compilation time with engine caching. # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration. 
@@ -202,7 +371,9 @@ def test_torch_compile_with_default_disk_engine_cache(self): start = torch.cuda.Event(enable_timing=True) end = torch.cuda.Event(enable_timing=True) for i in range(3): - # remove timing cache and reset dynamo for engine caching messurement + # remove timing cache and reset dynamo for engine caching measurement + remove_timing_cache() + torch._dynamo.reset() if i == 0: cache_built_engines = False reuse_cached_engines = False @@ -210,6 +381,7 @@ def test_torch_compile_with_default_disk_engine_cache(self): cache_built_engines = True reuse_cached_engines = True + torch.cuda.synchronize() start.record() compiled_model = torch.compile( model, @@ -219,16 +391,18 @@ def test_torch_compile_with_default_disk_engine_cache(self): "enabled_precisions": {torch.float}, "debug": False, "min_block_size": 1, - "make_refitable": True, + "make_refittable": True, "cache_built_engines": cache_built_engines, "reuse_cached_engines": reuse_cached_engines, "engine_cache_dir": engine_cache_dir, "engine_cache_size": 1 << 30, # 1GB + "torch_executed_ops": {"torch.ops.aten.relu.default"}, }, ) results.append(compiled_model(*inputs)) # trigger the compilation end.record() torch.cuda.synchronize() + torch._dynamo.reset() times.append(start.elapsed_time(end)) cos_sim = cosine_similarity(results[0], results[1]) @@ -252,7 +426,7 @@ def test_torch_compile_with_custom_engine_cache(self): # Custom Engine Cache model = models.resnet18(pretrained=True).eval().to("cuda") - engine_cache_dir = "/tmp/your_dir" + engine_cache_dir = "/tmp/test_torch_compile_with_custom_engine_cache" if os.path.exists(engine_cache_dir): shutil.rmtree(engine_cache_dir) @@ -284,15 +458,17 @@ def test_torch_compile_with_custom_engine_cache(self): "enabled_precisions": {torch.float}, "debug": False, "min_block_size": 1, - "make_refitable": True, + "make_refittable": True, "cache_built_engines": cache_built_engines, "reuse_cached_engines": reuse_cached_engines, "custom_engine_cache": custom_engine_cache, + "torch_executed_ops": {"torch.ops.aten.relu.default"}, }, ) results.append(compiled_model(*inputs)) # trigger the compilation end.record() torch.cuda.synchronize() + torch._dynamo.reset() times.append(start.elapsed_time(end)) cos_sim = cosine_similarity(results[0], results[1]) @@ -307,7 +483,45 @@ def test_torch_compile_with_custom_engine_cache(self): msg=f"results[1] doesn't match with results[2]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) - assertions.assertTrue( - times[0] > times[2], - msg=f"Engine caching didn't speed up the compilation. 
Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", - ) + [ + assertions.assertTrue( + count == 1, + f"cache was not hit exactly once for entry ({h}, hit: {count})", + ) + for h, count in custom_engine_cache.hashes.items() + ] + + def test_torch_compile_change_input_shape(self): + # Custom Engine Cache + model = models.resnet18(pretrained=True).eval().to("cuda") + + engine_cache_dir = "/tmp/test_torch_compile_with_default_disk_engine_cache" + if os.path.exists(engine_cache_dir): + shutil.rmtree(engine_cache_dir) + + custom_engine_cache = MyEngineCache(engine_cache_dir) + for i in range(3): + # remove timing cache and reset dynamo for engine caching messurement + inputs = [torch.rand((4 * (i + 1), 3, 224, 224)).to("cuda")] + compiled_model = torch.compile( + model, + backend="tensorrt", + options={ + "use_python_runtime": True, + "enabled_precisions": {torch.float}, + "debug": False, + "min_block_size": 1, + "make_refittable": True, + "cache_built_engines": True, + "reuse_cached_engines": True, + "custom_engine_cache": custom_engine_cache, + "torch_executed_ops": {"torch.ops.aten.relu.default"}, + }, + ) + + [ + assertions.assertTrue( + count == 0, f"Unintended cache hit for entry ({h}, hit: {count})" + ) + for h, count in custom_engine_cache.hashes.items() + ] diff --git a/tests/py/dynamo/models/test_model_refit.py b/tests/py/dynamo/models/test_model_refit.py index 9782cd829c..77a0f2809f 100644 --- a/tests/py/dynamo/models/test_model_refit.py +++ b/tests/py/dynamo/models/test_model_refit.py @@ -5,7 +5,6 @@ import numpy as np import pytest -import tensorrt as trt import torch import torch.nn.functional as F import torch_tensorrt as torchtrt @@ -25,6 +24,8 @@ from torch_tensorrt.logging import TRT_LOGGER from transformers import BertModel +import tensorrt as trt + assertions = unittest.TestCase() @@ -57,7 +58,7 @@ def test_mapping(): enabled_precisions=enabled_precisions, debug=debug, min_block_size=min_block_size, - make_refitable=True, + make_refittable=True, ) settings = trt_gm._run_on_acc_0.settings runtime = trt.Runtime(TRT_LOGGER) @@ -109,7 +110,7 @@ def test_refit_one_engine_with_weightmap(): enabled_precisions=enabled_precisions, debug=debug, min_block_size=min_block_size, - make_refitable=True, + make_refittable=True, ) new_trt_gm = refit_module_weights( @@ -159,7 +160,7 @@ def test_refit_one_engine_no_map_with_weightmap(): enabled_precisions=enabled_precisions, debug=debug, min_block_size=min_block_size, - make_refitable=True, + make_refittable=True, ) trt_gm._run_on_acc_0.weight_name_map = None @@ -210,7 +211,7 @@ def test_refit_one_engine_with_wrong_weightmap(): enabled_precisions=enabled_precisions, debug=debug, min_block_size=min_block_size, - make_refitable=True, + make_refittable=True, ) # Manually Deleted all batch norm layer. 
This suppose to fail the fast refit trt_gm._run_on_acc_0.weight_name_map = { @@ -267,7 +268,7 @@ def test_refit_one_engine_bert_with_weightmap(): enabled_precisions=enabled_precisions, debug=debug, min_block_size=min_block_size, - make_refitable=True, + make_refittable=True, ) new_trt_gm = refit_module_weights( @@ -320,7 +321,7 @@ def test_refit_one_engine_inline_runtime__with_weightmap(): enabled_precisions=enabled_precisions, debug=debug, min_block_size=min_block_size, - make_refitable=True, + make_refittable=True, ) torchtrt.save(trt_gm, trt_ep_path, inputs=inputs) trt_gm = torch.export.load(trt_ep_path) @@ -366,7 +367,7 @@ def test_refit_one_engine_python_runtime_with_weightmap(): enabled_precisions=enabled_precisions, debug=debug, min_block_size=min_block_size, - make_refitable=True, + make_refittable=True, ) new_trt_gm = refit_module_weights( @@ -428,7 +429,7 @@ def forward(self, x): exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) - torch_executed_ops = {torch.ops.aten.convolution.default} + torch_executed_ops = {"torch.ops.aten.convolution.default"} trt_gm = torchtrt.dynamo.compile( exp_program, tuple(inputs), @@ -436,7 +437,7 @@ def forward(self, x): enabled_precisions=enabled_precisions, debug=debug, min_block_size=min_block_size, - make_refitable=True, + make_refittable=True, torch_executed_ops=torch_executed_ops, ) @@ -486,7 +487,7 @@ def test_refit_one_engine_without_weightmap(): enabled_precisions=enabled_precisions, debug=debug, min_block_size=min_block_size, - make_refitable=True, + make_refittable=True, ) new_trt_gm = refit_module_weights( @@ -537,7 +538,7 @@ def test_refit_one_engine_bert_without_weightmap(): enabled_precisions=enabled_precisions, debug=debug, min_block_size=min_block_size, - make_refitable=True, + make_refittable=True, ) new_trt_gm = refit_module_weights( @@ -590,7 +591,7 @@ def test_refit_one_engine_inline_runtime_without_weightmap(): enabled_precisions=enabled_precisions, debug=debug, min_block_size=min_block_size, - make_refitable=True, + make_refittable=True, ) torchtrt.save(trt_gm, trt_ep_path, inputs=inputs) trt_gm = torch.export.load(trt_ep_path) @@ -636,7 +637,7 @@ def test_refit_one_engine_python_runtime_without_weightmap(): enabled_precisions=enabled_precisions, debug=debug, min_block_size=min_block_size, - make_refitable=True, + make_refittable=True, ) new_trt_gm = refit_module_weights( @@ -698,7 +699,7 @@ def forward(self, x): exp_program = torch.export.export(model, tuple(inputs)) exp_program2 = torch.export.export(model2, tuple(inputs)) - torch_executed_ops = {torch.ops.aten.convolution.default} + torch_executed_ops = {"torch.ops.aten.convolution.default"} trt_gm = torchtrt.dynamo.compile( exp_program, tuple(inputs), @@ -706,7 +707,7 @@ def forward(self, x): enabled_precisions=enabled_precisions, debug=debug, min_block_size=min_block_size, - make_refitable=True, + make_refittable=True, torch_executed_ops=torch_executed_ops, ) diff --git a/tests/py/dynamo/models/test_models_export.py b/tests/py/dynamo/models/test_models_export.py index bf19c3c5e6..8976c7fbc0 100644 --- a/tests/py/dynamo/models/test_models_export.py +++ b/tests/py/dynamo/models/test_models_export.py @@ -1,7 +1,8 @@ # type: ignore +import importlib import unittest +from importlib import metadata -import modelopt import pytest import timm import torch @@ -11,6 +12,8 @@ from transformers import BertModel from transformers.utils.fx import symbolic_trace as transformers_trace +from packaging.version import Version + 
assertions = unittest.TestCase()
 
 
@@ -195,8 +198,14 @@ def test_resnet18_half(ir):
     torch.cuda.get_device_properties(torch.cuda.current_device()).major < 9,
     "FP8 compilation in Torch-TRT is not supported on cards older than Hopper",
 )
+@unittest.skipIf(
+    not importlib.util.find_spec("modelopt"),
+    reason="ModelOpt is necessary to run this test",
+)
 @pytest.mark.unit
 def test_base_fp8(ir):
+    import modelopt
+
     class SimpleNetwork(torch.nn.Module):
         def __init__(self):
             super(SimpleNetwork, self).__init__()
@@ -241,11 +250,14 @@ def calibrate_loop(model):
 
 
 @unittest.skipIf(
-    modelopt.__version__ < "0.16.1",
-    "Int8 quantization is supported in modelopt since 0.16.1 or later",
+    not importlib.util.find_spec("modelopt")
+    or Version(metadata.version("nvidia-modelopt")) < Version("0.16.1"),
+    "modelopt 0.16.1 or later is required; Int8 quantization is supported in modelopt since 0.16.1",
 )
 @pytest.mark.unit
 def test_base_int8(ir):
+    import modelopt
+
     class SimpleNetwork(torch.nn.Module):
         def __init__(self):
             super(SimpleNetwork, self).__init__()
diff --git a/tests/py/dynamo/runtime/test_002_lazy_engine_init.py b/tests/py/dynamo/runtime/test_002_lazy_engine_init.py
index 008b0f53b1..aafd099bde 100644
--- a/tests/py/dynamo/runtime/test_002_lazy_engine_init.py
+++ b/tests/py/dynamo/runtime/test_002_lazy_engine_init.py
@@ -281,7 +281,7 @@ def forward(self, a, b):
         "ir": "dynamo",
         "lazy_engine_init": True,
         "use_python_runtime": True,
-        "torch_executed_ops": [torch.ops.aten.sub.Tensor],
+        "torch_executed_ops": {"torch.ops.aten.sub.Tensor"},
         "cache_built_engines": False,
         "reuse_cached_engines": False,
     }
@@ -325,7 +325,7 @@ def forward(self, a, b):
         "ir": "dynamo",
         "lazy_engine_init": True,
         "use_python_runtime": False,
-        "torch_executed_ops": [torch.ops.aten.sub.Tensor],
+        "torch_executed_ops": {"torch.ops.aten.sub.Tensor"},
         "cache_built_engines": False,
         "reuse_cached_engines": False,
     }
diff --git a/tests/py/dynamo/runtime/test_mutable_torchtrt_module.py b/tests/py/dynamo/runtime/test_mutable_torchtrt_module.py
index 86e7678a66..b52530efd1 100644
--- a/tests/py/dynamo/runtime/test_mutable_torchtrt_module.py
+++ b/tests/py/dynamo/runtime/test_mutable_torchtrt_module.py
@@ -49,7 +49,7 @@ def test_resnet18():
     compile_spec = {
         "use_python_runtime": False,
         "enabled_precisions": {torch.float32},
-        "make_refitable": True,
+        "make_refittable": True,
     }
 
     model = models.resnet18(pretrained=True).eval().to("cuda")
@@ -89,7 +89,7 @@ def test_save():
     compile_spec = {
         "use_python_runtime": False,
         "enabled_precisions": {torch.float32},
-        "make_refitable": True,
+        "make_refittable": True,
     }
 
     model = models.resnet18(pretrained=True).eval().to("cuda")
@@ -123,7 +123,7 @@ def test_resnet18_modify_attribute():
     compile_spec = {
         "use_python_runtime": False,
         "enabled_precisions": {torch.float32},
-        "make_refitable": True,
+        "make_refittable": True,
     }
 
     model = models.resnet18(pretrained=True).eval().to("cuda")
@@ -164,7 +164,7 @@ def test_resnet18_modify_attribute_no_refit():
     compile_spec = {
         "use_python_runtime": False,
         "enabled_precisions": {torch.float32},
-        "make_refitable": True,
+        "make_refittable": True,
     }
 
     model = models.resnet18(pretrained=True).eval().to("cuda")
@@ -243,7 +243,7 @@ def forward(self, x, b=5, c=None, d=None):
         "optimization_level": 1,
         "min_block_size": 1,
         "ir": "dynamo",
-        "make_refitable": True,
+        "make_refittable": True,
     }
 
     mutable_module = torch_trt.MutableTorchTensorRTModule(model, **compile_spec)
@@ -304,7 +304,7 @@ def set_weights(self):
         "optimization_level": 1,
         "min_block_size": 1,
         "ir": "dynamo",
-        
"make_refitable": True, + "make_refittable": True, } mutable_module = torch_trt.MutableTorchTensorRTModule(model, **compile_spec) @@ -367,7 +367,7 @@ def set_layer(self): "optimization_level": 1, "min_block_size": 1, "ir": "dynamo", - "make_refitable": True, + "make_refittable": True, } mutable_module = torch_trt.MutableTorchTensorRTModule(model, **compile_spec) @@ -436,7 +436,7 @@ def forward(self, x, b=5, c=None, d=None): "optimization_level": 1, "min_block_size": 1, "ir": "dynamo", - "make_refitable": True, + "make_refittable": True, } mutable_module = torch_trt.MutableTorchTensorRTModule(model, **compile_spec) diff --git a/tests/py/requirements.txt b/tests/py/requirements.txt index 5e4842954e..460cc01027 100644 --- a/tests/py/requirements.txt +++ b/tests/py/requirements.txt @@ -9,6 +9,6 @@ pytest-xdist>=3.6.1 pyyaml timm>=1.0.3 transformers==4.40.2 -# TODO: once 0.16.1 is out, update it here -nvidia-modelopt>=0.15.1 +# TODO @lanlao-nvidia Renable when modelopt can be install properly to run the tests +# "nvidia-modelopt[all]">=0.16.1,<0.17.0 --extra-index-url https://pypi.nvidia.com