Hash model inputs instead of parameters #324

Merged: 40 commits, Jun 22, 2023

Changes from 10 commits

Commits (40)
e417eba
Basic input hashing
Jun 14, 2023
5b9b188
Showing workload status correctly
Jun 14, 2023
7997a9b
Showing workload hash rather than model hash
Jun 15, 2023
056b90b
Temporarily modifying docker file to enable CI
Jun 15, 2023
a03f183
Merge branch 'main' into robust_hashing
Jun 15, 2023
ca64739
Merge main into branch
Jun 15, 2023
7d29c5a
Revert fs changes
Jun 15, 2023
3c6cb9e
Robust shape extraction
Jun 15, 2023
d3fece5
Update ci test hash
Jun 15, 2023
9da81c9
Update analysis CI
Jun 15, 2023
8aa05f7
Add test
Jun 15, 2023
52c5e16
Updated dockerfile
Jun 15, 2023
6aaa2f0
Add requirement
Jun 16, 2023
e02df17
Added input shape to print
Jun 16, 2023
cc8f5b6
Merge branch 'main' into robust_hashing
jeremyfowers Jun 16, 2023
8593f8b
Simplify llama code
Jun 16, 2023
46993ed
Merge branch 'robust_hashing' of https://github.com/groq/mlagility in…
Jun 16, 2023
676036f
recursively printing for each model
Jun 16, 2023
f6fd7f1
Keeping track of parent workload hash
Jun 16, 2023
e462373
Correctly printing when max_depth is set
Jun 16, 2023
5f1ba1d
Ensure that hashes are different if they come from different workloads
Jun 16, 2023
417c701
Fix CI
Jun 17, 2023
b2be4bb
Correctly keeping track of last workload executed
Jun 20, 2023
c972930
Better UI
Jun 20, 2023
6abf7ac
Added test
Jun 20, 2023
0d34fa7
Renamed function as suggested
Jun 20, 2023
b4b6371
Change model to workload where appropriate
Jun 21, 2023
60862bd
Fix CI
Jun 21, 2023
cdfef11
Fix slurm CI
Jun 21, 2023
328938f
Revert "Fix slurm CI"
Jun 21, 2023
7c6668d
Revert "Fix CI"
Jun 21, 2023
2651512
Revert "Change model to workload where appropriate"
Jun 21, 2023
5d85530
Suggested changes
Jun 21, 2023
c67f140
Replacing the term workloads by invocations
Jun 21, 2023
336d563
Add documentation for new feature
Jun 21, 2023
6c06115
merge main into branch
Jun 21, 2023
e939b8f
Merge branch 'main' into robust_hashing
jeremyfowers Jun 22, 2023
17f0e69
Fix tutorial typo
jeremyfowers Jun 22, 2023
18aca16
Copy editing the multiple invocation tutorial
jeremyfowers Jun 22, 2023
daf13a8
Note issue in the code
jeremyfowers Jun 22, 2023
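
The gist of the diffs that follow: model hashes are now computed without parameters (hash_params=False), and each distinct invocation of a model is identified by a workload hash built from the model hash plus the input shapes and dtypes. Below is a minimal sketch of that scheme, assuming a simplified local stand-in for onnxflow's build.get_shapes_and_dtypes; the helper names here are illustrative, not the library code.

import hashlib
import numpy as np
import torch


def sketch_shapes_and_dtypes(inputs: dict):
    # Simplified stand-in for onnxflow's build.get_shapes_and_dtypes()
    shapes, dtypes = {}, {}
    for key, value in inputs.items():
        arr = np.array(value.detach()) if torch.is_tensor(value) else np.array(value)
        shapes[key] = arr.shape
        dtypes[key] = arr.dtype.name
    return shapes, dtypes


def sketch_workload_hash(model_hash: str, args: tuple, kwargs: dict) -> str:
    # Positional args are folded into the kwargs dict, as in get_workload_hash()
    merged = {**kwargs, **{f"positional{i + 1}": a for i, a in enumerate(args)}}
    shapes, dtypes = sketch_shapes_and_dtypes(merged)
    return hashlib.sha256(f"{model_hash}{shapes}{dtypes}".encode()).hexdigest()[:8]


# Same model hash, different input shapes -> different workload hashes
print(sketch_workload_hash("aaaa1111", (torch.ones(1, 8),), {}))
print(sketch_workload_hash("aaaa1111", (torch.ones(4, 8),), {}))

Two calls into the same model with different input shapes therefore yield different workload hashes, which is what lets the analysis track and report them separately.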
90 changes: 59 additions & 31 deletions src/mlagility/analysis/analysis.py
@@ -8,7 +8,8 @@
import functools
import dataclasses
import traceback
from typing import Union, List, Dict
import hashlib
from typing import Union, List, Dict, Tuple
from types import FrameType, TracebackType
from enum import Enum
import torch
@@ -63,28 +64,31 @@ def torch_activations(self) -> List[str]:
return act


def _store_traceback(model_info: util.ModelInfo):
def _store_traceback(workload_info: util.WorkloadInfo):
"""
Store the traceback from an exception into model_info so that
Store the traceback from an exception into workload_info so that
we can print it during the status update.
"""

exc_type, exc_value, exc_traceback = sys.exc_info()
model_info.traceback = traceback.format_exception(
workload_info.traceback = traceback.format_exception(
exc_type, exc_value, exc_traceback
)


def call_benchit(
model_inputs: dict, model_info: util.ModelInfo, tracer_args: TracerArgs
model_inputs: dict,
model_info: util.ModelInfo,
workload_info: util.WorkloadInfo,
tracer_args: TracerArgs,
) -> None:
"""
Calls the benchit function from within the model forward function
"""

# Update status to "computing"
model_info.status_message = "Computing..."
model_info.status_message_color = printing.Colors.OKBLUE
workload_info.status_message = "Computing..."
workload_info.status_message_color = printing.Colors.OKBLUE
status.update(tracer_args.models_found)

# Get a copy of the keyword arguments
@@ -111,10 +115,10 @@ def call_benchit(
inputs[all_args[i]] = torch.tensor(args[i].detach().numpy())
else:
inputs[all_args[i]] = args[i]
model_info.inputs = inputs
workload_info.inputs = inputs

build_name = filesystem.get_build_name(
tracer_args.script_name, tracer_args.labels, model_info.hash
tracer_args.script_name, tracer_args.labels, workload_info.hash
)

# Save model labels
@@ -124,12 +128,12 @@ def call_benchit(
perf = None
try:
if model_info.model_type == build.ModelType.PYTORCH_COMPILED:
model_info.status_message = (
workload_info.status_message = (
"Skipping model compiled using torch.compile(). "
"benchit requires models to be in eager mode "
"(regardless of what runtime you have selected)."
)
model_info.status_message_color = printing.Colors.WARNING
workload_info.status_message_color = printing.Colors.WARNING
else:
perf = benchmark_model(
model_info.model,
@@ -150,36 +154,36 @@
onnx_opset=tracer_args.onnx_opset,
)
if Action.BENCHMARK in tracer_args.actions:
model_info.status_message = "Model successfully benchmarked!"
model_info.performance = perf
model_info.status_message_color = printing.Colors.OKGREEN
workload_info.status_message = "Model successfully benchmarked!"
workload_info.performance = perf
workload_info.status_message_color = printing.Colors.OKGREEN
else:
model_info.status_message = "Model successfully built!"
model_info.status_message_color = printing.Colors.OKGREEN
workload_info.status_message = "Model successfully built!"
workload_info.status_message_color = printing.Colors.OKGREEN

except exp.StageError:
build_state = build.load_state(
cache_dir=tracer_args.cache_dir, build_name=build_name
)
model_info.status_message = "Build Error: see log files for details."
model_info.status_message_color = printing.Colors.WARNING
workload_info.status_message = "Build Error: see log files for details."
workload_info.status_message_color = printing.Colors.WARNING

_store_traceback(model_info)
_store_traceback(workload_info)

except exp.Error:
model_info.status_message = "GroqFlowError: see log files for details."
model_info.status_message_color = printing.Colors.WARNING
workload_info.status_message = "GroqFlowError: see log files for details."
workload_info.status_message_color = printing.Colors.WARNING

_store_traceback(model_info)
_store_traceback(workload_info)

# This broad exception is ok since enumerating all exceptions is
# not possible, as the tested software continuously evolves.
except Exception as e: # pylint: disable=broad-except
util.stop_stdout_forward()
model_info.status_message = f"Unknown benchit error: {e}"
model_info.status_message_color = printing.Colors.WARNING
workload_info.status_message = f"Unknown benchit error: {e}"
workload_info.status_message_color = printing.Colors.WARNING

_store_traceback(model_info)
_store_traceback(workload_info)
finally:
# Ensure that stdout is not being forwarded before updating status
if hasattr(sys.stdout, "terminal"):
@@ -247,7 +251,23 @@ def call_benchit(
def get_model_hash(
model: Union[torch.nn.Module, "tf.keras.Model"], model_type: build.ModelType
):
return build.hash_model(model, model_type, hash_params=True)[:8]
return build.hash_model(model, model_type, hash_params=False)[:8]


def get_workload_hash(model_hash: str, args: Tuple, kwargs: Dict) -> str:
"""
Combines the model hash and the input shapes to create the workload hash
"""

# Merge positional and keyword args
args = {"positional{}".format(i + 1): arg for i, arg in enumerate(args)}
kwargs = {**kwargs, **args}

# Get input shapes and types
input_shapes, input_dtypes = build.get_shapes_and_dtypes(kwargs)

hashable_content = f"{model_hash}{input_shapes}{input_dtypes}"
return hashlib.sha256(hashable_content.encode()).hexdigest()[:8]


def store_model_info(
Expand Down Expand Up @@ -292,7 +312,6 @@ def store_model_info(
depth=depth,
hash=model_hash,
parent_hash=parent_hash,
is_target=model_hash in tracer_args.targets or tracer_args.targets == [],
build_model=build_model,
model_type=model_type,
script_name=tracer_args.script_name,
@@ -439,21 +458,30 @@ def forward_spy(*args, **kwargs):
parent_hash,
)
model_hash = get_model_hash(local_var, model_type)
workload_hash = get_workload_hash(model_hash, args, kwargs)
model_info = tracer_args.models_found[model_hash]
model_info.exec_time = model_info.exec_time + end_time - start_time
if workload_hash not in model_info.workloads:
model_info.workloads[workload_hash] = util.WorkloadInfo(
hash=workload_hash,
is_target=workload_hash in tracer_args.targets
or tracer_args.targets == [],
)
workload_info = model_info.workloads[workload_hash]
workload_info.exec_time = workload_info.exec_time + end_time - start_time

model_info.executed = model_info.executed + 1
workload_info.executed = workload_info.executed + 1

# Call groqit if this is the first time the model is being executed
# and this model has been selected by the user
if (
model_info.executed == 1
and model_info.is_target
workload_info.executed == 1
and workload_info.is_target
and (model_info.build_model)
):
call_benchit(
model_inputs=[args, kwargs],
model_info=model_info,
workload_info=workload_info,
tracer_args=tracer_args,
)
# Ensure that groqit() doesn't interfere with our execution count
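
For context on the forward_spy changes above, a hedged sketch of the bookkeeping they introduce: execution counts and timings now live on a per-workload record keyed by workload hash under ModelInfo.workloads, and benchmarking is only triggered the first time a targeted workload executes. The dataclasses below are trimmed, hypothetical stand-ins for util.ModelInfo and util.WorkloadInfo, not the actual classes.

from dataclasses import dataclass, field
from typing import Dict, List


@dataclass
class WorkloadSketch:
    hash: str
    is_target: bool = False
    executed: int = 0
    exec_time: float = 0.0


@dataclass
class ModelSketch:
    hash: str
    build_model: bool = True
    workloads: Dict[str, WorkloadSketch] = field(default_factory=dict)


def record_invocation(
    model: ModelSketch, workload_hash: str, elapsed: float, targets: List[str]
) -> bool:
    # Create the record the first time this (model, input signature) pair is seen
    if workload_hash not in model.workloads:
        model.workloads[workload_hash] = WorkloadSketch(
            hash=workload_hash,
            is_target=workload_hash in targets or targets == [],
        )
    workload = model.workloads[workload_hash]
    workload.exec_time += elapsed
    workload.executed += 1
    # benchit() would only be called on the first execution of a targeted workload
    return workload.executed == 1 and workload.is_target and model.build_model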
71 changes: 45 additions & 26 deletions src/mlagility/analysis/status.py
@@ -23,25 +23,45 @@ def recursive_print(
models_found: Dict[str, ModelInfo],
parent_hash: Union[str, None] = None,
script_names_visited: List[str] = False,
depth: int = 0,
) -> None:
script_names_visited = []

for h in models_found.keys():
if parent_hash == models_found[h].parent_hash and models_found[h].executed > 0:
print_file_name = models_found[h].script_name not in script_names_visited
for model_hash in models_found.keys():
workloads_executed = False
for workload_hash in models_found[model_hash].workloads.keys():
workload = models_found[model_hash].workloads[workload_hash]

print_model(models_found[h], h, print_file_name)
if (
parent_hash == models_found[model_hash].parent_hash
and workload.executed > 0
):

if print_file_name:
script_names_visited.append(models_found[h].script_name)
workloads_executed = True
print_file_name = False
if models_found[model_hash].script_name not in script_names_visited:
script_names_visited.append(models_found[model_hash].script_name)
if depth == 0:
print_file_name = True

print_workload(models_found[model_hash], workload_hash, print_file_name)

if print_file_name:
script_names_visited.append(models_found[model_hash].script_name)

if workloads_executed:
recursive_print(
models_found, parent_hash=h, script_names_visited=script_names_visited
models_found,
parent_hash=model_hash,
script_names_visited=script_names_visited,
depth=depth + 1,
)


def print_model(
model_info: ModelInfo, model_hash: Union[str, None], print_file_name: bool = False
def print_workload(
model_info: ModelInfo,
workload_hash: Union[str, None],
print_file_name: bool = False,
) -> None:
"""
Print information about a given model or submodel
@@ -54,12 +74,13 @@ def print_model(
# Show the number of times the model has been executed
# Only show the execution time if we are not running benchit() as this
# impacts time measurement.
if model_info.exec_time == 0 or model_info.build_model:
workload = model_info.workloads[workload_hash]
if workload.exec_time == 0 or model_info.build_model:
exec_time = ""
else:
exec_time = f" - {model_info.exec_time:.2f}s"
exec_time = f" - {workload.exec_time:.2f}s"
printing.logn(
f"(executed {model_info.executed}x{exec_time})",
f"(executed {workload.executed}x{exec_time})",
c=printing.Colors.OKGREEN,
)

@@ -78,34 +99,32 @@ def print_model(
model_size = model_info.params * 2 / (1024 * 1024)
model_size = "{:.1f}".format(model_size) if model_size > 0.1 else "<0.1"
print(f"{ident}\tParameters:\t{'{:,}'.format(model_info.params)} ({model_size} MB)")
print(f"{ident}\tHash:\t\t" + model_hash)
print(f"{ident}\tHash:\t\t" + workload_hash)

# Print benchit results if benchit was run
if model_info.performance:
if workload.performance:
printing.log(f"{ident}\tStatus:\t\t")
printing.logn(
f"Successfully benchmarked on {model_info.performance.device} ({model_info.performance.runtime} v{model_info.performance.runtime_version})",
c=model_info.status_message_color,
f"Successfully benchmarked on {workload.performance.device} ({workload.performance.runtime} v{workload.performance.runtime_version})",
c=workload.status_message_color,
)
printing.logn(
f"{ident}\t\t\tMean Latency:\t{model_info.performance.mean_latency:.3f}"
f"\t{model_info.performance.latency_units}"
f"{ident}\t\t\tMean Latency:\t{workload.performance.mean_latency:.3f}"
f"\t{workload.performance.latency_units}"
)
printing.logn(
f"{ident}\t\t\tThroughput:\t{model_info.performance.throughput:.1f}"
f"\t{model_info.performance.throughput_units}"
f"{ident}\t\t\tThroughput:\t{workload.performance.throughput:.1f}"
f"\t{workload.performance.throughput_units}"
)
print()
else:
if model_info.is_target and model_info.build_model:
if workload.is_target and model_info.build_model:
printing.log(f"{ident}\tStatus:\t\t")
printing.logn(
f"{model_info.status_message}", c=model_info.status_message_color
)
printing.logn(f"{workload.status_message}", c=workload.status_message_color)

if model_info.traceback is not None:
if workload.traceback is not None:
if os.environ.get("MLAGILITY_TRACEBACK") != "False":
for line in model_info.traceback:
for line in workload.traceback:
for subline in line.split("\n")[:-1]:
print(f"{ident}\t{subline}")

27 changes: 18 additions & 9 deletions src/mlagility/analysis/util.py
@@ -2,6 +2,7 @@
from dataclasses import dataclass
from typing import Callable, List, Union, Dict
import inspect
import dataclasses
import torch
import onnx
from onnxflow.common import printing
@@ -15,6 +16,20 @@ class AnalysisException(Exception):
"""


@dataclass
class WorkloadInfo:
hash: Union[str, None] = None
performance: MeasuredPerformance = None
traceback: List[str] = None
inputs: Union[dict, None] = None
executed: int = 0
exec_time: float = 0.0
status_message: str = ""
is_target: bool = False
status_message_color: printing.Colors = printing.Colors.ENDC
traceback_message_color: printing.Colors = printing.Colors.FAIL


@dataclass
class ModelInfo:
model: torch.nn.Module
@@ -26,18 +41,12 @@ class ModelInfo:
depth: int = 0
hash: Union[str, None] = None
parent_hash: Union[str, None] = None
inputs: Union[dict, None] = None
executed: int = 0
exec_time: float = 0.0
old_forward: Union[Callable, None] = None
status_message: str = ""
status_message_color: printing.Colors = printing.Colors.ENDC
traceback_message_color: printing.Colors = printing.Colors.FAIL
is_target: bool = False
workloads: Union[Dict[str, WorkloadInfo], None] = dataclasses.field(
default_factory=dict
)
build_model: bool = False
model_type: build.ModelType = build.ModelType.PYTORCH
performance: MeasuredPerformance = None
traceback: List[str] = None

def __post_init__(self):
self.params = count_parameters(self.model, self.model_type)
2 changes: 1 addition & 1 deletion src/mlagility/api/Dockerfile
@@ -3,4 +3,4 @@ from httpd

RUN apt-get update && apt-get install -y --no-install-recommends python3-dev python3-setuptools python3-wheel python3-pip
ENV PYTHONPATH "${PYTHONPATH}:/usr/bin/python3"
RUN pip install onnxruntime==1.14.1
RUN pip install onnxruntime --break-system-packages
5 changes: 4 additions & 1 deletion src/onnxflow/common/build.py
@@ -168,7 +168,10 @@ def get_shapes_and_dtypes(inputs: dict):
subkey = f"{key}[{i}]"
shapes[subkey] = np.array(v).shape
dtypes[subkey] = np.array(v).dtype.name
elif torch.is_tensor(value) or tf_helpers.is_keras_tensor(value):
elif torch.is_tensor(value):
shapes[key] = np.array(value.detach()).shape
dtypes[key] = np.array(value.detach()).dtype.name
elif tf_helpers.is_keras_tensor(value):
shapes[key] = np.array(value).shape
dtypes[key] = np.array(value).dtype.name
elif isinstance(value, np.ndarray):
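
The get_shapes_and_dtypes change above splits torch tensors into their own branch and detaches them before inspection, presumably because np.array() fails on tensors attached to the autograd graph. A small sketch of that behavior, assuming current PyTorch semantics:

import numpy as np
import torch

x = torch.ones(2, 3, requires_grad=True)

try:
    np.array(x)  # fails for tensors that are part of the autograd graph
except RuntimeError as err:
    print(f"without detach: {err}")

# After detaching, shape and dtype can be read as before
print(np.array(x.detach()).shape, np.array(x.detach()).dtype.name)  # (2, 3) float32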