From 679ca36bb143f14df00cc9bb89fb9324098b4d48 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Thu, 10 Aug 2023 09:54:24 -0400 Subject: [PATCH 1/2] Cleanup the unused files in the root directory --- collect_graph_ir.py | 76 -------- compute_score.py | 71 -------- .../fx2trt-speedup-fp32.yaml | 9 - .../fx2trt-speedup-resize.yaml | 19 -- .../detectron2_speedup/fx2trt-speedup.yaml | 9 - configs/devinfra/cpu.yaml | 7 - configs/devinfra/cuda-113-116-compare.yaml | 10 -- configs/devinfra/cuda-116-117-compare.yaml | 10 -- configs/devinfra/cuda.yaml | 7 - configs/torchdynamo/cudagraph-speedup.yaml | 13 -- configs/torchdynamo/fx2trt-speedup.yaml | 15 -- configs/torchdynamo/nvfuser-aot-speedup.yaml | 10 -- plot_sweep.py | 125 -------------- run_sweep.py | 162 ------------------ 14 files changed, 543 deletions(-) delete mode 100755 collect_graph_ir.py delete mode 100644 compute_score.py delete mode 100644 configs/detectron2_speedup/fx2trt-speedup-fp32.yaml delete mode 100644 configs/detectron2_speedup/fx2trt-speedup-resize.yaml delete mode 100644 configs/detectron2_speedup/fx2trt-speedup.yaml delete mode 100644 configs/devinfra/cpu.yaml delete mode 100644 configs/devinfra/cuda-113-116-compare.yaml delete mode 100644 configs/devinfra/cuda-116-117-compare.yaml delete mode 100644 configs/devinfra/cuda.yaml delete mode 100644 configs/torchdynamo/cudagraph-speedup.yaml delete mode 100644 configs/torchdynamo/fx2trt-speedup.yaml delete mode 100644 configs/torchdynamo/nvfuser-aot-speedup.yaml delete mode 100644 plot_sweep.py delete mode 100644 run_sweep.py diff --git a/collect_graph_ir.py b/collect_graph_ir.py deleted file mode 100755 index 415fef1a22..0000000000 --- a/collect_graph_ir.py +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/bin/env python -import argparse -import gc -import logging -import os -import re -import warnings - -from torchbenchmark import list_models -import torch - -NO_JIT = {"demucs", "dlrm", "maml", "yolov3", "moco", "pytorch_CycleGAN_and_pix2pix", "tacotron2"} -NO_GET_MODULE = {"Background_Matting"} - -def get_dump_filename(name, device, args): - if args.no_profiling: - return f"{name}.{device}.last_executed_graph.noprofile.log" - if args.inlined_graph: - return f"{name}.{device}.inlined_graph.log" - return f"{name}.{device}.last_executed_graph.log" - -def iter_models(args): - device = "cpu" - for benchmark_cls in list_models(): - bench_name = benchmark_cls.name - if args.benchmark and args.benchmark != bench_name: - continue - if bench_name in NO_GET_MODULE: - print(f"{bench_name} has no get_module, skipped") - continue - if bench_name in NO_JIT: - print(f"{bench_name} has no scripted module, skipped") - continue - try: - # disable profiling mode so that the collected graph does not contain - # profiling node - if args.no_profiling: - torch._C._jit_set_profiling_mode(False) - - benchmark = benchmark_cls(device=device, jit=True) - model, example_inputs = benchmark.get_module() - - # extract ScriptedModule object for BERT model - if bench_name == "BERT_pytorch": - model = model.bert - - fname = get_dump_filename(bench_name, device, args) - print(f"Dump Graph IR for {bench_name} to {fname}") - - # default mode need to warm up ProfileExecutor - if not (args.no_profiling or args.inlined_graph): - model.graph_for(*example_inputs) - - with open(fname, 'w') as dump_file: - if args.inlined_graph: - print(model.inlined_graph, file=dump_file) - else: - print(model.graph_for(*example_inputs), file=dump_file) - except NotImplementedError: - print(f"Cannot collect graph IR dump for {bench_name}") - pass - 
-def main(args=None): - parser = argparse.ArgumentParser(description="dump last_executed graph for all benchmarks with JIT implementation") - parser.add_argument("--benchmark", "-b", - help="dump graph for ") - parser.add_argument("--no_profiling", action="store_true", - help="dump last_executed graphs w/o profiling executor") - parser.add_argument("--inlined_graph", action="store_true", - help="dump graphs dumped by module.inlined_graph") - args = parser.parse_args(args) - - iter_models(args) - -if __name__ == '__main__': - main() diff --git a/compute_score.py b/compute_score.py deleted file mode 100644 index b63d79bbf3..0000000000 --- a/compute_score.py +++ /dev/null @@ -1,71 +0,0 @@ - -""" -Compute the benchmark score given a frozen score configuration and current benchmark data. -""" -import argparse -import json -import math -import yaml -import sys -import os - -from torchbenchmark.score.compute_score import TorchBenchScore - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--score_version", choices=['v1', 'v2'], default="v1", - help="which version of score to use - choose from v1 or v2") - parser.add_argument("--benchmark_data_file", - help="pytest-benchmark json file with current benchmark data") - parser.add_argument("--benchmark_data_dir", - help="directory containing multiple .json files for each of which to compute a score") - parser.add_argument("--relative", action='store_true', - help="use the first json file in benchmark data dir instead of the reference yaml") - parser.add_argument("--output-norm-only", action='store_true', - help="use the benchmark data file specified to output reference norm yaml") - args = parser.parse_args() - - if args.benchmark_data_file is None and args.benchmark_data_dir is None: - parser.print_help(sys.stderr) - raise ValueError("Invalid command-line arguments. 
You must specify a data file or a data dir.") - - files = [] - benchmark_data = [] - scores = [] - if args.benchmark_data_file is not None: - with open(args.benchmark_data_file) as data_file: - data = json.load(data_file) - files.append(args.benchmark_data_file) - benchmark_data.append(data) - elif args.benchmark_data_dir is not None: - for f in sorted(os.listdir(args.benchmark_data_dir)): - path = os.path.join(args.benchmark_data_dir, f) - if os.path.isfile(path) and os.path.splitext(path)[1] == '.json': - with open(path) as data_file: - data = json.load(data_file) - files.append(f) - benchmark_data.append(data) - - if args.output_norm_only: - score_config = TorchBenchScore(ref_data=benchmark_data[0], version=args.score_version) - print(yaml.dump(score_config.get_norm(benchmark_data[0]))) - exit(0) - - if args.relative: - score_config = TorchBenchScore(ref_data=benchmark_data[0], version=args.score_version) - else: - score_config = TorchBenchScore(version=args.score_version) - - results = [] - for fname, data in zip(files, benchmark_data): - try: - result = {} - score = score_config.compute_score(data) - result["file"] = fname - result["pytorch_version"] = data['machine_info']['pytorch_version'] - result["score"] = score - results.append(result) - except ValueError as e: - print(f"Error when analyzing file {fname}: {e}") - - print(json.dumps(results, indent=4)) diff --git a/configs/detectron2_speedup/fx2trt-speedup-fp32.yaml b/configs/detectron2_speedup/fx2trt-speedup-fp32.yaml deleted file mode 100644 index 319dae4839..0000000000 --- a/configs/detectron2_speedup/fx2trt-speedup-fp32.yaml +++ /dev/null @@ -1,9 +0,0 @@ -models: - - "detectron2_.*" -device: - - "cuda" -test: - - "eval" -args: - - "--precision fp32" - - "--precision fp32 --torchdynamo fx2trt" diff --git a/configs/detectron2_speedup/fx2trt-speedup-resize.yaml b/configs/detectron2_speedup/fx2trt-speedup-resize.yaml deleted file mode 100644 index 2ae6fe6699..0000000000 --- a/configs/detectron2_speedup/fx2trt-speedup-resize.yaml +++ /dev/null @@ -1,19 +0,0 @@ -models: - - "detectron2_.*" -device: - - "cuda" -test: - - "eval" -batch_size: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 -args: - - "" - - "--resize 448x608" - - "--torchdynamo fx2trt" - - "--torchdynamo fx2trt --resize 448x608" diff --git a/configs/detectron2_speedup/fx2trt-speedup.yaml b/configs/detectron2_speedup/fx2trt-speedup.yaml deleted file mode 100644 index af286cd344..0000000000 --- a/configs/detectron2_speedup/fx2trt-speedup.yaml +++ /dev/null @@ -1,9 +0,0 @@ -models: - - "detectron2_.*" -device: - - "cuda" -test: - - "eval" -args: - - "" - - "--torchdynamo fx2trt" diff --git a/configs/devinfra/cpu.yaml b/configs/devinfra/cpu.yaml deleted file mode 100644 index 5c2940ad96..0000000000 --- a/configs/devinfra/cpu.yaml +++ /dev/null @@ -1,7 +0,0 @@ -device: - - "cpu" -test: - - "train" - - "eval" -args: - - "" diff --git a/configs/devinfra/cuda-113-116-compare.yaml b/configs/devinfra/cuda-113-116-compare.yaml deleted file mode 100644 index f4d1a9092d..0000000000 --- a/configs/devinfra/cuda-113-116-compare.yaml +++ /dev/null @@ -1,10 +0,0 @@ -cuda_version: - - "11.3" - - "11.6" -device: - - "cuda" -test: - - "train" - - "eval" -args: - - "" diff --git a/configs/devinfra/cuda-116-117-compare.yaml b/configs/devinfra/cuda-116-117-compare.yaml deleted file mode 100644 index 969aa8bca4..0000000000 --- a/configs/devinfra/cuda-116-117-compare.yaml +++ /dev/null @@ -1,10 +0,0 @@ -cuda_version: - - "11.6" - - "11.7" -device: - - "cuda" -test: - - "train" - - "eval" -args: - 
- "" diff --git a/configs/devinfra/cuda.yaml b/configs/devinfra/cuda.yaml deleted file mode 100644 index 1c0253723e..0000000000 --- a/configs/devinfra/cuda.yaml +++ /dev/null @@ -1,7 +0,0 @@ -device: - - "cuda" -test: - - "train" - - "eval" -args: - - "" diff --git a/configs/torchdynamo/cudagraph-speedup.yaml b/configs/torchdynamo/cudagraph-speedup.yaml deleted file mode 100644 index 4ba7f994b3..0000000000 --- a/configs/torchdynamo/cudagraph-speedup.yaml +++ /dev/null @@ -1,13 +0,0 @@ -# Sample benchmark config -# Runs a test matrix of [device x test x test_args] -# Currently, `summarize.py` only supports single device and single test -# The first combination will be used as the baseline -# In this example, baseline is ("cuda", "eval", "") -device: - - "cuda" -test: - - "eval" -args: - # empty argument means the default pytorch eager mode - - "" - - "--torchdynamo cudagraphs" diff --git a/configs/torchdynamo/fx2trt-speedup.yaml b/configs/torchdynamo/fx2trt-speedup.yaml deleted file mode 100644 index a376458904..0000000000 --- a/configs/torchdynamo/fx2trt-speedup.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# Sample benchmark config -# Runs a test matrix of [device x test x test_args] -# Currently, `summarize.py` only supports single device and single test -# The first combination will be used as the baseline -# In this example, baseline is ("cuda", "eval", "") -device: - - "cuda" -test: - - "eval" -args: - # empty argument means the default pytorch eager mode - - "" - - "--fx2trt" - - "--torchdynamo eager" - - "--torchdynamo fx2trt" \ No newline at end of file diff --git a/configs/torchdynamo/nvfuser-aot-speedup.yaml b/configs/torchdynamo/nvfuser-aot-speedup.yaml deleted file mode 100644 index 976b2c56f2..0000000000 --- a/configs/torchdynamo/nvfuser-aot-speedup.yaml +++ /dev/null @@ -1,10 +0,0 @@ -# Test the speedup with dynamo+nvfuser+aotautograd -device: - - "cuda" -test: - - "train" -args: - # empty argument means the default pytorch eager mode - - "" - - "--torchdynamo nvfuser" - - "--torchdynamo aot_nvfuser" diff --git a/plot_sweep.py b/plot_sweep.py deleted file mode 100644 index 5bc65375ba..0000000000 --- a/plot_sweep.py +++ /dev/null @@ -1,125 +0,0 @@ -import argparse -import json -# import pandas as pd -import os -# import sys -# import re -import yaml -import itertools - -# from bokeh.layouts import column, row, layout, gridplot -# from bokeh.plotting import figure, output_file, show -# from bokeh.sampledata.autompg import autompg -# from bokeh.transform import jitter -from bokeh.palettes import Category10 -from bokeh.models import HoverTool, Div, Range1d, HoverTool -from bokeh.plotting import figure, output_file, show -# from bokeh.models import Legend -# from bokeh.models import ColumnDataSource, CategoricalTicker, Div -# from bokeh.models import ColumnDataSource, DataTable, DateFormatter, TableColumn -# from bokeh.transform import jitter -from collections import defaultdict -from datetime import datetime as dt -from torchbenchmark.util.data import load_data_dir, load_data_files -from torchbenchmark.score.compute_score import TorchBenchScore - -TORCHBENCH_SCORE_VERSION = "v1" - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("data_dir", nargs='+', - help="One or more directories containing benchmark json files. " - "Each directory will be plotted as a separate series. 
" - "By default, the first file in the first directory will be used" - " to generate a score configuration with a target of 1000," - " and everything else will be relative to that.") - parser.add_argument("--output_html", default='plot.html', help="html file to write") - parser.add_argument("--plot_all", action='store_true', - help="Plots the scores for each configuration") - parser.add_argument("--reference_json", required=True, - help="file defining score norm values, usually first json in first data_dir") - - args = parser.parse_args() - plot_height = 800 - plot_width = 1000 - - assert len(args.data_dir) > 0, "Must provide at least one data directory" - compare_datasets = [load_data_dir(d, most_recent_files=-1) for d in args.data_dir] - - with open(args.reference_json) as f: - ref_data = json.load(f) - plot_all = args.plot_all - score_config = TorchBenchScore(ref_data=ref_data, version=TORCHBENCH_SCORE_VERSION) - - p = figure(plot_width=plot_width, plot_height=plot_height, - x_axis_type='datetime') - - xs = [] - ys = [] - zs = [] - max_score = 0 - for d in compare_datasets: - scores = {} - scores_db = defaultdict(list) - for i in range(len(d._json_raw)): - data = d._json_raw[i] - pytorch_ver = data['machine_info']['pytorch_version'] - # Slice the portion after '+' - pytorch_ver_cuda_loc = pytorch_ver.rfind('+') - pytorch_ver = pytorch_ver[:pytorch_ver_cuda_loc] - date = dt.strptime(pytorch_ver[pytorch_ver.index("dev") + len("dev"):], "%Y%m%d") - score = score_config.compute_score(data) - scores[date] = score - - dates = [] - total_scores = [] - all_scores = [] - for date in sorted(scores.keys()): - dates.append(date) - total_scores.append(scores[date]["total"]) - max_score = max(max_score, max(total_scores)) - all_scores.append(scores[date]) - xs.append(dates) - ys.append(total_scores) - if plot_all: - zs.append(all_scores) - - colors = itertools.cycle(Category10[10]) - basenames = map(os.path.basename, args.data_dir) - - if plot_all: - for x, z in zip(xs, zs): - basename = next(basenames) - color = next(colors) - configs = z[0].keys() - for config in configs: - if not ("subscore" in config or "total" in config): - continue - color = next(colors) - scores = [] - for s in z: - scores.append(s[config]) - p.line(x, scores, color=color, line_width=2, legend_label=basename + '-' + config) - - p.legend.click_policy = "hide" - else: - for x, y, color in zip(xs, ys, colors): - p.line(x, y, color=color, line_width=2, legend_label=next(basenames)) - - for x, y, color in zip(xs, ys, colors): - p.circle(x, y, color=color) - - p.legend.location = "bottom_right" - p.y_range = Range1d(0, max_score * 1.25) - p.add_tools(HoverTool( - tooltips=[ - ('date', '@x{%F}'), - ('score', '@y{0.00 a}'), - ], - formatters={ - '@x': 'datetime', - '@y': 'numeral', - }, - )) - output_file(args.output_html) - show(p) diff --git a/run_sweep.py b/run_sweep.py deleted file mode 100644 index e9ba079186..0000000000 --- a/run_sweep.py +++ /dev/null @@ -1,162 +0,0 @@ -""" -Run a config of benchmarking with a list of models. -If unspecified, run a sweep of all models. 
-""" -import argparse -import json -import os -import sys -import numpy -import sys -import torch -import time -import pathlib -import dataclasses -import itertools -import torch -from typing import List, Optional, Dict, Any, Tuple -from torchbenchmark import ModelTask - -WARMUP_ROUNDS = 3 -WORKER_TIMEOUT = 600 # seconds -MODEL_DIR = ['torchbenchmark', 'models'] -NANOSECONDS_PER_MILLISECONDS = 1_000_000.0 - -def run_one_step(func, device: str, nwarmup=WARMUP_ROUNDS, num_iter=10) -> Tuple[float, Optional[Tuple[torch.Tensor]]]: - "Run one step of the model, and return the latency in milliseconds." - # Warm-up `nwarmup` rounds - for _i in range(nwarmup): - func() - result_summary = [] - for _i in range(num_iter): - if device == "cuda": - torch.cuda.synchronize() - # Collect time_ns() instead of time() which does not provide better precision than 1 - # second according to https://docs.python.org/3/library/time.html#time.time. - t0 = time.time_ns() - func() - torch.cuda.synchronize() # Wait for the events to be recorded! - t1 = time.time_ns() - else: - t0 = time.time_ns() - func() - t1 = time.time_ns() - result_summary.append((t1 - t0) / NANOSECONDS_PER_MILLISECONDS) - wall_latency = numpy.median(result_summary) - return wall_latency - -@dataclasses.dataclass -class ModelTestResult: - name: str - test: str - device: str - extra_args: List[str] - status: str - batch_size: Optional[int] - precision: str - results: Dict[str, Any] - -def _list_model_paths(models: List[str]) -> List[str]: - p = pathlib.Path(__file__).parent.joinpath(*MODEL_DIR) - model_paths = sorted(child for child in p.iterdir() if child.is_dir()) - valid_model_paths = sorted(filter(lambda x: x.joinpath("__init__.py").exists(), model_paths)) - if models: - valid_model_paths = sorted(filter(lambda x: x.name in models, valid_model_paths)) - return valid_model_paths - -def _validate_tests(tests: str) -> List[str]: - tests_list = list(map(lambda x: x.strip(), tests.split(","))) - valid_tests = ['train', 'eval'] - for t in tests_list: - if t not in valid_tests: - raise ValueError(f'Invalid test {t} passed into --tests. Expected tests: {valid_tests}.') - return tests_list - -def _validate_devices(devices: str) -> List[str]: - devices_list = list(map(lambda x: x.strip(), devices.split(","))) - valid_devices = ['cpu', 'cuda'] - for d in devices_list: - if d not in valid_devices: - raise ValueError(f'Invalid device {d} passed into --devices. Expected devices: {valid_devices}.') - return devices_list - -def _run_model_test(model_path: pathlib.Path, test: str, device: str, jit: bool, batch_size: Optional[int], extra_args: List[str]) -> ModelTestResult: - assert test == "train" or test == "eval", f"Test must be either 'train' or 'eval', but get {test}." - result = ModelTestResult(name=model_path.name, test=test, device=device, extra_args=extra_args, batch_size=None, precision="fp32", - status="OK", results={}) - # Run the benchmark test in a separate process - print(f"Running model {model_path.name} ... 
", end='', flush=True) - status: str = "OK" - bs_name = "batch_size" - correctness_name = "correctness" - error_message: Optional[str] = None - try: - task = ModelTask(os.path.basename(model_path), timeout=WORKER_TIMEOUT) - if not task.model_details.exists: - status = "NotExist" - return - task.make_model_instance(test=test, device=device, jit=jit, batch_size=batch_size, extra_args=extra_args) - # Check the batch size in the model matches the specified value - result.batch_size = task.get_model_attribute(bs_name) - result.precision = task.get_model_attribute("dargs", "precision") - if batch_size and (not result.batch_size == batch_size): - raise ValueError(f"User specify batch size {batch_size}, but model {result.name} runs with batch size {result.batch_size}. Please report a bug.") - result.results["latency_ms"] = run_one_step(task.invoke, device) - # if NUM_BATCHES is set, update to per-batch latencies - num_batches = task.get_model_attribute("NUM_BATCHES") - if num_batches: - result.results["latency_ms"] = result.results["latency_ms"] / num_batches - # if the model provides eager eval result, save it for cosine similarity - correctness = task.get_model_attribute(correctness_name) - if correctness is not None: - result.results[correctness_name] = str(correctness) - except NotImplementedError as e: - status = "NotImplemented" - error_message = str(e) - except TypeError as e: # TypeError is raised when the model doesn't support variable batch sizes - status = "TypeError" - error_message = str(e) - except KeyboardInterrupt as e: - status = "UserInterrupted" - error_message = str(e) - except Exception as e: - status = f"{type(e).__name__}" - error_message = str(e) - finally: - print(f"[ {status} ]") - result.status = status - if error_message: - result.results["error_message"] = error_message - if status == "UserInterrupted": - sys.exit(1) - return result - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("-m", "--models", nargs='+', default=[], - help="Specify one or more models to run. 
If not set, trigger a sweep-run on all models.") - parser.add_argument("-t", "--tests", required=True, type=_validate_tests, help="Specify tests, choice of train, or eval.") - parser.add_argument("-d", "--devices", required=True, type=_validate_devices, help="Specify devices, choice of cpu, or cuda.") - parser.add_argument("-b", "--bs", type=int, help="Specify batch size.") - parser.add_argument("--jit", action='store_true', help="Turn on torchscript.") - parser.add_argument("-o", "--output", type=str, default="tb-output.json", help="The default output json file.") - parser.add_argument("--proper-bs", action='store_true', help="Find the best batch_size for current devices.") - args, extra_args = parser.parse_known_args() - args.models = _list_model_paths(args.models) - results = [] - for element in itertools.product(*[args.models, args.tests, args.devices]): - model_path, test, device = element - if args.proper_bs: - if test != 'eval': - print("Error: Only batch size of eval test is tunable.") - sys.exit(1) - from scripts.proper_bs import _run_model_test_proper_bs - r = _run_model_test_proper_bs(model_path, test, device, args.jit, batch_size=args.bs, extra_args=extra_args) - else: - r = _run_model_test(model_path, test, device, args.jit, batch_size=args.bs, extra_args=extra_args) - results.append(r) - results_to_export = list(map(lambda x: dataclasses.asdict(x), results)) - parent_dir = pathlib.Path(args.output).parent - parent_dir.mkdir(exist_ok=True, parents=True) - with open(args.output, "w") as outfile: - json.dump(results_to_export, outfile, indent=4) From 6bfc03ace827376c5b0082d7b7e37de99621a4d8 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Thu, 10 Aug 2023 11:49:09 -0400 Subject: [PATCH 2/2] Upgrade numpy pin version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 37232735d8..157c13f55d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,7 +16,7 @@ psutil pyyaml # pytorch build script pins numpy version # https://github.com/pytorch/builder/blob/ae5c82e65cb3d8bac6df50e742a195019af91ad3/wheel/build_wheel.sh#L145 -numpy==1.21.2 +numpy==1.23.5 # Need https://github.com/kornia/kornia/commit/53808e5 to work on PyTorch nightly git+https://github.com/kornia/kornia.git@b7050c3 scipy # for lazy_bench.py
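A minimal sanity check, not part of the patch itself: it only assumes numpy is importable and mirrors the 1.23.5 pin shown in the requirements.txt hunk above, and could be run before benchmarking to confirm the installed numpy matches the new pin.

# Hypothetical helper, not from the repository: verify the installed numpy
# matches the version pinned in requirements.txt by PATCH 2/2.
import numpy

EXPECTED = "1.23.5"  # pin introduced by this patch

if numpy.__version__ != EXPECTED:
    raise RuntimeError(
        f"Installed numpy is {numpy.__version__}, but requirements.txt pins {EXPECTED}"
    )
print(f"numpy {numpy.__version__} matches the requirements.txt pin")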