
[FEAT] Performance Profiler #495

Closed · wants to merge 33 commits
Changes from all commits
33 commits
0f40ecb  add device_spec (jeromeku, Jul 7, 2024)
876f59b  add performance counter (jeromeku, Jul 7, 2024)
6943a4a  add more perf counter tools (jeromeku, Jul 7, 2024)
458c9dc  add performance counter manager test (jeromeku, Jul 7, 2024)
c9364bb  add mbu and mfu test (jeromeku, Jul 7, 2024)
17d1518  refactor performance manager device spec (jeromeku, Jul 7, 2024)
792ba23  add perf stats (jeromeku, Jul 8, 2024)
0af07fb  start perf counter manager test refactor (jeromeku, Jul 8, 2024)
0a7cf67  add stat print str (jeromeku, Jul 8, 2024)
7c9b1b1  refactor performance counter with perf stats (jeromeku, Jul 8, 2024)
e4aef14  more perf stats tests (jeromeku, Jul 8, 2024)
1381a82  add perf stat print formatting tests (jeromeku, Jul 8, 2024)
3186839  fix device spec formatting (jeromeku, Jul 8, 2024)
5ff6d4b  finish perf counter manager refactor (jeromeku, Jul 8, 2024)
2a26b77  add serialization test (jeromeku, Jul 9, 2024)
4510149  refactor stats tests (jeromeku, Jul 9, 2024)
d295f53  refactor remaining tests (jeromeku, Jul 9, 2024)
9bea6b1  clean up tests (jeromeku, Jul 9, 2024)
1633c12  clean up device_spec tests (jeromeku, Jul 9, 2024)
82dcc4b  add latency (jeromeku, Jul 9, 2024)
29a6307  add latency tests (jeromeku, Jul 9, 2024)
6c5d483  fix formatting (jeromeku, Jul 10, 2024)
bd45cc1  remove unused methods (jeromeku, Jul 10, 2024)
83af26e  add documentation (jeromeku, Jul 10, 2024)
982322a  more docs (jeromeku, Jul 10, 2024)
0a88983  formatting (jeromeku, Jul 10, 2024)
2fe10f3  clean up warnings (jeromeku, Jul 10, 2024)
684dddb  rename duration -> latency (jeromeku, Jul 10, 2024)
4a4e522  add gpt-fast example (jeromeku, Jul 10, 2024)
5756a56  linting and formatting (jeromeku, Jul 10, 2024)
9e2790d  update profiler tutorial readme (jeromeku, Jul 10, 2024)
8dd0111  move total_model_params to utils (jeromeku, Jul 27, 2024)
9b37dd3  remove tutorials/profiler (jeromeku, Jul 27, 2024)
70 changes: 70 additions & 0 deletions test/profiler/test_device_spec.py
@@ -0,0 +1,70 @@
import pytest

cuda_driver = pytest.importorskip(
    "triton.runtime.driver", reason="requires triton cuda driver module"
)
import itertools

import torch
from utils import patch_device

from torchao.profiler.device_spec import (
    _AVAILABLE_GPU_SPECS,
    CUDADeviceSpec,
    get_chip_name,
)

# -------------------- Device Spec Tests ------------------- #
DEVICE_NAMES = ["h100 sxm", "a100", "nvidia geforce rtx 4090"]
@msaroufim (Member) commented Jul 27, 2024:

We only have A10G in CI, which might explain some of the CI failures.

And we're exploring L4 instances next since those are cheaper and have fp8 support.

DTYPES = [torch.float32, torch.bfloat16, torch.float16]
USE_TENSORCORES = [True, False]
DEVICE_CONFIGS = itertools.product(DEVICE_NAMES, DTYPES, USE_TENSORCORES)


@pytest.mark.parametrize(
    "device_name, dtype, use_tensorcores", DEVICE_CONFIGS, ids=lambda x: str(x)
)
def test_device_spec(device_name, dtype, use_tensorcores):
    with patch_device(device_name):
        device_spec = CUDADeviceSpec(dtype=dtype, use_tensorcores=use_tensorcores)
        if dtype == torch.float32 and use_tensorcores:
            dtype = "tfloat32"
        chip_name = get_chip_name(device_name)
        expected_flops = _AVAILABLE_GPU_SPECS[chip_name][dtype]
        assert device_spec.flops_per_s == expected_flops
        assert device_spec.flops_by_dtype[dtype] == expected_flops
        assert (
            device_spec.roofline_balancepoint == expected_flops / device_spec.bandwidth
        )

        with pytest.raises(AssertionError):
            device_spec.flops_per_s = None
            print(device_spec.roofline_balancepoint)
        # Prevent setting attributes not in named fields to guard against user error
        with pytest.raises(AttributeError):
            device_spec.FLOPs = None


def test_empty_device_spec():
    device_name = "fake device"
    with patch_device(device_name):
        with pytest.raises(AssertionError):
            _ = CUDADeviceSpec()

        # Ok to instantiate as long as fields are filled
        _ = CUDADeviceSpec(
            name=device_name,
            flops_per_s=1.0,
            bandwidth=1.0,
            dtype=torch.float32,
            use_tensorcores=True,
        )

    device_name = DEVICE_NAMES[0]

    with patch_device(device_name):
        # All critical fields will be auto-filled except for dtype (and vram, but vram is not used for downstream calcs atm)
        _ = CUDADeviceSpec(dtype=torch.float32)

        # No dtype specified
        with pytest.raises(AssertionError):
            _ = CUDADeviceSpec()
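
For context, here is a minimal usage sketch of the CUDADeviceSpec API that these tests exercise; it is not part of the diff. It assumes a CUDA machine whose chip is recognized by torchao.profiler.device_spec so the spec fields can be auto-filled, and the kernel FLOP/byte counts below are made-up numbers used only to illustrate the roofline comparison.

import torch

from torchao.profiler.device_spec import CUDADeviceSpec

if torch.cuda.is_available():
    # name, flops_per_s, and bandwidth are auto-filled from the detected device;
    # dtype must be supplied (see test_empty_device_spec above).
    spec = CUDADeviceSpec(dtype=torch.bfloat16, use_tensorcores=True)

    # roofline_balancepoint = flops_per_s / bandwidth, i.e. the arithmetic
    # intensity (FLOPs per byte) at which a kernel stops being memory-bound.
    balance = spec.roofline_balancepoint

    # Hypothetical kernel counters, e.g. gathered by the performance counter
    # manager added elsewhere in this PR.
    kernel_flops = 2.0e12  # total FLOPs executed by the kernel (illustrative)
    kernel_bytes = 4.0e10  # total bytes moved to/from device memory (illustrative)
    intensity = kernel_flops / kernel_bytes

    if intensity < balance:
        print(f"memory-bound: intensity {intensity:.1f} < balance point {balance:.1f}")
    else:
        print(f"compute-bound: intensity {intensity:.1f} >= balance point {balance:.1f}")

The comparison mirrors the roofline_balancepoint assertion in test_device_spec above: the balance point is simply peak FLOP/s divided by peak memory bandwidth for the chosen dtype.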