fix: Update runners group
Hugoch committed Oct 1, 2024
1 parent fc7dcb0 commit beb1cf1
Showing 2 changed files with 61 additions and 29 deletions.
6 changes: 4 additions & 2 deletions .github/workflows/load_test.yaml
@@ -21,7 +21,7 @@ jobs:
group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
runs-on:
group: aws-g6-12xlarge-plus-priv
group: aws-g6-12xl-plus-priv-cache
env:
DOCKER_VOLUME: /cache
steps:
@@ -41,8 +41,10 @@ jobs:
- name: Run bench test
run: |
export PATH="$HOME/.local/bin:$PATH"
cd load_tests
python benchmarks.py
poetry install
poetry run python benchmarks.py
shell: bash
env:
HF_TOKEN: ${{ secrets.HF_TOKEN_BENCHMARK }}
84 changes: 57 additions & 27 deletions load_tests/benchmarks.py
@@ -1,6 +1,7 @@
import json
import os
import traceback
from typing import Dict, Tuple, List

import GPUtil
import docker
@@ -13,7 +14,7 @@ class InferenceEngineRunner:
def __init__(self, model: str):
self.model = model

def run(self, parameters: list[tuple]):
def run(self, parameters: list[tuple], gpus: int = 0):
NotImplementedError("This method should be implemented by the subclass")

def stop(self):
@@ -32,7 +33,7 @@ def __init__(self,
self.image = image
self.volumes = volumes

def run(self, parameters: list[tuple]):
def run(self, parameters: list[tuple], gpus: int = 0):
params = f"--model-id {self.model} --port 8080"
for p in parameters:
params += f" --{p[0]} {str(p[1])}"
@@ -43,7 +44,10 @@ def run(self, parameters: list[tuple]):
self.container = run_docker(self.image, params,
"Connected",
"ERROR",
volumes=volumes)
volumes=volumes,
gpus=gpus,
ports={"8080/tcp": 8080}
)

def stop(self):
if self.container:
@@ -53,15 +57,15 @@ def stop(self):
class BenchmarkRunner:
def __init__(self,
image: str = "ghcr.io/huggingface/text-generation-inference-benchmark:latest",
volumes=None):
volumes: List[Tuple[str, str]] = None):
if volumes is None:
volumes = []
self.container = None
self.image = image
self.volumes = volumes

def run(self, parameters: list[tuple]):
params = ""
def run(self, parameters: list[tuple], network_mode):
params = "text-generation-inference-benchmark"
for p in parameters:
params += f" --{p[0]} {str(p[1])}" if p[1] is not None else f" --{p[0]}"
logger.info(f"Running text-generation-inference-benchmarks with parameters: {params}")
@@ -71,31 +75,42 @@ def run(self, parameters: list[tuple]):
self.container = run_docker(self.image, params,
"Benchmark finished",
"Error",
volumes=volumes)
volumes=volumes,
extra_env={"RUST_LOG": "text_generation_inference_benchmark=info",
"RUST_BACKTRACE": "full"},
network_mode=network_mode)

def stop(self):
if self.container:
self.container.stop()


def run_docker(image: str, args: str, success_sentinel: str,
error_sentinel: str, volumes=None, gpus: int = 0) -> Container:
error_sentinel: str, ports: Dict[str, int] = None, volumes=None, network_mode: str = "bridge",
gpus: int = 0, extra_env: Dict[str, str] = None) -> Container:
if ports is None:
ports = {}
if volumes is None:
volumes = {}
client = docker.from_env()
if extra_env is None:
extra_env = {}
client = docker.from_env(timeout=300)
# retrieve the GPU devices from CUDA_VISIBLE_DEVICES
devices = [f"{i}" for i in
range(get_num_gpus())][:gpus]
environment = {"HF_TOKEN": os.environ.get("HF_TOKEN")}
environment.update(extra_env)
container = client.containers.run(image, args,
detach=True,
device_requests=[
docker.types.DeviceRequest(device_ids=devices,
capabilities=[['gpu']]) if gpus > 0 else None
],
capabilities=[['gpu']])
] if gpus > 0 else None,
volumes=volumes,
shm_size="1g",
ports={"8080/tcp": 8080},
environment={"HF_TOKEN": os.environ.get("HF_TOKEN")}, )
ports=ports,
network_mode=network_mode,
environment=environment, )
for line in container.logs(stream=True):
print(line.decode("utf-8"), end="")
if success_sentinel.encode("utf-8") in line:
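For context on the run_docker changes above: the helper now accepts optional ports, network_mode, gpus and extra_env arguments, only issues a GPU DeviceRequest when gpus > 0, and streams container logs until a success or error sentinel appears. A minimal sketch of how the two runners end up driving the Docker SDK after this commit (the TGI image tag and token are illustrative placeholders, not taken from the diff):

    import docker

    client = docker.from_env(timeout=300)

    # TGI server: request one GPU and publish port 8080, as TGIDockerRunner now does.
    server = client.containers.run(
        "ghcr.io/huggingface/text-generation-inference:latest",  # illustrative image tag
        "--model-id meta-llama/Llama-3.1-8B-Instruct --port 8080",
        detach=True,
        device_requests=[docker.types.DeviceRequest(device_ids=["0"], capabilities=[["gpu"]])],
        ports={"8080/tcp": 8080},
        shm_size="1g",
        environment={"HF_TOKEN": "<token>"},  # placeholder
    )

    # Benchmark container: join the server's network namespace so it can reach the
    # server on localhost:8080, as BenchmarkRunner now does via network_mode.
    bench = client.containers.run(
        "ghcr.io/huggingface/text-generation-inference-benchmark:latest",
        "text-generation-inference-benchmark --tokenizer-name meta-llama/Llama-3.1-8B-Instruct",
        detach=True,
        network_mode=f"container:{server.id}",
        environment={"RUST_LOG": "text_generation_inference_benchmark=info"},
    )

    # Follow the logs until a sentinel line shows up, mirroring the loop above.
    for line in bench.logs(stream=True):
        print(line.decode("utf-8"), end="")
        if "Benchmark finished" in line.decode("utf-8"):
            break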
@@ -145,22 +160,26 @@ def build_df(model: str, data_files: dict[str, str]) -> pd.DataFrame:

def main():
results_dir = 'results'
# get absolute path
results_dir = os.path.join(os.path.dirname(__file__), results_dir)
logger.info('Starting benchmark')
models = [
('meta-llama/Llama-3.1-8B-Instruct', 1),
# ('meta-llama/Llama-3.1-70B-Instruct', 4),
# ('mistralai/Mixtral-8x7B-Instruct-v0.1', 2),
]
sha = os.environ.get('GITHUB_SHA')
# create results directory
os.makedirs(results_dir, exist_ok=True)
success = True
for model in models:
tgi_runner = TGIDockerRunner(model[0])
# create results directory
model_dir = os.path.join(results_dir, f'{model[0].replace("/", "_").replace(".", "_")}')
os.makedirs(model_dir, exist_ok=True)
runner = BenchmarkRunner(
volumes=['results', '/opt/text-generation-inference-benchmark/results']
volumes=[(model_dir, '/opt/text-generation-inference-benchmark/results')]
)
try:
tgi_runner.run([('max-concurrent-requests', 512)])
tgi_runner.run([('max-concurrent-requests', 512)], gpus=model[1])
logger.info(f'TGI started for model {model[0]}')
parameters = [
('tokenizer-name', model[0]),
@@ -171,27 +190,38 @@ def main():
('benchmark-kind', 'rate'),
('prompt-options', 'num_tokens=200,max_tokens=220,min_tokens=180,variance=10'),
('decode-options', 'num_tokens=200,max_tokens=220,min_tokens=180,variance=10'),
('extra-meta', f'engine=TGI,tp={model[1]},version={sha},gpu={get_gpu_name()}'),
('--no-console', None)
('extra-meta', f'"engine=TGI,tp={model[1]},version={sha},gpu={get_gpu_name()}"'),
('no-console', None)
]
runner.run(parameters)
rates = [('rates', f'{r / 10.}') for r in list(range(8, 248, 8))]
parameters.extend(rates)
runner.run(parameters, f'container:{tgi_runner.container.id}')
except Exception as e:
logger.error(f'Error running benchmark for model {model[0]}: {e}')
# print the stack trace
print(traceback.format_exc())
success = False
finally:
tgi_runner.stop()
runner.stop()
# list json files in results directory
data_files = {}
if not success:
logger.error('Some benchmarks failed')
exit(1)

df = pd.DataFrame()
for filename in os.listdir(results_dir):
if filename.endswith('.json'):
data_files[filename.split('.')[-2]] = f'{results_dir}/{filename}'
df = pd.concat([df, build_df(results_dir.split('/')[-1], data_files)])
# list recursively directories
directories = [f'{results_dir}/{d}' for d in os.listdir(results_dir) if os.path.isdir(f'{results_dir}/{d}')]
logger.info(f'Found result directories: {directories}')
for directory in directories:
data_files = {}
for filename in os.listdir(directory):
if filename.endswith('.json'):
data_files[filename.split('.')[-2]] = f'{directory}/{filename}'
logger.info(f'Processing directory {directory}')
df = pd.concat([df, build_df(directory.split('/')[-1], data_files)])
df['device'] = get_gpu_name()
df['error_rate'] = df['failed_requests'] / (df['failed_requests'] + df['successful_requests']) * 100.0
df.to_parquet('s3://text-generation-inference-ci/benchmarks/ci/')
df.to_parquet(f's3://text-generation-inference-ci/benchmarks/ci/{sha}.parquet')


if __name__ == "__main__":
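One note on the aggregation step rewritten above: results are now collected per-model directory, the error rate is derived from the failed and successful request counts, and the final frame is written straight to S3 under the commit SHA. Writing to an s3:// URL with pandas assumes an fsspec-compatible backend such as s3fs is available in the environment. A small, self-contained sketch of the error-rate computation (data values are made up):

    import pandas as pd

    # Illustrative frame with the two columns used by main() above.
    df = pd.DataFrame({"successful_requests": [95, 200], "failed_requests": [5, 0]})
    df["error_rate"] = df["failed_requests"] / (df["failed_requests"] + df["successful_requests"]) * 100.0
    print(df["error_rate"].tolist())  # [5.0, 0.0]

    # The CI upload itself (requires s3fs/pyarrow and AWS credentials):
    # df.to_parquet(f"s3://text-generation-inference-ci/benchmarks/ci/{sha}.parquet")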
