Skip to content

Commit

Permalink
feat: Add extra metadata. Update Slurm scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
Hugoch committed Sep 27, 2024
1 parent c8c1687 commit 609ae8c
Show file tree
Hide file tree
Showing 9 changed files with 1,184 additions and 112 deletions.
73 changes: 38 additions & 35 deletions extra/slurm/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,44 +7,47 @@

def main():
    """Submit one heterogeneous Slurm benchmark job per (pass, model, engine).

    Each sbatch submission is a two-component hetjob: component 0 gets the
    GPUs needed to serve the model, component 1 gets a single GPU's worth of
    CPU/memory to run the benchmark client. Exits with status 1 on the first
    failed submission.
    """
    # (model repo id, number of GPUs required to serve it)
    models = [
        ('meta-llama/Llama-3.1-8B-Instruct', 1),
        ('meta-llama/Llama-3.1-70B-Instruct', 4),
        ('mistralai/Mixtral-8x7B-Instruct-v0.1', 2),
        ('neuralmagic/Meta-Llama-3-70B-Instruct-FP8', 2),
    ]
    num_passes = 2  # repeat every benchmark to smooth run-to-run variance
    engines = ['tgi', 'vllm']
    for i in range(num_passes):
        for model_id, gpus in models:
            print(f"PASS {i} - Submitting job for {model_id}")
            cpus_per_task = gpus * CPUS_PER_GPU
            for engine in engines:
                job_name = f'bench_{model_id.replace("/", "_")}_{engine}_pass_{i}'
                # ':' separates the two hetjob components (server : client).
                args = ['sbatch',
                        '--job-name', job_name,
                        '--output', '/fsx/%u/logs/%x-%j.log',
                        '--time', '1:50:00',
                        '--qos', 'normal',
                        '--partition', 'hopper-prod',
                        '--gpus', str(gpus),
                        '--ntasks', '1',
                        '--cpus-per-task', str(cpus_per_task),
                        '--mem-per-cpu', f'{MEM_PER_CPU_GB}G',
                        '--nodes', '1',
                        ':',
                        '--gpus', '1',
                        '--ntasks', '1',
                        '--cpus-per-task', str(CPUS_PER_GPU),
                        '--mem-per-cpu', f'{MEM_PER_CPU_GB}G',
                        '--nodes', '1',
                        f'{engine}.slurm']
                env = os.environ.copy()
                env['MODEL'] = model_id  # consumed by the .slurm script
                process = subprocess.run(args, capture_output=True, env=env)
                print(process.stdout.decode())
                print(process.stderr.decode())
                if process.returncode != 0:
                    print(f'Error while submitting :: {args}')
                    raise SystemExit(1)


if __name__ == '__main__':
Expand Down
14 changes: 10 additions & 4 deletions extra/slurm/tgi.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
#SBATCH hetjob
#SBATCH --gpus 1 --ntasks 1 --cpus-per-task 11 --mem-per-cpu 20G --nodes=1


if [ -z "$MODEL" ]; then
echo "MODEL environment variable is not set"
exit 1
Expand All @@ -17,7 +16,10 @@ fi
echo "Starting TGI benchmark for $MODEL"
export RUST_BACKTRACE=full
export RUST_LOG=text_generation_inference_benchmark=info
export PORT=8090

# set a random available port to avoid conflicts
PORT=$(shuf -i 8000-9999 -n 1)
export PORT

echo "Model will run on ${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}"
echo "Benchmark will run on ${SLURM_JOB_NODELIST_HET_GROUP_1}"
Expand All @@ -38,7 +40,7 @@ srun --het-group=0 \
--cuda-graphs="1,8,16,24,32,40,48,56,64,72,80,88,96,104,112,120,128"&

# wait until /health is available, die after 10 minutes
timeout 300 bash -c "while [[ \"\$(curl -s -o /dev/null -w '%{http_code}' http://localhost:${PORT}/health)\" != \"200\" ]]; do sleep 1 && echo \"Waiting for TGI to start...\"; done" || exit 1
timeout 600 bash -c "while [[ \"\$(curl -s -o /dev/null -w '%{http_code}' http://localhost:${PORT}/health)\" != \"200\" ]]; do sleep 1 && echo \"Waiting for TGI to start...\"; done" || exit 1
exit_code=$?

RESULTS_DIR="/fsx/$USER/benchmarks_results/tgi"
Expand All @@ -47,6 +49,7 @@ mkdir -p "${RESULTS_DIR}"
if [[ $exit_code != 124 ]]; then
# run benchmark
echo "Starting benchmark"
VERSION=$(curl -s http://${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}/info | jq -r '.version')
srun --het-group=1 \
-u \
-n 1 \
Expand All @@ -59,9 +62,12 @@ if [[ $exit_code != 124 ]]; then
--url "http://${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}" \
--duration 120s \
--warmup 30s \
--num-rates 30 \
--benchmark-kind rate \
--rates 0.8 --rates 1.6 --rates 2.4 --rates 3.2 --rates 4.0 --rates 4.8 --rates 5.6 --rates 6.4 --rates 7.2 --rates 8.0 --rates 8.8 --rates 9.6 --rates 10.4 --rates 11.2 --rates 12.0 --rates 12.8 --rates 13.6 --rates 14.4 --rates 15.2 --rates 16.0 --rates 16.8 --rates 17.6 --rates 18.4 --rates 19.2 --rates 20.0 --rates 20.8 --rates 21.6 --rates 22.4 --rates 23.2 --rates 24.0 \
--prompt-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10" \
--decode-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10" \
--extra-meta "version=$VERSION" \
--extra-meta "engine=\"TGI\"" \
--no-console
fi

Expand Down
12 changes: 9 additions & 3 deletions extra/slurm/vllm.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@ fi
echo "Starting vLLM benchmark for $MODEL"
export RUST_BACKTRACE=full
export RUST_LOG=text_generation_inference_benchmark=info
export PORT=8090
# set a random available port to avoid conflicts
PORT=$(shuf -i 8000-9999 -n 1)
export PORT

echo "Model will run on ${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}"
echo "Benchmark will run on ${SLURM_JOB_NODELIST_HET_GROUP_1}"
Expand All @@ -37,7 +39,7 @@ srun --het-group=0 \
--tensor-parallel-size "${SLURM_GPUS_ON_NODE}"&

# wait until /health is available, die after 10 minutes
timeout 300 bash -c "while [[ \"\$(curl -s -o /dev/null -w '%{http_code}' http://localhost:${PORT}/health)\" != \"200\" ]]; do sleep 1 && echo \"Waiting for vLLM to start...\"; done" || exit 1
timeout 600 bash -c "while [[ \"\$(curl -s -o /dev/null -w '%{http_code}' http://localhost:${PORT}/health)\" != \"200\" ]]; do sleep 1 && echo \"Waiting for vLLM to start...\"; done" || exit 1
exit_code=$?

RESULTS_DIR="/fsx/$USER/benchmarks_results/vllm"
Expand All @@ -46,6 +48,7 @@ mkdir -p "${RESULTS_DIR}"
if [[ $exit_code != 124 ]]; then
# run benchmark
echo "Starting benchmark"
VERSION=$(curl -s http://${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}/version | jq -r '.version')
srun --het-group=1 \
-u \
-n 1 \
Expand All @@ -58,9 +61,12 @@ if [[ $exit_code != 124 ]]; then
--url "http://${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}" \
--duration 120s \
--warmup 30s \
--num-rates 30 \
--benchmark-kind rate \
--rates 0.8 --rates 1.6 --rates 2.4 --rates 3.2 --rates 4.0 --rates 4.8 --rates 5.6 --rates 6.4 --rates 7.2 --rates 8.0 --rates 8.8 --rates 9.6 --rates 10.4 --rates 11.2 --rates 12.0 --rates 12.8 --rates 13.6 --rates 14.4 --rates 15.2 --rates 16.0 --rates 16.8 --rates 17.6 --rates 18.4 --rates 19.2 --rates 20.0 --rates 20.8 --rates 21.6 --rates 22.4 --rates 23.2 --rates 24.0 \
--prompt-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10" \
--decode-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10" \
--extra-meta "version=$VERSION" \
--extra-meta "engine=\"vLLM\"" \
--no-console
fi

Expand Down
30 changes: 17 additions & 13 deletions plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
pd.options.mode.copy_on_write = True


def plot(data_files: dict[str, str]):
def plot(model:str,data_files: dict[str, str]):
df = pd.DataFrame()
# Load the results
for key, filename in data_files.items():
Expand All @@ -23,15 +23,16 @@ def plot(data_files: dict[str, str]):
entry['engine'] = key
del entry['config']
df = pd.concat([df, pd.DataFrame(entry, index=[0])])

# Filter the results
constant_rate = df[
(df['executor_type'] == 'ConstantArrivalRate') & (df['id'] != 'warmup') & (df['id'] != 'throughput')]
constant_vus = df[(df['executor_type'] == 'ConstantVUs') & (df['id'] != 'warmup') & (df['id'] != 'throughput')]
if len(constant_rate) > 0:
plot_inner('Requests/s', 'rate', constant_rate, 'Constant Rate benchmark')
plot_inner('Requests/s', 'rate', constant_rate, f'Constant Rate benchmark\n{model}')
plt.savefig(f'{directory}/{model}_constant_rate.png')
if len(constant_vus) > 0:
plot_inner('VUs', 'max_vus', constant_vus, 'Constant VUs benchmark')
plot_inner('VUs', 'max_vus', constant_vus, f'Constant VUs benchmark\n{model}')
plt.savefig(f'{directory}/{model}_constant_vus.png')


def plot_inner(x_title, x_key, results, chart_title):
Expand Down Expand Up @@ -59,7 +60,7 @@ def plot_inner(x_title, x_key, results, chart_title):
for i, engine in enumerate(results['engine'].unique()):
df_sorted = results[results['engine'] == engine].sort_values(by=x_key)
ax.plot(df_sorted[x_key], df_sorted[metric], marker='o', markersize=2,
color=colors[i % len(colors)] if engine != 'tgi' else '#FF9D00',
color=colors[i % len(colors)] if not engine.lower().startswith('tgi') else '#FF9D00',
label=f"{engine}")
ax.set_title(title)
ax.tick_params(axis='x', rotation=0)
Expand All @@ -80,14 +81,17 @@ def plot_inner(x_title, x_key, results, chart_title):
ax.legend(title='Engine', loc='upper right')
plt.suptitle(chart_title, fontsize=16)

plt.show()
#plt.show()


if __name__ == '__main__':
    results_dir = 'results'
    # One sub-directory per model; each holds that model's result JSON files.
    directories = [f'{results_dir}/{d}' for d in os.listdir(results_dir)
                   if os.path.isdir(f'{results_dir}/{d}')]
    # NOTE: `directory` is deliberately module-level — plot() reads it as a
    # global when saving its PNG outputs.
    for directory in directories:
        data_files = {}
        for filename in os.listdir(directory):
            if filename.endswith('.json'):
                # key = engine label (the filename segment just before ".json"),
                # value = full path to that engine's results file
                data_files[filename.split('.')[-2]] = f'{directory}/{filename}'
        plot(directory.split('/')[-1], data_files)
Loading

0 comments on commit 609ae8c

Please sign in to comment.