Skip to content

Commit

Permalink
Update eval ci cfg (#1259)
Browse files Browse the repository at this point in the history
* add cfg for pt, hf models

* update

* change max out len to 256

* update

* fix

* not batching for hf models

* update

* add gemma

* update

* update to new server

* avoid oom

* include hf res

* update hf res
  • Loading branch information
RunningLeon authored Mar 11, 2024
1 parent 42f94c5 commit f89ce87
Show file tree
Hide file tree
Showing 4 changed files with 709 additions and 89 deletions.
90 changes: 90 additions & 0 deletions .github/resources/opencompass-hf-results.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
{
"meta-llama/Llama-2-7b-chat": {
"ceval": "28.3",
"mmlu": "35.49",
"wic": "0",
"wsc": "0",
"triviaqa": "42.83",
"gsm8k": "26.38",
"race-middle": "41.57",
"race-high": "38.77",
"crows_pairs": "23.21"
},
"Qwen/Qwen-7B-Chat": {
"ceval": "55.41",
"mmlu": "53.74",
"wic": "52.04",
"wsc": "53.85",
"triviaqa": "43.71",
"gsm8k": "45.11",
"race-middle": "82.87",
"race-high": "77.96",
"crows_pairs": "55.37"
},
"internlm/internlm-chat-7b": {
"ceval": "53.40",
"mmlu": "50.86",
"wic": "57.21",
"wsc": "41.35",
"triviaqa": "28.19",
"gsm8k": "33.43",
"race-middle": "80.99",
"race-high": "77.62",
"crows_pairs": "43.04"
},
"baichuan-inc/Baichuan2-7B-Chat": {
"ceval": "53.92",
"mmlu": "50.13",
"wic": "0.16",
"wsc": "2.88",
"triviaqa": "37.66",
"gsm8k": "32.37",
"race-middle": "72.01",
"race-high": "67.44",
"crows_pairs": "8.09"
},
"internlm/internlm2-chat-7b": {
"ceval": "61.46",
"mmlu": "63.68",
"wic": "63.01",
"wsc": "41.35",
"triviaqa": "49.41",
"gsm8k": "71.57",
"race-middle": "89.21",
"race-high": "84.82",
"crows_pairs": "13.46"
},
"internlm/internlm2-chat-20b": {
"ceval": "-",
"mmlu": "66.50",
"wic": "-",
"wsc": "-",
"triviaqa": "-",
"gsm8k": "79.53",
"race-middle": "-",
"race-high": "-",
"crows_pairs": "-"
},
"Qwen/Qwen1.5-7B-Chat": {
"ceval": "-",
"mmlu": "61.44",
"wic": "-",
"wsc": "-",
"triviaqa": "-",
"gsm8k": "55.65",
"race-middle": "-",
"race-high": "-",
"crows_pairs": "-"
},
"mistralai/Mistral-7B-Instruct-v0.1": {
"ceval": "-",
"mmlu": "52.64",
"wic": "-",
"wsc": "-",
"triviaqa": "-",
"gsm8k": "41.93",
"race-middle": "-",
"race-high": "-",
"crows_pairs": "-"
}
}
78 changes: 72 additions & 6 deletions .github/scripts/action_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@
import os
import shutil
import subprocess
from collections import OrderedDict
from typing import List

import fire
from mmengine.config import Config


def run_cmd(cmd_lines: List[str], log_path: str, cwd: str = None):
Expand Down Expand Up @@ -74,6 +76,28 @@ def add_summary(csv_path: str):
_append_summary(line)


def _load_hf_results(test_results: dict, model_name: str):
    """Look up opencompass HF baseline metrics for a model.

    Reads ``.github/resources/opencompass-hf-results.json`` under the
    directory named by the ``LMDEPLOY_DIR`` environment variable and, for
    every dataset key present in ``test_results``, pulls the recorded
    baseline metric for ``model_name`` (falling back to '-' when the
    dataset has no recorded value).

    Args:
        test_results (dict): dataset name -> metric from the current run;
            only its keys are used, to align baseline columns with it.
        model_name (str): HF model id, e.g. 'internlm/internlm2-chat-7b'.

    Returns:
        OrderedDict: dataset -> baseline metric string; empty when the
        results file or the model entry is absent.
    """
    lmdeploy_dir = os.path.abspath(os.environ['LMDEPLOY_DIR'])
    hf_res_path = os.path.join(
        lmdeploy_dir, '.github/resources/opencompass-hf-results.json')
    out = OrderedDict()
    if not os.path.exists(hf_res_path):
        # Previously this case was silent; surface the missing baseline file.
        logging.warning(f'Opencompass results file not found: {hf_res_path}')
        return out
    with open(hf_res_path, 'r') as f:
        data = json.load(f)
    if model_name in data:
        res = data[model_name]
        for dataset in test_results:
            out[dataset] = res.get(dataset, '-')
    else:
        logging.warning(
            f'No opencompass results found for model {model_name}')
    return out


def evaluate(models: List[str], workspace: str):
"""Evaluate models from lmdeploy using opencompass.
Expand All @@ -84,6 +108,7 @@ def evaluate(models: List[str], workspace: str):
os.makedirs(workspace, exist_ok=True)
output_csv = os.path.join(workspace, 'results.csv')
num_model = len(models)
test_model_names = set()
for idx, ori_model in enumerate(models):
print()
print(50 * '==')
Expand All @@ -98,7 +123,6 @@ def evaluate(models: List[str], workspace: str):
engine_type = 'tb'
else:
model = model_
assert engine_type == 'tb', 'current only support turbomind'

opencompass_dir = os.path.abspath(os.environ['OPENCOMPASS_DIR'])
lmdeploy_dir = os.path.abspath(os.environ['LMDEPLOY_DIR'])
Expand All @@ -112,6 +136,17 @@ def evaluate(models: List[str], workspace: str):
target_model = f'{engine_type}_{model}'
if do_lite:
target_model = target_model + f'_{precision}'
cfg = Config.fromfile(config_path_new)
if not hasattr(cfg, target_model):
logging.error(
f'Model {target_model} not found in configuration file')
continue
model_cfg = cfg[target_model]
hf_model_path = model_cfg['path']
if not os.path.exists(hf_model_path):
logging.error(f'Model path not exists: {hf_model_path}')
continue
logging.info(f'Start evaluating {target_model} ...\\nn{model_cfg}\n\n')
with open(config_path_new, 'a') as f:
f.write(f'\nmodels = [ {target_model} ]\n')

Expand All @@ -135,39 +170,70 @@ def evaluate(models: List[str], workspace: str):
print(f.read())

# parse evaluation results from csv file
data = []
model_results = OrderedDict()
with open(csv_file, 'r') as f:
lines = f.readlines()
for line in lines[1:]:
row = line.strip().split(',')
row = [_.strip() for _ in row]
if row[-1] != '-':
data.append((row[0], row[-1]))
model_results[row[0]] = row[-1]
crows_pairs_json = glob.glob(os.path.join(
work_dir, '*/results/*/crows_pairs.json'),
recursive=True)
if len(crows_pairs_json) == 1:
with open(crows_pairs_json[0], 'r') as f:
acc = json.load(f)['accuracy']
acc = f'{float(acc):.2f}'
data.append(('crows_pairs', acc))
model_results['crows_pairs'] = acc
logging.info(f'\n{hf_model_path}\n{model_results}')
dataset_names = list(model_results.keys())
prec = precision if do_lite else '-'
row = ','.join([model, engine_type, prec] + [_[1] for _ in data])

row = ','.join([model, engine_type, prec] +
[model_results[_] for _ in dataset_names])
hf_res_row = None
if hf_model_path not in test_model_names:
test_model_names.add(hf_model_path)
hf_res = _load_hf_results(model_results, hf_model_path)
if hf_res:
hf_metrics = [
hf_res[d] if d in hf_res else '-' for d in dataset_names
]
hf_res_row = ','.join([model, 'hf', '-'] + hf_metrics)
if not os.path.exists(output_csv):
with open(output_csv, 'w') as f:
header = ','.join(['Model', 'Engine', 'Precision'] +
[_[0] for _ in data])
dataset_names)
f.write(header + '\n')
f.write(row + '\n')
if hf_res_row:
f.write(hf_res_row + '\n')
else:
with open(output_csv, 'a') as f:
f.write(row + '\n')
if hf_res_row:
f.write(hf_res_row + '\n')

# write to github action summary
_append_summary('## Evaluation Results')
if os.path.exists(output_csv):
add_summary(output_csv)


def create_model_links(src_dir: str, dst_dir: str):
    """Symlink every model directory found under *src_dir* into *dst_dir*.

    Non-directory entries are ignored; an already-existing destination
    path is left untouched and reported with a warning.
    """
    candidates = glob.glob(os.path.join(src_dir, '*'))
    os.makedirs(dst_dir, exist_ok=True)
    for candidate in candidates:
        abs_src = os.path.abspath(candidate)
        if not os.path.isdir(abs_src):
            continue
        link_path = os.path.join(dst_dir, os.path.basename(abs_src))
        if os.path.exists(link_path):
            logging.warning(f'Model_path exists: {link_path}')
        else:
            os.symlink(abs_src, link_path)


if __name__ == '__main__':
    # Expose the module's top-level functions (e.g. evaluate,
    # create_model_links) as CLI subcommands via python-fire.
    fire.Fire()
Loading

0 comments on commit f89ce87

Please sign in to comment.