Skip to content

Commit

Permalink
Update eval ci cfg (#1259)
Browse files Browse the repository at this point in the history
* add cfg for pt, hf models

* update

* change max out len to 256

* update

* fix

* not batching for hf models

* update

* add gemma

* update

* update to new server

* avoid oom

* include hf res

* update hf res
  • Loading branch information
RunningLeon authored Mar 11, 2024
1 parent 42f94c5 commit f89ce87
Show file tree
Hide file tree
Showing 4 changed files with 709 additions and 89 deletions.
90 changes: 90 additions & 0 deletions .github/resources/opencompass-hf-results.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
{
"meta-llama/Llama-2-7b-chat": {
"ceval": "28.3",
"mmlu": "35.49",
"wic": "0",
"wsc": "0",
"triviaqa": "42.83",
"gsm8k": "26.38",
"race-middle": "41.57",
"race-high": "38.77",
"crows_pairs": "23.21"
},
"Qwen/Qwen-7B-Chat": {
"ceval": "55.41",
"mmlu": "53.74",
"wic": "52.04",
"wsc": "53.85",
"triviaqa": "43.71",
"gsm8k": "45.11",
"race-middle": "82.87",
"race-high": "77.96",
"crows_pairs": "55.37"
},
"internlm/internlm-chat-7b": {
"ceval": "53.40",
"mmlu": "50.86",
"wic": "57.21",
"wsc": "41.35",
"triviaqa": "28.19",
"gsm8k": "33.43",
"race-middle": "80.99",
"race-high": "77.62",
"crows_pairs": "43.04"
},
"baichuan-inc/Baichuan2-7B-Chat": {
"ceval": "53.92",
"mmlu": "50.13",
"wic": "0.16",
"wsc": "2.88",
"triviaqa": "37.66",
"gsm8k": "32.37",
"race-middle": "72.01",
"race-high": "67.44",
"crows_pairs": "8.09"
},
"internlm/internlm2-chat-7b": {
"ceval": "61.46",
"mmlu": "63.68",
"wic": "63.01",
"wsc": "41.35",
"triviaqa": "49.41",
"gsm8k": "71.57",
"race-middle": "89.21",
"race-high": "84.82",
"crows_pairs": "13.46"
},
"internlm/internlm2-chat-20b": {
"ceval": "-",
"mmlu": "66.50",
"wic": "-",
"wsc": "-",
"triviaqa": "-",
"gsm8k": "79.53",
"race-middle": "-",
"race-high": "-",
"crows_pairs": "-"
},
"Qwen/Qwen1.5-7B-Chat": {
"ceval": "-",
"mmlu": "61.44",
"wic": "-",
"wsc": "-",
"triviaqa": "-",
"gsm8k": "55.65",
"race-middle": "-",
"race-high": "-",
"crows_pairs": "-"
},
"mistralai/Mistral-7B-Instruct-v0.1": {
"ceval": "-",
"mmlu": "52.64",
"wic": "-",
"wsc": "-",
"triviaqa": "-",
"gsm8k": "41.93",
"race-middle": "-",
"race-high": "-",
"crows_pairs": "-"
}
}
78 changes: 72 additions & 6 deletions .github/scripts/action_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@
import os
import shutil
import subprocess
from collections import OrderedDict
from typing import List

import fire
from mmengine.config import Config


def run_cmd(cmd_lines: List[str], log_path: str, cwd: str = None):
Expand Down Expand Up @@ -74,6 +76,28 @@ def add_summary(csv_path: str):
_append_summary(line)


def _load_hf_results(test_results: dict, model_name: str):
    """Look up opencompass HF baseline metrics for a model.

    Reads ``.github/resources/opencompass-hf-results.json`` under the
    directory named by the ``LMDEPLOY_DIR`` environment variable and, for
    every dataset key present in ``test_results``, pulls the recorded
    baseline metric for ``model_name`` (falling back to '-' when the
    dataset has no recorded value).

    Args:
        test_results (dict): dataset name -> metric from the current run;
            only its keys are used, to align baseline columns with it.
        model_name (str): HF model id, e.g. 'internlm/internlm2-chat-7b'.

    Returns:
        OrderedDict: dataset -> baseline metric string; empty when the
        results file or the model entry is absent.
    """
    lmdeploy_dir = os.path.abspath(os.environ['LMDEPLOY_DIR'])
    hf_res_path = os.path.join(
        lmdeploy_dir, '.github/resources/opencompass-hf-results.json')
    out = OrderedDict()
    if not os.path.exists(hf_res_path):
        # Previously this case was silent; surface the missing baseline file.
        logging.warning(f'Opencompass results file not found: {hf_res_path}')
        return out
    with open(hf_res_path, 'r') as f:
        data = json.load(f)
    if model_name in data:
        res = data[model_name]
        for dataset in test_results:
            out[dataset] = res.get(dataset, '-')
    else:
        logging.warning(
            f'No opencompass results found for model {model_name}')
    return out


def evaluate(models: List[str], workspace: str):
"""Evaluate models from lmdeploy using opencompass.
Expand All @@ -84,6 +108,7 @@ def evaluate(models: List[str], workspace: str):
os.makedirs(workspace, exist_ok=True)
output_csv = os.path.join(workspace, 'results.csv')
num_model = len(models)
test_model_names = set()
for idx, ori_model in enumerate(models):
print()
print(50 * '==')
Expand All @@ -98,7 +123,6 @@ def evaluate(models: List[str], workspace: str):
engine_type = 'tb'
else:
model = model_
assert engine_type == 'tb', 'current only support turbomind'

opencompass_dir = os.path.abspath(os.environ['OPENCOMPASS_DIR'])
lmdeploy_dir = os.path.abspath(os.environ['LMDEPLOY_DIR'])
Expand All @@ -112,6 +136,17 @@ def evaluate(models: List[str], workspace: str):
target_model = f'{engine_type}_{model}'
if do_lite:
target_model = target_model + f'_{precision}'
cfg = Config.fromfile(config_path_new)
if not hasattr(cfg, target_model):
logging.error(
f'Model {target_model} not found in configuration file')
continue
model_cfg = cfg[target_model]
hf_model_path = model_cfg['path']
if not os.path.exists(hf_model_path):
logging.error(f'Model path not exists: {hf_model_path}')
continue
logging.info(f'Start evaluating {target_model} ...\\nn{model_cfg}\n\n')
with open(config_path_new, 'a') as f:
f.write(f'\nmodels = [ {target_model} ]\n')

Expand All @@ -135,39 +170,70 @@ def evaluate(models: List[str], workspace: str):
print(f.read())

# parse evaluation results from csv file
data = []
model_results = OrderedDict()
with open(csv_file, 'r') as f:
lines = f.readlines()
for line in lines[1:]:
row = line.strip().split(',')
row = [_.strip() for _ in row]
if row[-1] != '-':
data.append((row[0], row[-1]))
model_results[row[0]] = row[-1]
crows_pairs_json = glob.glob(os.path.join(
work_dir, '*/results/*/crows_pairs.json'),
recursive=True)
if len(crows_pairs_json) == 1:
with open(crows_pairs_json[0], 'r') as f:
acc = json.load(f)['accuracy']
acc = f'{float(acc):.2f}'
data.append(('crows_pairs', acc))
model_results['crows_pairs'] = acc
logging.info(f'\n{hf_model_path}\n{model_results}')
dataset_names = list(model_results.keys())
prec = precision if do_lite else '-'
row = ','.join([model, engine_type, prec] + [_[1] for _ in data])

row = ','.join([model, engine_type, prec] +
[model_results[_] for _ in dataset_names])
hf_res_row = None
if hf_model_path not in test_model_names:
test_model_names.add(hf_model_path)
hf_res = _load_hf_results(model_results, hf_model_path)
if hf_res:
hf_metrics = [
hf_res[d] if d in hf_res else '-' for d in dataset_names
]
hf_res_row = ','.join([model, 'hf', '-'] + hf_metrics)
if not os.path.exists(output_csv):
with open(output_csv, 'w') as f:
header = ','.join(['Model', 'Engine', 'Precision'] +
[_[0] for _ in data])
dataset_names)
f.write(header + '\n')
f.write(row + '\n')
if hf_res_row:
f.write(hf_res_row + '\n')
else:
with open(output_csv, 'a') as f:
f.write(row + '\n')
if hf_res_row:
f.write(hf_res_row + '\n')

# write to github action summary
_append_summary('## Evaluation Results')
if os.path.exists(output_csv):
add_summary(output_csv)


def create_model_links(src_dir: str, dst_dir: str):
    """Symlink every model directory found under *src_dir* into *dst_dir*.

    Non-directory entries are ignored; an already-existing destination
    path is left untouched and reported with a warning.
    """
    candidates = glob.glob(os.path.join(src_dir, '*'))
    os.makedirs(dst_dir, exist_ok=True)
    for candidate in candidates:
        abs_src = os.path.abspath(candidate)
        if not os.path.isdir(abs_src):
            continue
        link_path = os.path.join(dst_dir, os.path.basename(abs_src))
        if os.path.exists(link_path):
            logging.warning(f'Model_path exists: {link_path}')
        else:
            os.symlink(abs_src, link_path)


if __name__ == '__main__':
    # Expose the module's top-level functions (e.g. evaluate,
    # create_model_links) as CLI subcommands via python-fire.
    fire.Fire()
Loading

0 comments on commit f89ce87

Please sign in to comment.