Showing 64 changed files with 9,666 additions and 0 deletions.
@@ -0,0 +1,44 @@
# Ignore test linting to avoid conflicting changes to version stability.
exclude: ^tests/testdata/
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.1.0
    hooks:
      - id: check-added-large-files
      - id: check-ast
      - id: check-byte-order-marker
      - id: check-case-conflict
      - id: check-json
      - id: check-merge-conflict
      - id: check-symlinks
      - id: check-yaml
      - id: destroyed-symlinks
      - id: detect-private-key
      - id: end-of-file-fixer
      - id: no-commit-to-branch
      - id: requirements-txt-fixer
      - id: trailing-whitespace
      - id: fix-byte-order-marker
        exclude: docs/CNAME
      - id: fix-encoding-pragma
        args: [--remove]
      - id: mixed-line-ending
        args: [--fix=lf]
  - repo: https://github.com/pycqa/flake8
    rev: 3.7.9
    hooks:
      - id: flake8
  - repo: https://github.com/psf/black
    rev: 23.11.0
    hooks:
      - id: black
        language_version: python3.9
  - repo: https://github.com/codespell-project/codespell
    rev: v2.1.0
    hooks:
      - id: codespell
        exclude: >
          (?x)^(
              .*\.json|ignore.txt
          )$
        args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
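
As a brief usage note (not part of the committed config itself), hooks defined this way are normally enabled with the standard `pre-commit` CLI:

```bash
# Install the git hooks described in .pre-commit-config.yaml
pre-commit install

# Run all configured hooks once over the entire repository
pre-commit run --all-files
```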
@@ -0,0 +1,26 @@ | ||
@software{eval-harness, | ||
author = {Gao, Leo and | ||
Tow, Jonathan and | ||
Biderman, Stella and | ||
Black, Sid and | ||
DiPofi, Anthony and | ||
Foster, Charles and | ||
Golding, Laurence and | ||
Hsu, Jeffrey and | ||
McDonell, Kyle and | ||
Muennighoff, Niklas and | ||
Phang, Jason and | ||
Reynolds, Laria and | ||
Tang, Eric and | ||
Thite, Anish and | ||
Wang, Ben and | ||
Wang, Kevin and | ||
Zou, Andy}, | ||
title = {A framework for few-shot language model evaluation}, | ||
month = sep, | ||
year = 2021, | ||
publisher = {Zenodo}, | ||
version = {v0.0.1}, | ||
doi = {10.5281/zenodo.5371628}, | ||
url = {https://doi.org/10.5281/zenodo.5371628} | ||
} |
@@ -0,0 +1,108 @@
# MERA with Language Model Evaluation Harness

MERA: Multimodal Evaluation for Russian-language Architectures

LM-harness support for the MERA benchmark datasets.

## Overview

This project provides a unified framework for testing generative language models on the MERA benchmark and its evaluation tasks.

## Install

To install `lm-eval` from the repository main branch, run:

```bash
pip install -e .
```

To support loading GPTQ quantized models, install the package with the `auto-gptq` extra:

```bash
pip install -e ".[auto-gptq]"
```

## MERA Benchmark

### Run the full benchmark with the bash script

Sample command to run the benchmark with the `ai-forever/rugpt3large_based_on_gpt2` model from the Hugging Face Hub:

```bash
CUDA_VISIBLE_DEVICES=0 MERA_FOLDER="$PWD/mera_results/rugpt3large_760m_defaults" MERA_MODEL_STRING="pretrained=ai-forever/rugpt3large_based_on_gpt2,dtype=auto" bash run_mera.sh
```

Use `CUDA_VISIBLE_DEVICES` to set CUDA device visibility, `MERA_FOLDER` for the path where outputs are stored,
and `MERA_MODEL_STRING` to set the `model_args` parameter of `lm-evaluation-harness`'s `main.py`.
Use `MERA_COMMON_SETUP` to change the default parameters for model inference with `main.py` (the defaults are
`--model hf-causal --device cuda --max_batch_size=64 --batch_size=auto --inference`).
See the next section for more details on these parameters.

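For example, to keep the defaults but lower the automatic batch-size search limit, `MERA_COMMON_SETUP` can be overridden (a sketch; the output folder name and the batch-size value are illustrative, the remaining flags are the documented defaults):

```bash
CUDA_VISIBLE_DEVICES=0 \
MERA_FOLDER="$PWD/mera_results/rugpt3large_760m_bs32" \
MERA_MODEL_STRING="pretrained=ai-forever/rugpt3large_based_on_gpt2,dtype=auto" \
MERA_COMMON_SETUP="--model hf-causal --device cuda --max_batch_size=32 --batch_size=auto --inference" \
bash run_mera.sh
```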

### Run a specific benchmark manually (ruMMLU example)

A specific benchmark can be run with the `main.py` script.

Example:
```shell
CUDA_VISIBLE_DEVICES=3 python main.py --model hf-causal --model_args pretrained=mistralai/Mistral-7B-v0.1,dtype=auto,max_length=11000 \
--device cuda --output_base_path="$PWD/mera_results/Mistral-7B-v0.1_defaults" --max_batch_size=16 --batch_size=auto \
--inference --write_out --tasks rummlu --num_fewshot=5 \
--output_path="$PWD/mera_results/Mistral-7B-v0.1_defaults/rummlu_result.json"
```

#### Notes on `main.py` settings

Use `--tasks` to provide a comma-separated list of tasks to run (available options are: `bps`, `chegeka`, `lcs`,
`mathlogicqa`, `multiq`, `parus`, `rcb`, `rudetox`, `ruethics`, `ruhatespeech`, `ruhhh`, `ruhumaneval`, `rummlu`,
`rumodar`, `rumultiar`, `ruopenbookqa`, `rutie`, `ruworldtree`, `rwsd`, `simplear`, `use`).
Omitting this argument runs all tasks with the same settings.

`--num_fewshot` sets the few-shot count. MERA expects tasks to be run with the following few-shot counts (an example is shown after this list):
* `--num_fewshot=0` (zero-shot) with `multiq`, `parus`, `rcb`, `rumodar`, `rwsd`, `use`, `rudetox`, `ruethics`,
`ruhatespeech`, `ruhhh`, `rutie`, and `ruhumaneval`;
* `--num_fewshot=2` with `bps` and `lcs`;
* `--num_fewshot=4` with `chegeka`;
* `--num_fewshot=5` with `mathlogicqa`, `ruworldtree`, `ruopenbookqa`, `simplear`, `rumultiar`, and `rummlu`.

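For example, a zero-shot run of `parus` and `rcb` with the same model as in the ruMMLU example could look like this (a sketch assembled from the flags documented in this section; the output file name is illustrative):

```shell
CUDA_VISIBLE_DEVICES=3 python main.py --model hf-causal \
--model_args pretrained=mistralai/Mistral-7B-v0.1,dtype=auto,max_length=11000 \
--device cuda --output_base_path="$PWD/mera_results/Mistral-7B-v0.1_defaults" \
--max_batch_size=16 --batch_size=auto --inference --write_out \
--tasks parus,rcb --num_fewshot=0 \
--output_path="$PWD/mera_results/Mistral-7B-v0.1_defaults/parus_rcb_result.json"
```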

Use `CUDA_VISIBLE_DEVICES` to set CUDA device visibility (setting `--device cuda:3` works inconsistently).

`--model hf-causal` is used for models compatible with transformers' `AutoModelForCausalLM` class and is the most
stable option for the MERA benchmark.
You can also try the less stable `hf-causal-experimental` (`AutoModelForCausalLM`-compatible) or
`hf-seq2seq` (`AutoModelForSeq2SeqLM`) backends for your model, as in the sketch below.

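A seq2seq model could be evaluated through the `hf-seq2seq` backend along the following lines (a sketch; `ai-forever/FRED-T5-1.7B` is only an illustrative checkpoint and is not mentioned in the original README):

```shell
CUDA_VISIBLE_DEVICES=0 python main.py --model hf-seq2seq \
--model_args pretrained=ai-forever/FRED-T5-1.7B,dtype=auto \
--device cuda --output_base_path="$PWD/mera_results/fred_t5_1.7b_defaults" \
--max_batch_size=16 --batch_size=auto --inference --write_out \
--tasks rcb --num_fewshot=0 \
--output_path="$PWD/mera_results/fred_t5_1.7b_defaults/rcb_result.json"
```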

`--model_args` takes comma-separated parameters for the autoclass's `from_pretrained` method. Be aware of the
hardware requirements for running big models, and limit the maximum input length of the model with the `max_length`
parameter to avoid out-of-memory errors during the run.

`--batch_size=auto` determines the batch size for the run automatically based on the tasks and inputs; the maximum
value from which the downward search starts is set with `--max_batch_size`. Bigger batches may speed up running the whole MERA benchmark.

`--output_base_path` is the path to the directory (it will be created) where data for submission preparation and logs are stored.

`--inference` should always be passed; it allows running on datasets without reference answers provided
(a score of 0 will be reported for them).

`--write_out` turns on extra logging; it should always be on if the submission may be made public.

`--no_cache` turns off caching of model files (datasets are not cached).

`--output_path` is the path to an extra log file with the run parameters and task results. It should preferably be
located inside the `--output_base_path` directory.

### Convert lm-harness output to a submission

The bash script above runs the submission zip packing routine. Here is how to run the packing manually.

To convert, run:

```shell
python scripts/log_to_submission.py
```

Command-line arguments (an example invocation follows the list):

* `--outputs_dir` — path to the directory with outputs (`MERA_FOLDER` from the bash script above)
* `--dst_dir` — directory where the submission zip is stored
* `--dataset_dir` — path to `lm_eval/datasets/`
* `--logs_public_submit` (`--no-logs_public_submit`) — pack logs for public submission into a separate file (true by default)
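
For example, to pack the outputs of the full-benchmark run shown earlier (a sketch; the destination directory is illustrative):

```shell
python scripts/log_to_submission.py \
  --outputs_dir "$PWD/mera_results/rugpt3large_760m_defaults" \
  --dst_dir "$PWD/mera_submissions/rugpt3large_760m_defaults" \
  --dataset_dir lm_eval/datasets/
```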
@@ -0,0 +1,112 @@
"""Rebuild aggregated MERA results from per-task lm-harness logs.

Reads the `lm_harness_logs_*` directories produced by `main.py` runs (plus
`evaluation_config.json` and, optionally, `overlaps.json`) from the output
directory, re-runs metric aggregation, and prints the results table.
"""
import os
import json
import collections
import argparse
import pathlib
from typing import List

import lm_eval.tasks
import lm_eval.metrics
from lm_eval import evaluator

decontaminate_suffix = "_decontaminate"


def restore_records(dirs: List[pathlib.Path], base_path: pathlib.Path):
    """Restore per-document responses and source docs from the log directories."""
    print(dirs)
    process_res_queue = {}
    docs = {}
    for path in dirs:
        task_name = str(path)[len("lm_harness_logs_") :]
        with open(base_path.joinpath(path, "output_answers.json")) as resp_file, open(
            base_path.joinpath(path, "input_docs.json")
        ) as source_file:
            resps = json.load(resp_file)
            sources = json.load(source_file)
            for doc_id, resp in resps.items():
                process_res_queue[(task_name, doc_id)] = resp
                docs[(task_name, doc_id)] = sources[doc_id]

    return process_res_queue, docs


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_base_path", type=str, default=None)
    args = parser.parse_args()
    output_base_path = pathlib.Path(args.output_base_path) if args.output_base_path is not None else pathlib.Path(".")

    log_dirs = []
    task_names = []
    results = collections.defaultdict(dict)
    for path in os.listdir(output_base_path):
        if path.startswith("lm_harness_logs_"):
            log_dirs.append(path)
            task_names.append(str(path)[len("lm_harness_logs_") :])

    task_dict = lm_eval.tasks.get_task_dict(task_names)
    process_res_queue, docs = restore_records(log_dirs, output_base_path)
    decontaminate = False

    if output_base_path.joinpath("overlaps.json").is_file():
        decontaminate = True
        with open(output_base_path.joinpath("overlaps.json")) as file:
            overlaps = json.load(file)

    with open(output_base_path.joinpath("evaluation_config.json")) as file:
        config = json.load(file)

    vals = collections.defaultdict(list)

    # unpack results, sort them back into order, and return control to the Task
    for (task_name, doc_id), requests in process_res_queue.items():
        requests.sort(key=lambda x: x[0])
        requests = [x[1] for x in requests]

        task = task_dict[task_name]
        doc = docs[(task_name, doc_id)]

        metrics = task.process_results(doc, requests)
        for metric, value in metrics.items():
            vals[(task_name, metric)].append(value)

            # Re-use the evaluation for the decontaminated set by just ignoring the overlaps
            if decontaminate and task_name in overlaps:
                if doc_id not in overlaps[task_name]:
                    vals[(task_name, metric + decontaminate_suffix)].append(value)

    # aggregate results
    for (task_name, metric), items in vals.items():
        task = task_dict[task_name]
        real_metric = metric  # key when looking up the metric with task.aggregation
        if metric.endswith(decontaminate_suffix):
            real_metric = metric.replace(decontaminate_suffix, "")  # decontaminated still uses the same metric
        results[task_name][metric] = task.aggregation()[real_metric](items)

        # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap,
        # so we run them for fewer iterations. still looking for a cleaner way to do this
        stderr = lm_eval.metrics.stderr_for_metric(
            metric=task.aggregation()[real_metric],
            bootstrap_iters=min(config["bootstrap_iters"], 1000)
            if metric in ["bleu", "chrf", "ter"]
            else config["bootstrap_iters"],
        )

        if stderr is not None:
            results[task_name][metric + "_stderr"] = stderr(items)

    versions = collections.defaultdict(dict)
    for task_name, task in task_dict.items():
        versions[task_name] = task.VERSION

    results = {"results": dict(results), "config": config, "versions": dict(versions)}

    dumped = json.dumps(results, indent=2)
    print(dumped)

    print(evaluator.make_table(results))


if __name__ == "__main__":
    main()
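
As a usage sketch for the script above (the commit view does not show this file's actual path, so the script name below is hypothetical), it is pointed at the directory that was passed to `main.py` as `--output_base_path`:

```shell
# Hypothetical script path; substitute the real filename from the repository.
python scripts/restore_results.py \
  --output_base_path "$PWD/mera_results/Mistral-7B-v0.1_defaults"
```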