forked from EleutherAI/lm-evaluation-harness
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evalu…
…ation-harness into benchmark-scripts
- Loading branch information
Showing
32 changed files
with
859 additions
and
193 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
# Detects changes under lm_eval/tasks and lm_eval/api and, when there are any,
# installs the package and runs tests/extra/test_new_tasks.py.
name: Tasks Modified

on:
  push:
    branches:
      - big-refactor
  pull_request:
    branches:
      - big-refactor
  workflow_dispatch:
# comment/edit out the above to stop/change the triggers
jobs:
  changed_files:
    runs-on: ubuntu-latest  # windows-latest || macos-latest
    timeout-minutes: 120
    name: Scan for changed tasks
    steps:
      - name: checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0  # OR "2" -> To retrieve the preceding commit.

      # Uses the tj-actions/changed-files@v37 action to check for changes.
      # Outputs provided here: https://github.com/tj-actions/changed-files#outputs
      # The `files_yaml` input optionally takes a yaml string to specify filters,
      # and prepends the filter name to the standard output names.
      # NOTE(review): referenced by the v37 major tag named above; confirm the
      # exact release wanted, or pin a full commit SHA for a third-party action.
      - name: Check task folders
        id: changed-tasks
        uses: tj-actions/changed-files@v37
        with:
          # tasks checks the tasks folder and api checks the api folder for changes
          files_yaml: |
            tasks:
              - lm_eval/tasks/**
            api:
              - lm_eval/api/**
          write_output_files: true

      # The next step is optional; the files are written to the workspace by
      # default (above), so it is just for debugging.
      - name: Run Tests
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        env:
          # Hand the file list to the script through the environment rather
          # than expanding it inside the script text, so that file names are
          # never interpreted as shell code.
          MODIFIED_TASK_FILES: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}
        run: |
          # $GITHUB_ENV only accepts NAME=value lines; later steps then see
          # CHANGED_TASKS_FILE in their environment.
          # NOTE(review): variable name chosen here; confirm whether any
          # consumer expects a different one.
          echo "CHANGED_TASKS_FILE=.github/outputs/tasks_all_changed_and_modified_files.txt" >> "$GITHUB_ENV"
          echo "One or more test file(s) has changed."
          echo "List of all the files that have changed: ${MODIFIED_TASK_FILES}"
      - name: Set up Python 3.9
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        uses: actions/setup-python@v4
        with:
          # Quoted so the version stays a string (an unquoted 3.10 would load as 3.1).
          python-version: '3.9'
      - name: Install dependencies
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        run: |
          python -m pip install --upgrade pip
          pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
          # Install optional git dependencies
          # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
          # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      # if new tasks are added, run tests on them
      - name: Test with pytest
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
        run: python -m pytest tests/extra/test_new_tasks.py -s -vv -n=auto
      # if api is modified, run tests on it (same test file, with API set)
      - name: Test more tasks with pytest
        env:
          # Environment values are strings; quoted to make that explicit.
          API: 'true'
        if: steps.changed-tasks.outputs.api_any_modified == 'true'
        run: python -m pytest tests/extra/test_new_tasks.py -s -vv -n=auto
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
# This workflow installs Python dependencies, then lints the code and runs the
# unit tests on Python 3.9.
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
# Comment out unwanted steps to turn off a check.
name: Unit Tests

on:
  push:
    branches:
      - big-refactor
  pull_request:
    branches:
      - big-refactor
  workflow_dispatch:
# Jobs run concurrently and steps run sequentially within a job.
# jobs: linter and testcpu. Add more jobs/steps as required.
jobs:
  linter:
    name: Linters
    runs-on: ubuntu-latest
    timeout-minutes: 20

    steps:
      - name: Checkout Code
        uses: actions/checkout@v3
      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          # Quoted so the version stays a string (an unquoted 3.10 would load as 3.1).
          python-version: '3.9'
      - name: Install dependencies
        run: pip install -e '.[linting,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
      # Only W0311 (bad-indentation) is enabled; every other pylint check is off.
      # NOTE(review): without `shopt -s globstar`, bash treats `**` like `*`, so
      # this glob may only reach .py files exactly one directory deep - confirm
      # that is the intended coverage.
      # NOTE(review): pylint's default indent string is four spaces; confirm the
      # single-space value below is what was intended.
      - name: Lint with pylint
        run: python -m pylint --disable=all -e W0311 --jobs=0 --indent-string=' ' **/*.py
      - name: Lint with flake8
        run: |
          # report syntax errors, undefined names and the selected style codes;
          # --exit-zero means these findings are shown but never fail the build
          flake8 . --count --select=F,E9,E71,E72,E501,E112,E113,W6 --extend-ignore=F541 --show-source --statistics --exit-zero
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      # mypy turned off for now
      # - name: Lint with mypy
      #   run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
  # Job 2
  testcpu:
    name: CPU Tests
    runs-on: ubuntu-latest
    timeout-minutes: 20

    steps:
      - name: Checkout Code
        uses: actions/checkout@v3
      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: '3.9'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
          # Install optional git dependencies
          # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
          # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      # tests/tests_master and tests/extra are excluded from this run.
      - name: Test with pytest
        run: python -m pytest -s -v -n=auto --ignore=tests/tests_master --ignore=tests/extra
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
# WebQuestions
|
||
### Paper | ||
|
||
Title: `Semantic Parsing on Freebase from Question-Answer Pairs` | ||
|
||
Abstract: `https://cs.stanford.edu/~pliang/papers/freebase-emnlp2013.pdf` | ||
|
||
WebQuestions is a benchmark for question answering. The dataset consists of 6,642 | ||
question/answer pairs. The questions are supposed to be answerable by Freebase, a | ||
large knowledge graph. The questions are mostly centered around a single named entity. | ||
The questions are popular ones asked on the web (at least in 2013). | ||
|
||
Homepage: `https://worksheets.codalab.org/worksheets/0xba659fe363cb46e7a505c5b6a774dc8a` | ||
|
||
|
||
### Citation | ||
|
||
``` | ||
@inproceedings{berant-etal-2013-semantic, | ||
title = "Semantic Parsing on {F}reebase from Question-Answer Pairs", | ||
author = "Berant, Jonathan and | ||
Chou, Andrew and | ||
Frostig, Roy and | ||
Liang, Percy", | ||
booktitle = "Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing", | ||
month = oct, | ||
year = "2013", | ||
address = "Seattle, Washington, USA", | ||
publisher = "Association for Computational Linguistics", | ||
url = "https://aclanthology.org/D13-1160", | ||
pages = "1533--1544", | ||
} | ||
``` | ||
|
||
### Subtasks | ||
|
||
Tasks defined in this folder:
* `webqs`: `Questions with multiple accepted answers.` | ||
|
||
### Checklist | ||
|
||
For adding novel benchmarks/datasets to the library: | ||
* [x] Is the task an existing benchmark in the literature? | ||
* [x] Have you referenced the original paper that introduced the task? | ||
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? | ||
|
||
|
||
If other tasks on this dataset are already supported: | ||
* [ ] Is the "Main" variant of this task clearly denoted? | ||
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? | ||
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
from typing import Dict, List | ||
|
||
|
||
def doc_to_choice(doc: Dict) -> List[str]:
    """Build the list of answer choices for one document.

    Args:
        doc: A dataset record; its ``"answers"`` entry is read.

    Returns:
        The entries of ``doc["answers"]`` that remain after
        ``_remove_prefixes`` has filtered them.
    """
    accepted = doc["answers"]
    choices = _remove_prefixes(accepted)
    return choices
|
||
|
||
def doc_to_target(doc: Dict) -> List[int]:
    """Return the index of every answer choice for one document.

    Every choice left after ``_remove_prefixes`` counts as a correct target,
    so the result is simply ``[0, 1, ..., k - 1]`` for ``k`` remaining choices.

    Args:
        doc: A dataset record; its ``"answers"`` entry is read.

    Returns:
        A list with one index per remaining answer.
    """
    kept = _remove_prefixes(doc["answers"])
    return [position for position, _ in enumerate(kept)]
|
||
|
||
def _remove_prefixes(aliases): | ||
""" | ||
Remove any alias that has a strict prefix elsewhere in the list. | ||
This is an optimization. We can do this because if the prefix is acceptable by isgreedy, | ||
we can stop looking. | ||
""" | ||
aliases.sort() | ||
ret = [aliases[0]] | ||
for alias in aliases[1:]: | ||
if not alias.startswith(ret[-1]): | ||
ret.append(alias) | ||
return ret |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# Task definition for `webqs`, built on the `web_questions` dataset.
group:
  - freebase
  - question_answer
task: webqs
dataset_path: web_questions
dataset_name: null
output_type: multiple_choice

# Splits: train and test are used; no validation split is configured.
training_split: train
validation_split: null
test_split: test

# Prompt template; targets and choices come from functions in utils.
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: !function utils.doc_to_target
doc_to_choice: !function utils.doc_to_choice

# Decontamination is enabled and queries on the `question` field.
should_decontaminate: true
doc_to_decontamination_query: question

metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
Oops, something went wrong.