Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into benchmark-scripts
lintangsutawika committed Jul 27, 2023
2 parents 2d96a8c + 4e44f0a commit b7cd829
Showing 32 changed files with 859 additions and 193 deletions.
70 changes: 70 additions & 0 deletions .github/workflows/new_tasks.yml
@@ -0,0 +1,70 @@
name: Tasks Modified

on:
  push:
    branches:
      - big-refactor
  pull_request:
    branches:
      - big-refactor
  workflow_dispatch:
# comment/edit out the above to stop/change the triggers
jobs:
  changed_files:
    runs-on: ubuntu-latest # windows-latest || macos-latest
    timeout-minutes: 120
    name: Scan for changed tasks
    steps:
      - name: checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0 # OR "2" -> To retrieve the preceding commit.

      # Uses the tj-actions/changed-files@v37 action to check for changes.
      # Outputs provided here: https://github.com/tj-actions/changed-files#outputs
      # The `files_yaml` input optionally takes a yaml string to specify filters,
      # and prepends the filter name to the standard output names.
      - name: Check task folders
        id: changed-tasks
        uses: tj-actions/changed-files@v37
        with:
          # tasks checks the tasks folder and api checks the api folder for changes
          files_yaml: |
            tasks:
              - lm_eval/tasks/**
            api:
              - lm_eval/api/**
          write_output_files: true

      # The next step is optional; the files are written to the workspace by default (above).
      # so it's just for debugging
      - name: Run Tests
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        run: |
          echo .github/outputs/tasks_all_changed_and_modified_files.txt >> 'GITHUB_ENV'
          echo "One or more test file(s) has changed."
          echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}"
      - name: Set up Python 3.9
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        uses: actions/setup-python@v4
        with:
          python-version: 3.9
      - name: Install dependencies
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        run: |
          python -m pip install --upgrade pip
          pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
          # Install optional git dependencies
          # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
          # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Test with pytest
        # if new tasks are added, run tests on them
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
        run: python -m pytest tests/extra/test_new_tasks.py -s -vv -n=auto
      # if api is modified, run tests on it
      - name: Test more tasks with pytest
        env:
          API: true
        if: steps.changed-tasks.outputs.api_any_modified == 'true'
        run: python -m pytest tests/extra/test_new_tasks.py -s -vv -n=auto
50 changes: 0 additions & 50 deletions .github/workflows/python-app.yml

This file was deleted.

63 changes: 63 additions & 0 deletions .github/workflows/unit_tests.yml
@@ -0,0 +1,63 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
# just comment out unwanted steps to turn off the test.
name: Unit Tests

on:
  push:
    branches:
      - big-refactor
  pull_request:
    branches:
      - big-refactor
  workflow_dispatch:
# Jobs run concurrently and steps run sequentially within a job.
# jobs: linter and cpu_tests. Add more jobs/steps as required.
jobs:
  linter:
    name: Linters
    runs-on: ubuntu-latest
    timeout-minutes: 20

    steps:
      - name: Checkout Code
        uses: actions/checkout@v3
      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: 3.9
      - name: Install dependencies
        run: pip install -e '.[linting,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
      - name: Lint with pylint
        run: python -m pylint --disable=all -e W0311 --jobs=0 --indent-string='    ' **/*.py
      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=F,E9,E71,E72,E501,E112,E113,W6 --extend-ignore=F541 --show-source --statistics --exit-zero
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      # mypy turned off for now
      # - name: Lint with mypy
      #   run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
  # Job 2
  testcpu:
    name: CPU Tests
    runs-on: ubuntu-latest
    timeout-minutes: 20

    steps:
      - name: Checkout Code
        uses: actions/checkout@v3
      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: 3.9
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
          # Install optional git dependencies
          # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
          # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Test with pytest
        run: python -m pytest -s -v -n=auto --ignore=tests/tests_master --ignore=tests/extra
13 changes: 11 additions & 2 deletions .gitignore
@@ -4,6 +4,15 @@ output/
data/
lm_cache
.idea

*.egg-info/
build
dist
*.egg-info
venv
.vscode/
temp
__pycache__
.ipynb_checkpoints
temp
# IPython
profile_default/
ipython_config.py
12 changes: 7 additions & 5 deletions lm_eval/api/task.py
@@ -472,6 +472,9 @@ def fewshot_context(self, doc, num_fewshot):
            return labeled_examples + example
        elif type(example) == list:
            return [labeled_examples + ex for ex in example]
        elif type(example) == int:
            choices = self.doc_to_choice(doc)
            return labeled_examples + choices[example]

    def apply_filters(self):

@@ -950,22 +953,21 @@ def process_results(self, doc, results):
        if self.multiple_target:
            acc = 1.0 if pred in gold else 0.0
            acc_norm = 1.0 if pred_norm in gold else 0.0
            exact_match = int(any([is_greedy[i] for i in gold]))
        else:
            acc = 1.0 if pred == gold else 0.0
            acc_norm = 1.0 if pred_norm == gold else 0.0
            # TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly
            exact_match = int(is_greedy[gold])

        result_dict = {
            **({"acc": acc} if "acc" in use_metric else {}),
            **({"f1": (gold, pred)} if "f1" in use_metric else {}),
            **({"mcc": (gold, pred)} if "mcc" in use_metric else {}),
            **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
            **({"exact_match": exact_match} if "exact_match" in use_metric else {}),
        }

        if "exact_match" in self._metric_fn_list.keys():
            # TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly
            is_greedy = is_greedy[gold]  # take value for the gold answer
            result_dict["exact_match"] = int(is_greedy)

        if "acc_mutual_info" in use_metric:
            lls_mutual_info = [
                ll_c - ll_u for ll_c, ll_u in zip(lls, lls_unconditional)
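For reference, the new `exact_match` computation in `process_results` above reduces to the following standalone sketch (variable names mirror the hunk; the sample values are invented for illustration):

```python
def exact_match_score(is_greedy, gold, multiple_target=False):
    # is_greedy[i] flags whether choice i was the model's greedy continuation;
    # gold is a single index, or a list of indices when multiple_target is set.
    if multiple_target:
        return int(any(is_greedy[i] for i in gold))
    return int(is_greedy[gold])


assert exact_match_score([False, True, False], gold=[1, 2], multiple_target=True) == 1
assert exact_match_score([False, True, False], gold=0) == 0
```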
5 changes: 1 addition & 4 deletions lm_eval/evaluator.py
@@ -244,10 +244,7 @@ def evaluate(
        # aggregate Instances by LM method requested to get output.
        reqtype = (
            "loglikelihood"
            if (
                task.OUTPUT_TYPE == "multiple_choice"
                or task.OUTPUT_TYPE == "winograd_schema"
            )
            if task.OUTPUT_TYPE == "multiple_choice"
            else task.OUTPUT_TYPE
        )  # TODO: this is hacky, fix in task.py
        requests[reqtype].extend(task.instances)
2 changes: 1 addition & 1 deletion lm_eval/models/dummy.py
@@ -6,7 +6,7 @@
@register_model("dummy")
class DummyLM(LM):
    def __init__(self):
        pass
        super().__init__()

    @classmethod
    def create_from_arg_string(cls, arg_string, additional_config=None):
2 changes: 1 addition & 1 deletion lm_eval/models/huggingface.py
@@ -723,7 +723,7 @@ def _collate(x):
        else:
            max_gen_toks = self.max_gen_toks
        # first stop sequence is used to halt generation upon encountering
        (primary_until) = until[0]
        primary_until = [until[0]]

        # set the max length in tokens of inputs ("context_enc")
        if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
52 changes: 52 additions & 0 deletions lm_eval/tasks/webqs/README.md
@@ -0,0 +1,52 @@
# WebQuestions

### Paper

Title: `Semantic Parsing on Freebase from Question-Answer Pairs`

Abstract: `https://cs.stanford.edu/~pliang/papers/freebase-emnlp2013.pdf`

WebQuestions is a benchmark for question answering. The dataset consists of 6,642
question/answer pairs. The questions are intended to be answerable via Freebase, a
large knowledge graph, are mostly centered on a single named entity, and reflect
popular queries asked on the web (at least as of 2013).

Homepage: `https://worksheets.codalab.org/worksheets/0xba659fe363cb46e7a505c5b6a774dc8a`


### Citation

```
@inproceedings{berant-etal-2013-semantic,
title = "Semantic Parsing on {F}reebase from Question-Answer Pairs",
author = "Berant, Jonathan and
Chou, Andrew and
Frostig, Roy and
Liang, Percy",
booktitle = "Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing",
month = oct,
year = "2013",
address = "Seattle, Washington, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/D13-1160",
pages = "1533--1544",
}
```

### Subtasks

Tasks defined in this folder:
* `webqs`: `Questions with multiple accepted answers.`

### Checklist

For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?


If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
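
The underlying data can be inspected directly with the Hugging Face `datasets` package that the harness builds on (`web_questions` is the `dataset_path` used by this task's config):

```python
from datasets import load_dataset

# Load the evaluation split used by the `webqs` task.
ds = load_dataset("web_questions", split="test")
print(len(ds))                              # number of test questions
print(ds[0]["question"], ds[0]["answers"])  # each row has a question and a list of accepted answers
```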
27 changes: 27 additions & 0 deletions lm_eval/tasks/webqs/utils.py
@@ -0,0 +1,27 @@
from typing import Dict, List


def doc_to_choice(doc: Dict) -> List[str]:
    """Return all of the accepted answers as choices."""
    return _remove_prefixes(doc["answers"])


def doc_to_target(doc: Dict) -> List[int]:
    """Return the list of indices of accepted answers (all of them)."""
    remaining = _remove_prefixes(doc["answers"])
    return list(range(len(remaining)))


def _remove_prefixes(aliases):
    """
    Remove any alias that has a strict prefix elsewhere in the list.
    This is an optimization: if the model's greedy continuation matches a longer
    alias, it necessarily matches the shorter prefix too, so checking the prefix
    alone (via is_greedy) is sufficient and we can stop looking.
    """
    aliases.sort()
    ret = [aliases[0]]
    for alias in aliases[1:]:
        if not alias.startswith(ret[-1]):
            ret.append(alias)
    return ret
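
To make the behavior of these helpers concrete, here is a small usage sketch (assuming the module is importable as `lm_eval.tasks.webqs.utils`; the sample document is invented but shaped like a `web_questions` row):

```python
from lm_eval.tasks.webqs.utils import doc_to_choice, doc_to_target

doc = {
    "question": "who plays meg in family guy?",
    "answers": ["Mila Kunis", "Mila", "Lacey Chabert"],  # illustrative values
}

# "Mila Kunis" is dropped because "Mila" is a strict prefix of it after sorting.
print(doc_to_choice(doc))  # ['Lacey Chabert', 'Mila']
print(doc_to_target(doc))  # [0, 1] -- every surviving alias counts as a gold target
```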
19 changes: 19 additions & 0 deletions lm_eval/tasks/webqs/webqs.yaml
@@ -0,0 +1,19 @@
group:
  - freebase
  - question_answer
task: webqs
dataset_path: web_questions
dataset_name: null
output_type: multiple_choice
training_split: train
validation_split: null
test_split: test
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: !function utils.doc_to_target
doc_to_choice: !function utils.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: question
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
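
Tying the config to the helpers and the `process_results` change above: each document is rendered with the `doc_to_text` template, every surviving answer alias becomes a choice, and `exact_match` asks whether greedy decoding reproduces any accepted answer. A rough sketch of the per-document setup (invented sample values, not the harness's internal request objects):

```python
# Rough sketch of what webqs.yaml implies for one document (illustrative only).
doc = {"question": "what countries border france?", "answers": ["Spain", "Belgium"]}

prompt = "Question: {}\nAnswer:".format(doc["question"])  # doc_to_text template
choices = sorted(doc["answers"])                          # doc_to_choice (prefix pruning omitted)
gold = list(range(len(choices)))                          # doc_to_target: every alias is acceptable
# exact_match is 1 if the model's greedy continuation of `prompt` matches
# choices[i] for any i in gold; the task score is the mean over documents.
```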
