Showing 30 changed files with 101,438 additions and 20 deletions.
# Evaluation

We use tango and catwalk to build the pipeline.
The catwalk code lives [here](https://github.com/allenai/catwalk/tree/olmo-eval).

### Creating an evaluation config

The evaluation pipeline runs as a cross product of the models to be evaluated and the selected task sets.

1. Ensure that model paths are present in a `gs://` or `s3://` location.
2. Copy `evaluation/experiments/test_config.jsonnet` to `evaluation/experiment_YYYY_MM_DD.jsonnet`.
3. Add models and choose relevant task sets from [experiments/task_sets](evaluation/experiments/task_sets).

### Running the pipeline

#### Basic setup

```commandline
export GITHUB_TOKEN="<your token>"  # Needed for beaker to clone the repo.
export GOOGLE_TOKEN="<google credentials>"  # If you are using a GS workspace; alternatively, simply run `gcloud auth login`.
```

#### If specifying a Google Sheet to write results to

* Share the Google Sheet with `[email protected]`.
* Create an API JSON key and download it from [here](https://console.cloud.google.com/iam-admin/serviceaccounts/details/101308414346962828659;edit=true/keys?project=ai2-allennlp).
* Add a beaker secret:

```python
from tango.integrations.beaker.common import get_client

beaker = get_client("<beaker_workspace>")

with open("credentials_file.json") as f:
    beaker.secret.write("GDRIVE_SERVICE_ACCOUNT_JSON", f.read())
```

```commandline
export GDRIVE_SERVICE_ACCOUNT_JSON=$(cat credentials_file.json)
```

#### Run locally

```commandline
tango run evaluation/experiments/test_config.jsonnet -w your-local-workspace --include-package evaluation.steps
```

#### Run on beaker

* Update `evaluation/tango-in-beaker.yml` (the fields that should be updated are marked).

```commandline
tango --settings evaluation/tango-in-beaker.yml run evaluation/experiments/test_config.jsonnet
```

### See results

If you specify `gsheet` in your config, results will be appended to the Google Sheet.

All intermediate and final results are also saved to the specified workspace, and can be accessed as follows:

```python
from tango import Workspace

workspace = Workspace.from_url("gs://your-workspace-url")
result = workspace.step_result("combine-all-outputs")
```

### Adding new task sets

A task set is of the form:

```jsonnet
{
    name: "<Name of the task set>",
    tasks: [
        {
            task_name: "<One of the tasks present in `TASKS_LM` or `TASKS`>",
            task_kwargs: "<task-specific kwargs (see eval_suite for examples)>",
            prediction_kwargs: "<kwargs specifying how to evaluate the model on this task>"
        }
    ]
}
```

1. Add new task sets under `evaluation/experiments/task_sets` (current full sets: `gen_tasks.libsonnet`, `eval_suite_ppl_val_v2_small.libsonnet`, `rc20_tasks.libsonnet`, `summary_tasks.libsonnet`).
2. The list of available tasks can be seen by running `python evaluation/see_available_tasks.py`.
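
`see_available_tasks.py` itself is not shown in this commit, but the listing it produces amounts to merging catwalk's task registries. A hypothetical sketch of that merge — the function name and the dict-like-registry assumption are ours, not from the repo:

```python
def list_available_tasks(*registries):
    """Merge task registries (dict-like, keyed by task name) into one
    sorted list of unique task names."""
    names = set()
    for registry in registries:
        names.update(registry)  # iterating a dict yields its keys
    return sorted(names)


# With catwalk (olmo-eval branch) installed, this would be used roughly as:
# from catwalk.tasks import TASKS
# from catwalk.tasks.tasks_lm import TASKS_LM
# print("\n".join(list_available_tasks(TASKS, TASKS_LM)))
```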

#### Adding a new dataset to our perplexity eval set

1. Add the new set under our current ppl data at `/net/nfs.cirrascale/allennlp/akshitab/eval_data`.
2. Add the name of the folder to `experiments/task_sets/eval_suite_ppl_val_v2_small.libsonnet`.

#### Adding tasks already present in catwalk

1. See `gen_tasks.libsonnet` for a simple example.

#### Adding new tasks to catwalk

(TODO: catwalk needs better documentation on adding new tasks.)

1. See examples [here](https://github.com/allenai/catwalk/tree/olmo-eval/catwalk/tasks).
2. Add newly created tasks to [TASKS_LM](https://github.com/allenai/catwalk/blob/olmo-eval/catwalk/tasks/tasks_lm.py) or [TASKS](https://github.com/allenai/catwalk/blob/olmo-eval/catwalk/tasks/__init__.py).
A helper script from the commit (its filename is not preserved in this view) that empties a GS bucket and its datastore:

```python
import sys

from tango.integrations.gs.common import empty_bucket, empty_datastore

if __name__ == "__main__":
    bucket_name = sys.argv[1]
    empty_bucket(bucket_name)
    empty_datastore(bucket_name)
```
An example experiment configuration (the filename is not preserved in this view):

```jsonnet
/*--------------------------------------- Configurations -----------------------------------------*/

local utils = import 'utils.libsonnet';

local rc20_tasks = import 'task_sets/rc20_tasks.libsonnet';
local gen_tasks = import 'task_sets/gen_tasks.libsonnet';
local summary_tasks = import 'task_sets/summary_tasks.libsonnet';
local ppl_suite = import 'task_sets/eval_suite_ppl_val_v2_small.libsonnet';


//❗Set gsheet to the name of your google sheet.
// Set it to null if you do not want your results to be uploaded to a google sheet (they will still be saved as an object).
//local gsheet = "auto-gsheet-test";
local gsheet = null;

// Models to evaluate

local models = [
    {
        model_path: "s3://ai2-llm/test_fixtures/olmo-1b", //❗Specify olmo unsharded checkpoint path
        gpus_needed: 1,
        trust_remote_code: true
    },
    {
        model_path: "EleutherAI/pythia-1b",
        revision: "step140000", //❗Specify checkpoint if needed
        gpus_needed: 1,
        //❗Task sets contain default values for prediction_kwargs. These can be overridden for each model here.
        prediction_kwargs: {
            model_max_length: 2048,
            max_batch_tokens: 20480,
        }
    }
];

local task_sets = [
    rc20_tasks.task_set,
    gen_tasks.task_set,
    summary_tasks.task_set,
    ppl_suite.task_set
];


{
    steps: utils.create_pipeline(models, task_sets, gsheet)
}
```
`evaluation/experiments/task_sets/eval_suite_ppl_val_v2_small.libsonnet` (36 additions):
```jsonnet
local task_utils = import 'task_utils.libsonnet';

local common_kwargs = {
    task_name: "ppl_custom",
    task_kwargs: {
        keep_instance_fields: ["orig_file_name", "source", "subdomain"],
    },
    prediction_kwargs: {
        split: "validation",
        model_max_length: task_utils.model_max_length,
    }
};

// TODO: refactor catwalk's Perplexity task so that it actually uses the s3 path.
// Until then, let the path be present in nfs ($EVAL_DATA_PATH).
local data_dir = "olmo-ppl-val-v2-small/";

local create_task_kwargs(task_names) = [
    {
        task_kwargs: {
            task_rename: "ppl_" + task_name + "_small",
            files: [data_dir + "/" + task_name + "/val"]
        }
    }
    for task_name in task_names
];

local task_dicts = create_task_kwargs(
    ["4chan", "c4_100_domains", "c4_en", "gab", "ice", "m2d2_s2orc", "m2d2_wiki",
     "manosphere", "mc4_en", "pile", "ptb", "twitterAEE", "wikitext_103"]
);

{
    task_set: task_utils.create_task_set_from_task_dicts("eval_suite", task_dicts, common_kwargs)
}
```
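
The `create_task_kwargs` comprehension above expands each dataset name into a renamed perplexity task pointing at its `val` files. The same expansion in Python, as a sketch for readers less familiar with jsonnet (note that the jsonnet joins `data_dir`, which already ends in `/`, with another `/`, so the resulting paths contain a double slash):

```python
data_dir = "olmo-ppl-val-v2-small/"

def create_task_kwargs(task_names):
    # One dict per dataset: rename the generic ppl_custom task and point it
    # at the dataset's validation files under data_dir.
    return [
        {
            "task_kwargs": {
                "task_rename": "ppl_" + task_name + "_small",
                "files": [data_dir + "/" + task_name + "/val"],
            }
        }
        for task_name in task_names
    ]
```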
The `gen_tasks` task set:

```jsonnet
local task_utils = import 'task_utils.libsonnet';

local name = "gen_tasks";
local task_names = ["drop", "naturalqs_short_open"];
local prediction_kwargs = {
    split: "validation",
    limit: 1000,
    num_shots: 5,
    fewshot_seed: 1234,
    num_recorded_inputs: 3,
    model_max_length: task_utils.model_max_length
};
local task_kwargs = {};

{
    task_set: task_utils.create_task_set_from_task_names(name, task_names, prediction_kwargs, task_kwargs)
}
```
The `rc20_tasks` task set:

```jsonnet
local task_utils = import 'task_utils.libsonnet';

local name = "rc20_tasks";
local task_names = ["arc_challenge", "arc_easy", "boolq", "copa", "headqa_en", "hellaswag", "logiqa", "mathqa", "mrpc",
                    "openbookqa", "piqa", "qnli", "qqp", "rte", "sciq", "sst", "wic", "winogrande", "wnli", "wsc"];

local prediction_kwargs = {
    split: "validation",
    limit: 1000,
    num_shots: 0,
    num_recorded_inputs: 3,
    model_max_length: task_utils.model_max_length
};
local task_kwargs = {};

{
    task_set: task_utils.create_task_set_from_task_names(name, task_names, prediction_kwargs, task_kwargs)
}
```
The `summary_tasks` task set:

```jsonnet
local task_utils = import 'task_utils.libsonnet';

local name = "summary_tasks";
local task_names = ["scitldr", "xsum"];
local prediction_kwargs = {
    split: "validation",
    limit: 1000,
    num_shots: 1,
    fewshot_seed: 1234,
    num_recorded_inputs: 3,
    model_max_length: task_utils.model_max_length
};
local task_kwargs = {};

{
    task_set: task_utils.create_task_set_from_task_names(name, task_names, prediction_kwargs, task_kwargs)
}
```
Shared task-set utilities (`task_utils.libsonnet`, as imported by the task sets above):

```jsonnet
local create_task_set_from_task_dicts(name, task_dicts, common_kwargs) = {
    name: name,
    tasks: std.map(
        function(task_dict) common_kwargs + {
            task_name: std.get(task_dict, "task_name", std.get(common_kwargs, "task_name")),
            prediction_kwargs: std.get(common_kwargs, "prediction_kwargs", {}) + std.get(task_dict, "prediction_kwargs", {}),
            task_kwargs: std.get(common_kwargs, "task_kwargs", {}) + std.get(task_dict, "task_kwargs", {})
        },
        task_dicts
    )
};

local create_task_set_from_task_names(name, task_names, prediction_kwargs, task_kwargs) = {
    name: name,
    tasks: std.map(
        function(task_name) {
            task_name: task_name,
            prediction_kwargs: prediction_kwargs,
            task_kwargs: task_kwargs
        },
        task_names
    )
};

{
    model_max_length: 2048,
    max_batch_tokens: 2048,
    create_task_set_from_task_names: create_task_set_from_task_names,
    create_task_set_from_task_dicts: create_task_set_from_task_dicts
}
```
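
The override rules encoded in `create_task_set_from_task_dicts` — per-task values win, and the two kwargs dicts are merged key-by-key rather than replaced wholesale — can be mirrored in Python. This is an illustrative sketch, not code from the repo:

```python
def create_task_set_from_task_dicts(name, task_dicts, common_kwargs):
    """Mirror of the jsonnet helper: each task starts from common_kwargs,
    task_name falls back to the common one, and prediction_kwargs /
    task_kwargs are merged with task-level keys taking precedence."""
    tasks = []
    for task_dict in task_dicts:
        task = dict(common_kwargs)
        task["task_name"] = task_dict.get("task_name", common_kwargs.get("task_name"))
        task["prediction_kwargs"] = {
            **common_kwargs.get("prediction_kwargs", {}),
            **task_dict.get("prediction_kwargs", {}),
        }
        task["task_kwargs"] = {
            **common_kwargs.get("task_kwargs", {}),
            **task_dict.get("task_kwargs", {}),
        }
        tasks.append(task)
    return {"name": name, "tasks": tasks}
```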
`evaluation/experiments/task_sets/test_sets/test_eval_suite_ppl_val_v2_small.libsonnet` (37 additions):
```jsonnet
local task_utils = import '../task_utils.libsonnet';

local task_set_name = "eval_suite";

local common_kwargs = {
    task_name: "ppl_custom",
    task_kwargs: {
        keep_instance_fields: ["orig_file_name", "source", "subdomain"],
    },
    prediction_kwargs: {
        split: "validation",
        model_max_length: 256,
    }
};

// TODO: refactor catwalk's Perplexity task so that it actually uses the s3 path.
// Until then, let the path be present in nfs.
local data_dir = "test_fixtures/evaluation/ppl-test-data";

local create_task_kwargs(task_names) = [
    {
        task_kwargs: {
            task_rename: "ppl_" + task_name + "_small",
            files: [data_dir + "/" + task_name + "/val"]
        }
    }
    for task_name in task_names
];

local task_dicts = create_task_kwargs(
    ["4chan", "c4_100_domains"]
);

{
    task_set: task_utils.create_task_set_from_task_dicts(task_set_name, task_dicts, common_kwargs)
}
```
`evaluation/experiments/task_sets/test_sets/test_gen_tasks.libsonnet` (18 additions):
```jsonnet
local task_utils = import '../task_utils.libsonnet';

local name = "gen_tasks";
local task_names = ["drop"];
local prediction_kwargs = {
    split: "validation",
    limit: 1000,
    num_shots: 5,
    fewshot_seed: 1234,
    num_recorded_inputs: 3,
    model_max_length: 256
};
local task_kwargs = {};

{
    task_set: task_utils.create_task_set_from_task_names(name, task_names, prediction_kwargs, task_kwargs)
}
```
`evaluation/experiments/task_sets/test_sets/test_rc20_tasks.libsonnet` (18 additions):
```jsonnet
local task_utils = import '../task_utils.libsonnet';

local name = "rc20_tasks";
local task_names = ["boolq", "arc_easy"];
local prediction_kwargs = {
    split: "validation",
    limit: 1000,
    num_shots: 0,
    num_recorded_inputs: 3,
    //model_max_length: task_utils.model_max_length
    model_max_length: 256
};
local task_kwargs = {};

{
    task_set: task_utils.create_task_set_from_task_names(name, task_names, prediction_kwargs, task_kwargs)
}
```