Commit 8b21459: fix merge conflicts

epwalsh committed Aug 1, 2023 · 2 parents 517d4b0 + 5a11cd1
Showing 30 changed files with 101,438 additions and 20 deletions.
104 changes: 104 additions & 0 deletions evaluation/README.md
@@ -0,0 +1,104 @@

# Evaluation

We use [Tango](https://github.com/allenai/tango) and catwalk to build the evaluation pipeline.
The catwalk code for this pipeline lives on the [`olmo-eval` branch](https://github.com/allenai/catwalk/tree/olmo-eval).

### Creating an evaluation config

The evaluation pipeline runs over the cross product of the models to be evaluated and the selected task sets.

1. Ensure that the model paths are present in a `gs://` or `s3://` location.
2. Copy `evaluation/experiments/test_config.jsonnet` to a new config, e.g. `evaluation/experiments/experiment_YYYY_MM_DD.jsonnet`.
3. Add your models and choose the relevant task sets from [experiments/task_sets](evaluation/experiments/task_sets), as in the sketch below.
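A new config follows the shape of `evaluation_2023_07_26.jsonnet` (added in this commit). A minimal sketch, with a placeholder model path and a single task set:

```jsonnet
local utils = import 'utils.libsonnet';
local rc20_tasks = import 'task_sets/rc20_tasks.libsonnet';

// Name of the Google Sheet to append results to, or null to skip uploading.
local gsheet = null;

local models = [
    {
        model_path: "s3://your-bucket/your-model",  // placeholder: an unsharded checkpoint path
        gpus_needed: 1,
    }
];

local task_sets = [
    rc20_tasks.task_set,
];

{
    steps: utils.create_pipeline(models, task_sets, gsheet)
}
```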

### Running the pipeline

#### Basic setup

```commandline
export GITHUB_TOKEN="<your token>"  # Needed for beaker to clone the repo.
export GOOGLE_TOKEN="<google credentials>"  # Needed if you use a GS workspace (alternatively, run `gcloud auth login`).
```

#### If specifying a Google Sheet to write results to

* Share the Google Sheet with `[email protected]`.
* Create an API JSON key for the service account and download it from [here](https://console.cloud.google.com/iam-admin/serviceaccounts/details/101308414346962828659;edit=true/keys?project=ai2-allennlp).
* Add a beaker secret:

```python
from tango.integrations.beaker.common import get_client

beaker = get_client("<beaker_workspace>")

# Store the service-account credentials as a beaker secret.
with open("credentials_file.json") as f:
    beaker.secret.write("GDRIVE_SERVICE_ACCOUNT_JSON", f.read())
```

When running locally, export the credentials as an environment variable instead:

```commandline
export GDRIVE_SERVICE_ACCOUNT_JSON=$(cat credentials_file.json)
```

#### Run locally

```commandline
tango run evaluation/experiments/test_config.jsonnet -w your-local-workspace --include-package evaluation.steps
```

#### Run on beaker

* Update `evaluation/tango-in-beaker.yml` (the fields that should be updated are marked).

```commandline
tango --settings evaluation/tango-in-beaker.yml run evaluation/experiments/test_config.jsonnet
```

### Viewing results

If you specify `gsheet` in your config, results are appended to that Google Sheet.

All intermediate and final results are also saved to the specified workspace and can be accessed as follows:

```python
from tango import Workspace

workspace = Workspace.from_url("gs://your-workspace-url")
# Fetch the output of the final "combine-all-outputs" step.
result = workspace.step_result("combine-all-outputs")
```


### Adding new task sets

A task set is of the form:

```jsonnet
{
    name: "<Name of the task set>",
    tasks: [
        {
            task_name: "<One of the tasks present in `TASKS_LM` or `TASKS`>",
            task_kwargs: "<task-specific kwargs (see eval_suite for examples)>",
            prediction_kwargs: "<kwargs on how to evaluate the model on this task>"
        }
    ]
}
```

1. Add new task sets under `evaluation/experiments/task_sets` (Current full sets: `gen_tasks.libsonnet`, `eval_suite_ppl_val_v2_small.libsonnet`, `rc20_tasks.libsonnet`, `summary_tasks.libsonnet`).
2. The list of potential tasks can be seen by running `python evaluation/see_available_tasks.py`.
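For example, a small task set following this form might look like the sketch below; the set name is hypothetical, and the kwargs mirror `rc20_tasks.libsonnet`:

```jsonnet
{
    name: "my_rc_subset",  // hypothetical task set name
    tasks: [
        {
            task_name: "arc_easy",  // must be registered in catwalk's TASKS_LM or TASKS
            task_kwargs: {},
            prediction_kwargs: {
                split: "validation",
                limit: 1000,
                num_shots: 0,
                num_recorded_inputs: 3,
                model_max_length: 2048
            }
        }
    ]
}
```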


#### Adding a new dataset to our perplexity eval set

1. Add the new set under the current ppl data at `/net/nfs.cirrascale/allennlp/akshitab/eval_data`.
2. Add the name of the folder to `experiments/task_sets/eval_suite_ppl_val_v2_small.libsonnet`.
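Concretely, the new folder's name (here, a hypothetical `my_new_domain`) is appended to the task-name list in that file:

```jsonnet
local task_dicts = create_task_kwargs(
    ["4chan", "c4_100_domains", "c4_en", "gab", "ice", "m2d2_s2orc", "m2d2_wiki",
     "manosphere", "mc4_en", "pile", "ptb", "twitterAEE", "wikitext_103",
     "my_new_domain"]  // hypothetical new dataset folder
);
```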

#### Adding tasks already present in catwalk

1. See `gen_tasks.libsonnet` for a simple example.
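As a minimal sketch (the set name is hypothetical; the values mirror `gen_tasks.libsonnet`), an existing catwalk task can be wired in by name via the `task_utils` helpers:

```jsonnet
local task_utils = import 'task_utils.libsonnet';

{
    task_set: task_utils.create_task_set_from_task_names(
        "my_gen_tasks",   // hypothetical task set name
        ["drop"],         // any task already registered in catwalk
        { split: "validation", model_max_length: task_utils.model_max_length },  // prediction_kwargs
        {}                // task_kwargs
    )
}
```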

#### Adding new tasks to catwalk

(TODO: catwalk needs better documentation on adding new tasks).
1. See examples [here](https://github.com/allenai/catwalk/tree/olmo-eval/catwalk/tasks).
2. Add newly created tasks to [TASKS_LM](https://github.com/allenai/catwalk/blob/olmo-eval/catwalk/tasks/tasks_lm.py)
or [TASKS](https://github.com/allenai/catwalk/blob/olmo-eval/catwalk/tasks/__init__.py).
Empty file added evaluation/__init__.py
8 changes: 8 additions & 0 deletions evaluation/empty_workspace.py
@@ -0,0 +1,8 @@
import sys

from tango.integrations.gs.common import empty_bucket, empty_datastore

# Usage: python evaluation/empty_workspace.py <bucket_name>
if __name__ == "__main__":
    bucket_name = sys.argv[1]
    empty_bucket(bucket_name)
    empty_datastore(bucket_name)
46 changes: 46 additions & 0 deletions evaluation/experiments/evaluation_2023_07_26.jsonnet
@@ -0,0 +1,46 @@
/*--------------------------------------- Configurations -----------------------------------------*/

local utils = import 'utils.libsonnet';

local rc20_tasks = import 'task_sets/rc20_tasks.libsonnet';
local gen_tasks = import 'task_sets/gen_tasks.libsonnet';
local summary_tasks = import 'task_sets/summary_tasks.libsonnet';
local ppl_suite = import 'task_sets/eval_suite_ppl_val_v2_small.libsonnet';


//❗Set gsheet to the name of your google sheet.
// Set it to null if you do not want your results to be uploaded to a google sheet (they will still be saved as an object).
//local gsheet = "auto-gsheet-test";
local gsheet = null;

// Models to evaluate

local models = [
    {
        model_path: "s3://ai2-llm/test_fixtures/olmo-1b", //❗Specify olmo unsharded checkpoint path
        gpus_needed: 1,
        trust_remote_code: true
    },
    {
        model_path: "EleutherAI/pythia-1b",
        revision: "step140000", //❗Specify checkpoint if needed
        gpus_needed: 1,
        //❗Task sets contain default values for prediction_kwargs. These can be overridden for each model here.
        prediction_kwargs: {
            model_max_length: 2048,
            max_batch_tokens: 20480,
        }
    }
];

local task_sets = [
    rc20_tasks.task_set,
    gen_tasks.task_set,
    summary_tasks.task_set,
    ppl_suite.task_set
];


{
    steps: utils.create_pipeline(models, task_sets, gsheet)
}
36 changes: 36 additions & 0 deletions evaluation/experiments/task_sets/eval_suite_ppl_val_v2_small.libsonnet
@@ -0,0 +1,36 @@

local task_utils = import 'task_utils.libsonnet';

local common_kwargs = {
    task_name: "ppl_custom",
    task_kwargs: {
        keep_instance_fields: ["orig_file_name", "source", "subdomain"],
    },
    prediction_kwargs: {
        split: "validation",
        model_max_length: task_utils.model_max_length,
    }
};

// TODO: refactor catwalk's Perplexity task so that it actually uses the s3 path.
// until then, let the path be present in nfs ($EVAL_DATA_PATH).
local data_dir = "olmo-ppl-val-v2-small/";

local create_task_kwargs(task_names) = [
    {
        task_kwargs: {
            task_rename: "ppl_" + task_name + "_small",
            files: [data_dir + "/" + task_name + "/val"]
        }
    }
    for task_name in task_names
];

local task_dicts = create_task_kwargs(
    ["4chan", "c4_100_domains", "c4_en", "gab", "ice", "m2d2_s2orc", "m2d2_wiki",
     "manosphere", "mc4_en", "pile", "ptb", "twitterAEE", "wikitext_103"]
);

{
    task_set: task_utils.create_task_set_from_task_dicts("eval_suite", task_dicts, common_kwargs)
}
18 changes: 18 additions & 0 deletions evaluation/experiments/task_sets/gen_tasks.libsonnet
@@ -0,0 +1,18 @@

local task_utils = import 'task_utils.libsonnet';

local name = "gen_tasks";
local task_names = ["drop", "naturalqs_short_open"];
local prediction_kwargs = {
    split: "validation",
    limit: 1000,
    num_shots: 5,
    fewshot_seed: 1234,
    num_recorded_inputs: 3,
    model_max_length: task_utils.model_max_length
};
local task_kwargs = {};

{
    task_set: task_utils.create_task_set_from_task_names(name, task_names, prediction_kwargs, task_kwargs)
}
19 changes: 19 additions & 0 deletions evaluation/experiments/task_sets/rc20_tasks.libsonnet
@@ -0,0 +1,19 @@

local task_utils = import 'task_utils.libsonnet';

local name = "rc20_tasks";
local task_names = ["arc_challenge", "arc_easy", "boolq", "copa", "headqa_en", "hellaswag", "logiqa", "mathqa", "mrpc",
                    "openbookqa", "piqa", "qnli", "qqp", "rte", "sciq", "sst", "wic", "winogrande", "wnli", "wsc"];

local prediction_kwargs = {
    split: "validation",
    limit: 1000,
    num_shots: 0,
    num_recorded_inputs: 3,
    model_max_length: task_utils.model_max_length
};
local task_kwargs = {};

{
    task_set: task_utils.create_task_set_from_task_names(name, task_names, prediction_kwargs, task_kwargs)
}
18 changes: 18 additions & 0 deletions evaluation/experiments/task_sets/summary_tasks.libsonnet
@@ -0,0 +1,18 @@

local task_utils = import 'task_utils.libsonnet';

local name = "summary_tasks";
local task_names = ["scitldr", "xsum"];
local prediction_kwargs = {
    split: "validation",
    limit: 1000,
    num_shots: 1,
    fewshot_seed: 1234,
    num_recorded_inputs: 3,
    model_max_length: task_utils.model_max_length
};
local task_kwargs = {};

{
    task_set: task_utils.create_task_set_from_task_names(name, task_names, prediction_kwargs, task_kwargs)
}
31 changes: 31 additions & 0 deletions evaluation/experiments/task_sets/task_utils.libsonnet
@@ -0,0 +1,31 @@

local create_task_set_from_task_dicts(name, task_dicts, common_kwargs) = {
    name: name,
    tasks: std.map(
        function(task_dict) common_kwargs + {
            task_name: std.get(task_dict, "task_name", std.get(common_kwargs, "task_name")),
            prediction_kwargs: std.get(common_kwargs, "prediction_kwargs", {}) + std.get(task_dict, "prediction_kwargs", {}),
            task_kwargs: std.get(common_kwargs, "task_kwargs", {}) + std.get(task_dict, "task_kwargs", {})
        },
        task_dicts
    )
};

local create_task_set_from_task_names(name, task_names, prediction_kwargs, task_kwargs) = {
    name: name,
    tasks: std.map(
        function(task_name) {
            task_name: task_name,
            prediction_kwargs: prediction_kwargs,
            task_kwargs: task_kwargs
        },
        task_names
    )
};

{
    model_max_length: 2048,
    max_batch_tokens: 2048,
    create_task_set_from_task_names: create_task_set_from_task_names,
    create_task_set_from_task_dicts: create_task_set_from_task_dicts
}
@@ -0,0 +1,37 @@

local task_utils = import '../task_utils.libsonnet';

local task_set_name = "eval_suite";

local common_kwargs = {
    task_name: "ppl_custom",
    task_kwargs: {
        keep_instance_fields: ["orig_file_name", "source", "subdomain"],
    },
    prediction_kwargs: {
        split: "validation",
        model_max_length: 256,
    }
};

// TODO: refactor catwalk's Perplexity task so that it actually uses the s3 path.
// until then, let the path be present in nfs
local data_dir = "test_fixtures/evaluation/ppl-test-data";

local create_task_kwargs(task_names) = [
    {
        task_kwargs: {
            task_rename: "ppl_" + task_name + "_small",
            files: [data_dir + "/" + task_name + "/val"]
        }
    }
    for task_name in task_names
];

local task_dicts = create_task_kwargs(
    ["4chan", "c4_100_domains"]
);

{
    task_set: task_utils.create_task_set_from_task_dicts(task_set_name, task_dicts, common_kwargs)
}
@@ -0,0 +1,18 @@

local task_utils = import '../task_utils.libsonnet';

local name = "gen_tasks";
local task_names = ["drop"];
local prediction_kwargs = {
    split: "validation",
    limit: 1000,
    num_shots: 5,
    fewshot_seed: 1234,
    num_recorded_inputs: 3,
    model_max_length: 256
};
local task_kwargs = {};

{
    task_set: task_utils.create_task_set_from_task_names(name, task_names, prediction_kwargs, task_kwargs)
}
@@ -0,0 +1,18 @@

local task_utils = import '../task_utils.libsonnet';

local name = "rc20_tasks";
local task_names = ["boolq", "arc_easy"];
local prediction_kwargs = {
    split: "validation",
    limit: 1000,
    num_shots: 0,
    num_recorded_inputs: 3,
    //model_max_length: task_utils.model_max_length
    model_max_length: 256
};
local task_kwargs = {};

{
    task_set: task_utils.create_task_set_from_task_names(name, task_names, prediction_kwargs, task_kwargs)
}