From 121b7096ab608a3ef8a73957c0f6efae053b5f15 Mon Sep 17 00:00:00 2001 From: Fabrizio Milo Date: Mon, 2 May 2022 18:09:18 -0700 Subject: [PATCH] add pre-commit --- .coveragerc | 4 +- .flake8 | 5 + .github/workflows/pull_request.yml | 13 + .gitignore | 2 +- .pre-commit-config.yaml | 42 ++ README.md | 8 +- docs/decontamination.md | 5 +- docs/task_guide.md | 18 +- lm_eval/base.py | 258 +++++---- lm_eval/datasets/README.md | 4 +- lm_eval/datasets/arithmetic/arithmetic.py | 85 ++- .../datasets/arithmetic/dataset_infos.json | 2 +- lm_eval/datasets/asdiv/asdiv.py | 13 +- lm_eval/datasets/asdiv/dataset_infos.json | 2 +- lm_eval/datasets/coqa/coqa.py | 93 +-- lm_eval/datasets/coqa/dataset_infos.json | 2 +- lm_eval/datasets/drop/dataset_infos.json | 2 +- lm_eval/datasets/drop/drop.py | 101 ++-- lm_eval/datasets/gsm8k/dataset_infos.json | 2 +- lm_eval/datasets/gsm8k/gsm8k.py | 18 +- lm_eval/datasets/headqa/dataset_infos.json | 2 +- lm_eval/datasets/headqa/headqa.py | 30 +- .../hendrycks_ethics/dataset_infos.json | 2 +- .../hendrycks_ethics/hendrycks_ethics.py | 91 +-- .../hendrycks_math/dataset_infos.json | 2 +- .../datasets/hendrycks_math/hendrycks_math.py | 26 +- lm_eval/datasets/lambada/dataset_infos.json | 2 +- lm_eval/datasets/lambada/lambada.py | 40 +- lm_eval/datasets/logiqa/dataset_infos.json | 2 +- lm_eval/datasets/logiqa/logiqa.py | 22 +- lm_eval/datasets/mutual/dataset_infos.json | 2 +- lm_eval/datasets/mutual/mutual.py | 24 +- lm_eval/datasets/pile/dataset_infos.json | 2 +- lm_eval/datasets/pile/pile.py | 5 +- lm_eval/datasets/quac/dataset_infos.json | 2 +- lm_eval/datasets/quac/quac.py | 13 +- .../datasets/sat_analogies/sat_analogies.py | 23 +- lm_eval/datasets/triviaqa/dataset_infos.json | 2 +- lm_eval/datasets/triviaqa/triviaqa.py | 33 +- .../datasets/truthfulqa/dataset_infos.json | 2 +- lm_eval/datasets/truthfulqa/truthfulqa.py | 62 +- .../datasets/unscramble/dataset_infos.json | 2 +- lm_eval/datasets/unscramble/unscramble.py | 5 +- lm_eval/datasets/wikitext/dataset_infos.json | 2 +- lm_eval/datasets/wikitext/wikitext.py | 85 ++- lm_eval/decontamination/archiver.py | 80 +-- lm_eval/decontamination/decontaminate.py | 49 +- lm_eval/decontamination/janitor.py | 78 +-- lm_eval/evaluator.py | 124 ++-- lm_eval/metrics.py | 25 +- lm_eval/models/gpt2.py | 61 +- lm_eval/models/gpt3.py | 57 +- lm_eval/tasks/__init__.py | 57 +- lm_eval/tasks/anli.py | 47 +- lm_eval/tasks/arithmetic.py | 12 +- lm_eval/tasks/asdiv.py | 31 +- lm_eval/tasks/blimp.py | 14 +- lm_eval/tasks/cbt.py | 20 +- lm_eval/tasks/coqa.py | 65 ++- lm_eval/tasks/drop.py | 64 ++- lm_eval/tasks/glue.py | 131 ++--- lm_eval/tasks/gsm8k.py | 34 +- lm_eval/tasks/headqa.py | 8 +- lm_eval/tasks/hellaswag.py | 154 ++--- lm_eval/tasks/hendrycks_ethics.py | 135 +++-- lm_eval/tasks/hendrycks_math.py | 47 +- lm_eval/tasks/hendrycks_test.py | 107 +++- lm_eval/tasks/lambada.py | 27 +- lm_eval/tasks/lambada_cloze.py | 8 +- lm_eval/tasks/lambada_multilingual.py | 22 +- lm_eval/tasks/logiqa.py | 25 +- lm_eval/tasks/mathqa.py | 11 +- lm_eval/tasks/mc_taco.py | 35 +- lm_eval/tasks/mutual.py | 22 +- lm_eval/tasks/naturalqs.py | 260 ++++----- lm_eval/tasks/openbookqa.py | 142 ++--- lm_eval/tasks/piqa.py | 2 +- lm_eval/tasks/prost.py | 17 +- lm_eval/tasks/pubmedqa.py | 26 +- lm_eval/tasks/qa4mre.py | 8 +- lm_eval/tasks/qasper.py | 2 +- lm_eval/tasks/quac.py | 235 ++++---- lm_eval/tasks/race.py | 91 +-- lm_eval/tasks/sat.py | 12 +- lm_eval/tasks/sciq.py | 4 +- lm_eval/tasks/squad.py | 388 +++++++------ lm_eval/tasks/storycloze.py | 49 +- 
lm_eval/tasks/superglue.py | 166 +++--- lm_eval/tasks/translation.py | 31 +- lm_eval/tasks/triviaqa.py | 18 +- lm_eval/tasks/truthfulqa.py | 127 ++--- lm_eval/tasks/unscramble.py | 12 +- lm_eval/tasks/webqs.py | 20 +- lm_eval/tasks/wikitext.py | 4 +- lm_eval/tasks/winogrande.py | 270 +++++---- lm_eval/tasks/wsc273.py | 31 +- lm_eval/utils.py | 79 +-- main.py | 44 +- pile_statistics.json | 2 +- scripts/clean_training_data/README.md | 11 +- .../compress_and_package.py | 32 +- .../clean_training_data/generate_13_grams.py | 37 +- .../clean_training_data/investigate_pile.py | 39 +- scripts/clean_training_data/janitor_util.cpp | 343 ++++++------ .../process_sorted_buckets.py | 40 +- .../sort_13_gram_buckets.py | 13 +- scripts/cost_estimate.py | 42 +- scripts/get_prompts.py | 13 +- scripts/make_gpt2_test_cases.py | 22 +- scripts/make_table_tasks.py | 19 +- scripts/write_out.py | 38 +- setup.py | 6 +- tests/test_evaluator.py | 37 +- tests/test_generate_13_grams.py | 36 +- tests/test_gpt3.py | 114 ++-- tests/test_janitor.py | 530 ++++++++++-------- tests/test_models.py | 150 +++-- tests/test_tasks.py | 20 +- tests/test_utils.py | 28 +- tests/test_version_stable.py | 64 ++- tests/testdata/anagrams1-v0-greedy_until | 2 +- tests/testdata/anagrams1-v0-res.json | 2 +- tests/testdata/anagrams2-v0-greedy_until | 2 +- tests/testdata/anagrams2-v0-res.json | 2 +- tests/testdata/anli_r1-v0-loglikelihood | 2 +- tests/testdata/anli_r1-v0-res.json | 2 +- tests/testdata/anli_r2-v0-loglikelihood | 2 +- tests/testdata/anli_r2-v0-res.json | 2 +- tests/testdata/anli_r3-v0-loglikelihood | 2 +- tests/testdata/anli_r3-v0-res.json | 2 +- tests/testdata/arc_challenge-v0-loglikelihood | 2 +- tests/testdata/arc_challenge-v0-res.json | 2 +- tests/testdata/arc_easy-v0-loglikelihood | 2 +- tests/testdata/arc_easy-v0-res.json | 2 +- .../testdata/arithmetic_1dc-v0-loglikelihood | 2 +- tests/testdata/arithmetic_1dc-v0-res.json | 2 +- .../testdata/arithmetic_2da-v0-loglikelihood | 2 +- tests/testdata/arithmetic_2da-v0-res.json | 2 +- .../testdata/arithmetic_2dm-v0-loglikelihood | 2 +- tests/testdata/arithmetic_2dm-v0-res.json | 2 +- .../testdata/arithmetic_2ds-v0-loglikelihood | 2 +- tests/testdata/arithmetic_2ds-v0-res.json | 2 +- .../testdata/arithmetic_3da-v0-loglikelihood | 2 +- tests/testdata/arithmetic_3da-v0-res.json | 2 +- .../testdata/arithmetic_3ds-v0-loglikelihood | 2 +- tests/testdata/arithmetic_3ds-v0-res.json | 2 +- .../testdata/arithmetic_4da-v0-loglikelihood | 2 +- tests/testdata/arithmetic_4da-v0-res.json | 2 +- .../testdata/arithmetic_4ds-v0-loglikelihood | 2 +- tests/testdata/arithmetic_4ds-v0-res.json | 2 +- .../testdata/arithmetic_5da-v0-loglikelihood | 2 +- tests/testdata/arithmetic_5da-v0-res.json | 2 +- .../testdata/arithmetic_5ds-v0-loglikelihood | 2 +- tests/testdata/arithmetic_5ds-v0-res.json | 2 +- tests/testdata/boolq-v0-loglikelihood | 2 +- tests/testdata/boolq-v0-res.json | 2 +- tests/testdata/boolq-v1-loglikelihood | 2 +- tests/testdata/boolq-v1-res.json | 2 +- tests/testdata/cb-v0-loglikelihood | 2 +- tests/testdata/cb-v0-res.json | 2 +- tests/testdata/cb-v1-loglikelihood | 2 +- tests/testdata/cb-v1-res.json | 2 +- tests/testdata/cola-v0-loglikelihood | 2 +- tests/testdata/cola-v0-res.json | 2 +- tests/testdata/copa-v0-loglikelihood | 2 +- tests/testdata/copa-v0-res.json | 2 +- tests/testdata/coqa-v0-greedy_until | 2 +- tests/testdata/coqa-v0-res.json | 2 +- tests/testdata/coqa-v1-greedy_until | 2 +- tests/testdata/coqa-v1-res.json | 2 +- tests/testdata/cycle_letters-v0-greedy_until | 2 +- 
tests/testdata/cycle_letters-v0-res.json | 2 +- tests/testdata/drop-v0-greedy_until | 2 +- tests/testdata/drop-v0-res.json | 2 +- tests/testdata/drop-v1-greedy_until | 2 +- tests/testdata/drop-v1-res.json | 2 +- tests/testdata/ethics_cm-v0-loglikelihood | 2 +- tests/testdata/ethics_cm-v0-res.json | 2 +- .../ethics_deontology-v0-loglikelihood | 2 +- tests/testdata/ethics_deontology-v0-res.json | 2 +- .../testdata/ethics_justice-v0-loglikelihood | 2 +- tests/testdata/ethics_justice-v0-res.json | 2 +- .../ethics_utilitarianism-v0-loglikelihood | 2 +- .../ethics_utilitarianism-v0-res.json | 2 +- ...s_utilitarianism_original-v0-loglikelihood | 2 +- ...ethics_utilitarianism_original-v0-res.json | 2 +- tests/testdata/ethics_virtue-v0-loglikelihood | 2 +- tests/testdata/ethics_virtue-v0-res.json | 2 +- tests/testdata/gsm8k-v0-greedy_until | 2 +- tests/testdata/gsm8k-v0-res.json | 2 +- tests/testdata/headqa-v0-loglikelihood | 2 +- tests/testdata/headqa-v0-res.json | 2 +- tests/testdata/headqa_en-v0-loglikelihood | 2 +- tests/testdata/headqa_en-v0-res.json | 2 +- tests/testdata/headqa_es-v0-loglikelihood | 2 +- tests/testdata/headqa_es-v0-res.json | 2 +- tests/testdata/hellaswag-v0-loglikelihood | 2 +- tests/testdata/hellaswag-v0-res.json | 2 +- ...ycksTest-abstract_algebra-v0-loglikelihood | 2 +- ...hendrycksTest-abstract_algebra-v0-res.json | 2 +- .../hendrycksTest-anatomy-v0-loglikelihood | 2 +- .../hendrycksTest-anatomy-v0-res.json | 2 +- .../hendrycksTest-astronomy-v0-loglikelihood | 2 +- .../hendrycksTest-astronomy-v0-res.json | 2 +- ...rycksTest-business_ethics-v0-loglikelihood | 2 +- .../hendrycksTest-business_ethics-v0-res.json | 2 +- ...ksTest-clinical_knowledge-v0-loglikelihood | 2 +- ...ndrycksTest-clinical_knowledge-v0-res.json | 2 +- ...rycksTest-college_biology-v0-loglikelihood | 2 +- .../hendrycksTest-college_biology-v0-res.json | 2 +- ...cksTest-college_chemistry-v0-loglikelihood | 2 +- ...endrycksTest-college_chemistry-v0-res.json | 2 +- ...-college_computer_science-v0-loglikelihood | 2 +- ...sTest-college_computer_science-v0-res.json | 2 +- ...sTest-college_mathematics-v0-loglikelihood | 2 +- ...drycksTest-college_mathematics-v0-res.json | 2 +- ...ycksTest-college_medicine-v0-loglikelihood | 2 +- ...hendrycksTest-college_medicine-v0-res.json | 2 +- ...rycksTest-college_physics-v0-loglikelihood | 2 +- .../hendrycksTest-college_physics-v0-res.json | 2 +- ...cksTest-computer_security-v0-loglikelihood | 2 +- ...endrycksTest-computer_security-v0-res.json | 2 +- ...ksTest-conceptual_physics-v0-loglikelihood | 2 +- ...ndrycksTest-conceptual_physics-v0-res.json | 2 +- ...endrycksTest-econometrics-v0-loglikelihood | 2 +- .../hendrycksTest-econometrics-v0-res.json | 2 +- ...st-electrical_engineering-v0-loglikelihood | 2 +- ...cksTest-electrical_engineering-v0-res.json | 2 +- ...st-elementary_mathematics-v0-loglikelihood | 2 +- ...cksTest-elementary_mathematics-v0-res.json | 2 +- ...endrycksTest-formal_logic-v0-loglikelihood | 2 +- .../hendrycksTest-formal_logic-v0-res.json | 2 +- ...endrycksTest-global_facts-v0-loglikelihood | 2 +- .../hendrycksTest-global_facts-v0-res.json | 2 +- ...sTest-high_school_biology-v0-loglikelihood | 2 +- ...drycksTest-high_school_biology-v0-res.json | 2 +- ...est-high_school_chemistry-v0-loglikelihood | 2 +- ...ycksTest-high_school_chemistry-v0-res.json | 2 +- ...h_school_computer_science-v0-loglikelihood | 2 +- ...t-high_school_computer_science-v0-res.json | 2 +- ...h_school_european_history-v0-loglikelihood | 2 +- ...t-high_school_european_history-v0-res.json 
| 2 +- ...est-high_school_geography-v0-loglikelihood | 2 +- ...ycksTest-high_school_geography-v0-res.json | 2 +- ...l_government_and_politics-v0-loglikelihood | 2 +- ...school_government_and_politics-v0-res.json | 2 +- ...igh_school_macroeconomics-v0-loglikelihood | 2 +- ...est-high_school_macroeconomics-v0-res.json | 2 +- ...t-high_school_mathematics-v0-loglikelihood | 2 +- ...ksTest-high_school_mathematics-v0-res.json | 2 +- ...igh_school_microeconomics-v0-loglikelihood | 2 +- ...est-high_school_microeconomics-v0-res.json | 2 +- ...sTest-high_school_physics-v0-loglikelihood | 2 +- ...drycksTest-high_school_physics-v0-res.json | 2 +- ...st-high_school_psychology-v0-loglikelihood | 2 +- ...cksTest-high_school_psychology-v0-res.json | 2 +- ...st-high_school_statistics-v0-loglikelihood | 2 +- ...cksTest-high_school_statistics-v0-res.json | 2 +- ...st-high_school_us_history-v0-loglikelihood | 2 +- ...cksTest-high_school_us_history-v0-res.json | 2 +- ...high_school_world_history-v0-loglikelihood | 2 +- ...Test-high_school_world_history-v0-res.json | 2 +- ...hendrycksTest-human_aging-v0-loglikelihood | 2 +- .../hendrycksTest-human_aging-v0-res.json | 2 +- ...rycksTest-human_sexuality-v0-loglikelihood | 2 +- .../hendrycksTest-human_sexuality-v0-res.json | 2 +- ...cksTest-international_law-v0-loglikelihood | 2 +- ...endrycksTest-international_law-v0-res.json | 2 +- ...ndrycksTest-jurisprudence-v0-loglikelihood | 2 +- .../hendrycksTest-jurisprudence-v0-res.json | 2 +- ...cksTest-logical_fallacies-v0-loglikelihood | 2 +- ...endrycksTest-logical_fallacies-v0-res.json | 2 +- ...ycksTest-machine_learning-v0-loglikelihood | 2 +- ...hendrycksTest-machine_learning-v0-res.json | 2 +- .../hendrycksTest-management-v0-loglikelihood | 2 +- .../hendrycksTest-management-v0-res.json | 2 +- .../hendrycksTest-marketing-v0-loglikelihood | 2 +- .../hendrycksTest-marketing-v0-res.json | 2 +- ...ycksTest-medical_genetics-v0-loglikelihood | 2 +- ...hendrycksTest-medical_genetics-v0-res.json | 2 +- ...ndrycksTest-miscellaneous-v0-loglikelihood | 2 +- .../hendrycksTest-miscellaneous-v0-res.json | 2 +- ...drycksTest-moral_disputes-v0-loglikelihood | 2 +- .../hendrycksTest-moral_disputes-v0-res.json | 2 +- ...rycksTest-moral_scenarios-v0-loglikelihood | 2 +- .../hendrycksTest-moral_scenarios-v0-res.json | 2 +- .../hendrycksTest-nutrition-v0-loglikelihood | 2 +- .../hendrycksTest-nutrition-v0-res.json | 2 +- .../hendrycksTest-philosophy-v0-loglikelihood | 2 +- .../hendrycksTest-philosophy-v0-res.json | 2 +- .../hendrycksTest-prehistory-v0-loglikelihood | 2 +- .../hendrycksTest-prehistory-v0-res.json | 2 +- ...t-professional_accounting-v0-loglikelihood | 2 +- ...ksTest-professional_accounting-v0-res.json | 2 +- ...ycksTest-professional_law-v0-loglikelihood | 2 +- ...hendrycksTest-professional_law-v0-res.json | 2 +- ...est-professional_medicine-v0-loglikelihood | 2 +- ...ycksTest-professional_medicine-v0-res.json | 2 +- ...t-professional_psychology-v0-loglikelihood | 2 +- ...ksTest-professional_psychology-v0-res.json | 2 +- ...ycksTest-public_relations-v0-loglikelihood | 2 +- ...hendrycksTest-public_relations-v0-res.json | 2 +- ...ycksTest-security_studies-v0-loglikelihood | 2 +- ...hendrycksTest-security_studies-v0-res.json | 2 +- .../hendrycksTest-sociology-v0-loglikelihood | 2 +- .../hendrycksTest-sociology-v0-res.json | 2 +- ...cksTest-us_foreign_policy-v0-loglikelihood | 2 +- ...endrycksTest-us_foreign_policy-v0-res.json | 2 +- .../hendrycksTest-virology-v0-loglikelihood | 2 +- .../hendrycksTest-virology-v0-res.json | 2 
+- ...rycksTest-world_religions-v0-loglikelihood | 2 +- .../hendrycksTest-world_religions-v0-res.json | 2 +- tests/testdata/iwslt17-ar-en-v0-greedy_until | 2 +- tests/testdata/iwslt17-ar-en-v0-res.json | 2 +- tests/testdata/iwslt17-en-ar-v0-greedy_until | 2 +- tests/testdata/iwslt17-en-ar-v0-res.json | 2 +- tests/testdata/lambada-v0-loglikelihood | 2 +- tests/testdata/lambada-v0-res.json | 2 +- tests/testdata/lambada_cloze-v0-loglikelihood | 2 +- tests/testdata/lambada_cloze-v0-res.json | 2 +- tests/testdata/lambada_mt_de-v0-loglikelihood | 2 +- tests/testdata/lambada_mt_de-v0-res.json | 2 +- tests/testdata/lambada_mt_en-v0-loglikelihood | 2 +- tests/testdata/lambada_mt_en-v0-res.json | 2 +- tests/testdata/lambada_mt_es-v0-loglikelihood | 2 +- tests/testdata/lambada_mt_es-v0-res.json | 2 +- tests/testdata/lambada_mt_fr-v0-loglikelihood | 2 +- tests/testdata/lambada_mt_fr-v0-res.json | 2 +- tests/testdata/lambada_mt_it-v0-loglikelihood | 2 +- tests/testdata/lambada_mt_it-v0-res.json | 2 +- tests/testdata/logiqa-v0-loglikelihood | 2 +- tests/testdata/logiqa-v0-res.json | 2 +- tests/testdata/math_algebra-v0-greedy_until | 2 +- tests/testdata/math_algebra-v0-res.json | 2 +- tests/testdata/math_algebra-v1-greedy_until | 2 +- tests/testdata/math_algebra-v1-res.json | 2 +- .../math_counting_and_prob-v0-greedy_until | 2 +- .../math_counting_and_prob-v0-res.json | 2 +- .../math_counting_and_prob-v1-greedy_until | 2 +- .../math_counting_and_prob-v1-res.json | 2 +- tests/testdata/math_geometry-v0-greedy_until | 2 +- tests/testdata/math_geometry-v0-res.json | 2 +- tests/testdata/math_geometry-v1-greedy_until | 2 +- tests/testdata/math_geometry-v1-res.json | 2 +- .../math_intermediate_algebra-v0-greedy_until | 2 +- .../math_intermediate_algebra-v0-res.json | 2 +- .../math_intermediate_algebra-v1-greedy_until | 2 +- .../math_intermediate_algebra-v1-res.json | 2 +- .../testdata/math_num_theory-v0-greedy_until | 2 +- tests/testdata/math_num_theory-v0-res.json | 2 +- .../testdata/math_num_theory-v1-greedy_until | 2 +- tests/testdata/math_num_theory-v1-res.json | 2 +- .../testdata/math_prealgebra-v0-greedy_until | 2 +- tests/testdata/math_prealgebra-v0-res.json | 2 +- .../testdata/math_prealgebra-v1-greedy_until | 2 +- tests/testdata/math_prealgebra-v1-res.json | 2 +- tests/testdata/math_precalc-v0-greedy_until | 2 +- tests/testdata/math_precalc-v0-res.json | 2 +- tests/testdata/math_precalc-v1-greedy_until | 2 +- tests/testdata/math_precalc-v1-res.json | 2 +- tests/testdata/mathqa-v0-loglikelihood | 2 +- tests/testdata/mathqa-v0-res.json | 2 +- tests/testdata/mc_taco-v0-loglikelihood | 2 +- tests/testdata/mc_taco-v0-res.json | 2 +- tests/testdata/mnli-v0-loglikelihood | 2 +- tests/testdata/mnli-v0-res.json | 2 +- .../testdata/mnli_mismatched-v0-loglikelihood | 2 +- tests/testdata/mnli_mismatched-v0-res.json | 2 +- tests/testdata/mrpc-v0-loglikelihood | 2 +- tests/testdata/mrpc-v0-res.json | 2 +- tests/testdata/multirc-v0-loglikelihood | 2 +- tests/testdata/multirc-v0-res.json | 2 +- tests/testdata/multirc-v1-loglikelihood | 2 +- tests/testdata/multirc-v1-res.json | 2 +- tests/testdata/mutual-v0-loglikelihood | 2 +- tests/testdata/mutual-v0-res.json | 2 +- tests/testdata/mutual-v1-loglikelihood | 2 +- tests/testdata/mutual-v1-res.json | 2 +- tests/testdata/mutual_plus-v0-loglikelihood | 2 +- tests/testdata/mutual_plus-v0-res.json | 2 +- tests/testdata/mutual_plus-v1-loglikelihood | 2 +- tests/testdata/mutual_plus-v1-res.json | 2 +- tests/testdata/openbookqa-v0-loglikelihood | 2 +- 
tests/testdata/openbookqa-v0-res.json | 2 +- .../pile_arxiv-v0-loglikelihood_rolling | 2 +- tests/testdata/pile_arxiv-v0-res.json | 2 +- .../pile_arxiv-v1-loglikelihood_rolling | 2 +- tests/testdata/pile_arxiv-v1-res.json | 2 +- .../pile_bookcorpus2-v0-loglikelihood_rolling | 2 +- tests/testdata/pile_bookcorpus2-v0-res.json | 2 +- .../pile_bookcorpus2-v1-loglikelihood_rolling | 2 +- tests/testdata/pile_bookcorpus2-v1-res.json | 2 +- .../pile_books3-v0-loglikelihood_rolling | 2 +- tests/testdata/pile_books3-v0-res.json | 2 +- .../pile_books3-v1-loglikelihood_rolling | 2 +- tests/testdata/pile_books3-v1-res.json | 2 +- ...le_dm-mathematics-v0-loglikelihood_rolling | 2 +- .../testdata/pile_dm-mathematics-v0-res.json | 2 +- ...le_dm-mathematics-v1-loglikelihood_rolling | 2 +- .../testdata/pile_dm-mathematics-v1-res.json | 2 +- .../pile_enron-v0-loglikelihood_rolling | 2 +- tests/testdata/pile_enron-v0-res.json | 2 +- .../pile_enron-v1-loglikelihood_rolling | 2 +- tests/testdata/pile_enron-v1-res.json | 2 +- .../pile_europarl-v0-loglikelihood_rolling | 2 +- tests/testdata/pile_europarl-v0-res.json | 2 +- .../pile_europarl-v1-loglikelihood_rolling | 2 +- tests/testdata/pile_europarl-v1-res.json | 2 +- .../pile_freelaw-v0-loglikelihood_rolling | 2 +- tests/testdata/pile_freelaw-v0-res.json | 2 +- .../pile_freelaw-v1-loglikelihood_rolling | 2 +- tests/testdata/pile_freelaw-v1-res.json | 2 +- .../pile_github-v0-loglikelihood_rolling | 2 +- tests/testdata/pile_github-v0-res.json | 2 +- .../pile_github-v1-loglikelihood_rolling | 2 +- tests/testdata/pile_github-v1-res.json | 2 +- .../pile_gutenberg-v0-loglikelihood_rolling | 2 +- tests/testdata/pile_gutenberg-v0-res.json | 2 +- .../pile_gutenberg-v1-loglikelihood_rolling | 2 +- tests/testdata/pile_gutenberg-v1-res.json | 2 +- .../pile_hackernews-v0-loglikelihood_rolling | 2 +- tests/testdata/pile_hackernews-v0-res.json | 2 +- .../pile_hackernews-v1-loglikelihood_rolling | 2 +- tests/testdata/pile_hackernews-v1-res.json | 2 +- ...pile_nih-exporter-v0-loglikelihood_rolling | 2 +- tests/testdata/pile_nih-exporter-v0-res.json | 2 +- ...pile_nih-exporter-v1-loglikelihood_rolling | 2 +- tests/testdata/pile_nih-exporter-v1-res.json | 2 +- ...ile_opensubtitles-v0-loglikelihood_rolling | 2 +- tests/testdata/pile_opensubtitles-v0-res.json | 2 +- ...ile_opensubtitles-v1-loglikelihood_rolling | 2 +- tests/testdata/pile_opensubtitles-v1-res.json | 2 +- ...pile_openwebtext2-v0-loglikelihood_rolling | 2 +- tests/testdata/pile_openwebtext2-v0-res.json | 2 +- ...pile_openwebtext2-v1-loglikelihood_rolling | 2 +- tests/testdata/pile_openwebtext2-v1-res.json | 2 +- .../pile_philpapers-v0-loglikelihood_rolling | 2 +- tests/testdata/pile_philpapers-v0-res.json | 2 +- .../pile_philpapers-v1-loglikelihood_rolling | 2 +- tests/testdata/pile_philpapers-v1-res.json | 2 +- .../pile_pile-cc-v0-loglikelihood_rolling | 2 +- tests/testdata/pile_pile-cc-v0-res.json | 2 +- .../pile_pile-cc-v1-loglikelihood_rolling | 2 +- tests/testdata/pile_pile-cc-v1-res.json | 2 +- ..._pubmed-abstracts-v0-loglikelihood_rolling | 2 +- .../pile_pubmed-abstracts-v0-res.json | 2 +- ..._pubmed-abstracts-v1-loglikelihood_rolling | 2 +- .../pile_pubmed-abstracts-v1-res.json | 2 +- ...le_pubmed-central-v0-loglikelihood_rolling | 2 +- .../testdata/pile_pubmed-central-v0-res.json | 2 +- ...le_pubmed-central-v1-loglikelihood_rolling | 2 +- .../testdata/pile_pubmed-central-v1-res.json | 2 +- ...ile_stackexchange-v0-loglikelihood_rolling | 2 +- tests/testdata/pile_stackexchange-v0-res.json | 2 +- 
...ile_stackexchange-v1-loglikelihood_rolling | 2 +- tests/testdata/pile_stackexchange-v1-res.json | 2 +- .../pile_ubuntu-irc-v0-loglikelihood_rolling | 2 +- tests/testdata/pile_ubuntu-irc-v0-res.json | 2 +- .../pile_ubuntu-irc-v1-loglikelihood_rolling | 2 +- tests/testdata/pile_ubuntu-irc-v1-res.json | 2 +- .../pile_uspto-v0-loglikelihood_rolling | 2 +- tests/testdata/pile_uspto-v0-res.json | 2 +- .../pile_uspto-v1-loglikelihood_rolling | 2 +- tests/testdata/pile_uspto-v1-res.json | 2 +- .../pile_wikipedia-v0-loglikelihood_rolling | 2 +- tests/testdata/pile_wikipedia-v0-res.json | 2 +- .../pile_wikipedia-v1-loglikelihood_rolling | 2 +- tests/testdata/pile_wikipedia-v1-res.json | 2 +- ..._youtubesubtitles-v0-loglikelihood_rolling | 2 +- .../pile_youtubesubtitles-v0-res.json | 2 +- ..._youtubesubtitles-v1-loglikelihood_rolling | 2 +- .../pile_youtubesubtitles-v1-res.json | 2 +- tests/testdata/piqa-v0-loglikelihood | 2 +- tests/testdata/piqa-v0-res.json | 2 +- tests/testdata/prost-v0-loglikelihood | 2 +- tests/testdata/prost-v0-res.json | 2 +- tests/testdata/pubmedqa-v0-loglikelihood | 2 +- tests/testdata/pubmedqa-v0-res.json | 2 +- tests/testdata/qa4mre_2011-v0-loglikelihood | 2 +- tests/testdata/qa4mre_2011-v0-res.json | 2 +- tests/testdata/qa4mre_2012-v0-loglikelihood | 2 +- tests/testdata/qa4mre_2012-v0-res.json | 2 +- tests/testdata/qa4mre_2013-v0-loglikelihood | 2 +- tests/testdata/qa4mre_2013-v0-res.json | 2 +- tests/testdata/qnli-v0-loglikelihood | 2 +- tests/testdata/qnli-v0-res.json | 2 +- tests/testdata/qqp-v0-loglikelihood | 2 +- tests/testdata/qqp-v0-res.json | 2 +- tests/testdata/race-v0-loglikelihood | 2 +- tests/testdata/race-v0-res.json | 2 +- .../testdata/random_insertion-v0-greedy_until | 2 +- tests/testdata/random_insertion-v0-res.json | 2 +- tests/testdata/record-v0-loglikelihood | 2 +- tests/testdata/record-v0-res.json | 2 +- tests/testdata/reversed_words-v0-greedy_until | 2 +- tests/testdata/reversed_words-v0-res.json | 2 +- tests/testdata/rte-v0-loglikelihood | 2 +- tests/testdata/rte-v0-res.json | 2 +- tests/testdata/sciq-v0-loglikelihood | 2 +- tests/testdata/sciq-v0-res.json | 2 +- tests/testdata/squad2-v0-greedy_until | 2 +- tests/testdata/squad2-v0-loglikelihood | 2 +- tests/testdata/squad2-v0-res.json | 2 +- tests/testdata/squad2-v1-greedy_until | 2 +- tests/testdata/squad2-v1-loglikelihood | 2 +- tests/testdata/squad2-v1-res.json | 2 +- tests/testdata/sst-v0-loglikelihood | 2 +- tests/testdata/sst-v0-res.json | 2 +- tests/testdata/swag-v0-loglikelihood | 2 +- tests/testdata/swag-v0-res.json | 2 +- tests/testdata/triviaqa-v0-loglikelihood | 2 +- tests/testdata/triviaqa-v0-res.json | 2 +- tests/testdata/truthfulqa_gen-v0-greedy_until | 2 +- tests/testdata/truthfulqa_gen-v0-res.json | 2 +- tests/testdata/truthfulqa_gen-v1-greedy_until | 2 +- tests/testdata/truthfulqa_gen-v1-res.json | 2 +- tests/testdata/truthfulqa_mc-v0-loglikelihood | 2 +- tests/testdata/truthfulqa_mc-v0-res.json | 2 +- tests/testdata/truthfulqa_mc-v1-loglikelihood | 2 +- tests/testdata/truthfulqa_mc-v1-res.json | 2 +- tests/testdata/webqs-v0-loglikelihood | 2 +- tests/testdata/webqs-v0-res.json | 2 +- tests/testdata/wic-v0-loglikelihood | 2 +- tests/testdata/wic-v0-res.json | 2 +- .../wikitext-v0-loglikelihood_rolling | 2 +- tests/testdata/wikitext-v0-res.json | 2 +- .../wikitext-v1-loglikelihood_rolling | 2 +- tests/testdata/wikitext-v1-res.json | 2 +- tests/testdata/winogrande-v0-loglikelihood | 2 +- tests/testdata/winogrande-v0-res.json | 2 +- tests/testdata/wmt14-en-fr-v0-greedy_until | 2 
+- tests/testdata/wmt14-en-fr-v0-res.json | 2 +- tests/testdata/wmt14-fr-en-v0-greedy_until | 2 +- tests/testdata/wmt14-fr-en-v0-res.json | 2 +- tests/testdata/wmt16-de-en-v0-greedy_until | 2 +- tests/testdata/wmt16-de-en-v0-res.json | 2 +- tests/testdata/wmt16-en-de-v0-greedy_until | 2 +- tests/testdata/wmt16-en-de-v0-res.json | 2 +- tests/testdata/wmt16-en-ro-v0-greedy_until | 2 +- tests/testdata/wmt16-en-ro-v0-res.json | 2 +- tests/testdata/wmt16-ro-en-v0-greedy_until | 2 +- tests/testdata/wmt16-ro-en-v0-res.json | 2 +- tests/testdata/wmt20-cs-en-v0-greedy_until | 2 +- tests/testdata/wmt20-cs-en-v0-res.json | 2 +- tests/testdata/wmt20-de-en-v0-greedy_until | 2 +- tests/testdata/wmt20-de-en-v0-res.json | 2 +- tests/testdata/wmt20-de-fr-v0-greedy_until | 2 +- tests/testdata/wmt20-de-fr-v0-res.json | 2 +- tests/testdata/wmt20-en-cs-v0-greedy_until | 2 +- tests/testdata/wmt20-en-cs-v0-res.json | 2 +- tests/testdata/wmt20-en-de-v0-greedy_until | 2 +- tests/testdata/wmt20-en-de-v0-res.json | 2 +- tests/testdata/wmt20-en-iu-v0-greedy_until | 2 +- tests/testdata/wmt20-en-iu-v0-res.json | 2 +- tests/testdata/wmt20-en-ja-v0-greedy_until | 2 +- tests/testdata/wmt20-en-ja-v0-res.json | 2 +- tests/testdata/wmt20-en-ja-v1-greedy_until | 2 +- tests/testdata/wmt20-en-ja-v1-res.json | 2 +- tests/testdata/wmt20-en-km-v0-greedy_until | 2 +- tests/testdata/wmt20-en-km-v0-res.json | 2 +- tests/testdata/wmt20-en-pl-v0-greedy_until | 2 +- tests/testdata/wmt20-en-pl-v0-res.json | 2 +- tests/testdata/wmt20-en-ps-v0-greedy_until | 2 +- tests/testdata/wmt20-en-ps-v0-res.json | 2 +- tests/testdata/wmt20-en-ru-v0-greedy_until | 2 +- tests/testdata/wmt20-en-ru-v0-res.json | 2 +- tests/testdata/wmt20-en-ta-v0-greedy_until | 2 +- tests/testdata/wmt20-en-ta-v0-res.json | 2 +- tests/testdata/wmt20-en-zh-v0-greedy_until | 2 +- tests/testdata/wmt20-en-zh-v0-res.json | 2 +- tests/testdata/wmt20-en-zh-v1-greedy_until | 2 +- tests/testdata/wmt20-en-zh-v1-res.json | 2 +- tests/testdata/wmt20-fr-de-v0-greedy_until | 2 +- tests/testdata/wmt20-fr-de-v0-res.json | 2 +- tests/testdata/wmt20-iu-en-v0-greedy_until | 2 +- tests/testdata/wmt20-iu-en-v0-res.json | 2 +- tests/testdata/wmt20-ja-en-v0-greedy_until | 2 +- tests/testdata/wmt20-ja-en-v0-res.json | 2 +- tests/testdata/wmt20-km-en-v0-greedy_until | 2 +- tests/testdata/wmt20-km-en-v0-res.json | 2 +- tests/testdata/wmt20-pl-en-v0-greedy_until | 2 +- tests/testdata/wmt20-pl-en-v0-res.json | 2 +- tests/testdata/wmt20-ps-en-v0-greedy_until | 2 +- tests/testdata/wmt20-ps-en-v0-res.json | 2 +- tests/testdata/wmt20-ru-en-v0-greedy_until | 2 +- tests/testdata/wmt20-ru-en-v0-res.json | 2 +- tests/testdata/wmt20-ta-en-v0-greedy_until | 2 +- tests/testdata/wmt20-ta-en-v0-res.json | 2 +- tests/testdata/wmt20-zh-en-v0-greedy_until | 2 +- tests/testdata/wmt20-zh-en-v0-res.json | 2 +- tests/testdata/wnli-v0-loglikelihood | 2 +- tests/testdata/wnli-v0-res.json | 2 +- tests/testdata/wnli-v1-loglikelihood | 2 +- tests/testdata/wnli-v1-res.json | 2 +- tests/testdata/wsc-v0-loglikelihood | 2 +- tests/testdata/wsc-v0-res.json | 2 +- tests/testdata/wsc273-v0-loglikelihood | 2 +- tests/testdata/wsc273-v0-res.json | 2 +- .../blimp_adjunct_island-v0-loglikelihood | 2 +- .../testdata/blimp_adjunct_island-v0-res.json | 2 +- ..._anaphor_gender_agreement-v0-loglikelihood | 2 +- ...blimp_anaphor_gender_agreement-v0-res.json | 2 +- ..._anaphor_number_agreement-v0-loglikelihood | 2 +- ...blimp_anaphor_number_agreement-v0-res.json | 2 +- ...p_animate_subject_passive-v0-loglikelihood | 2 +- 
.../blimp_animate_subject_passive-v0-res.json | 2 +- ...imp_animate_subject_trans-v0-loglikelihood | 2 +- .../blimp_animate_subject_trans-v0-res.json | 2 +- .../testdata/blimp_causative-v0-loglikelihood | 2 +- .../testdata/blimp_causative-v0-res.json | 2 +- .../blimp_complex_NP_island-v0-loglikelihood | 2 +- .../blimp_complex_NP_island-v0-res.json | 2 +- ...raint_complex_left_branch-v0-loglikelihood | 2 +- ...constraint_complex_left_branch-v0-res.json | 2 +- ...straint_object_extraction-v0-loglikelihood | 2 +- ...e_constraint_object_extraction-v0-res.json | 2 +- ...terminer_noun_agreement_1-v0-loglikelihood | 2 +- ...mp_determiner_noun_agreement_1-v0-res.json | 2 +- ...terminer_noun_agreement_2-v0-loglikelihood | 2 +- ...mp_determiner_noun_agreement_2-v0-res.json | 2 +- ...oun_agreement_irregular_1-v0-loglikelihood | 2 +- ...ner_noun_agreement_irregular_1-v0-res.json | 2 +- ...oun_agreement_irregular_2-v0-loglikelihood | 2 +- ...ner_noun_agreement_irregular_2-v0-res.json | 2 +- ...noun_agreement_with_adj_2-v0-loglikelihood | 2 +- ...iner_noun_agreement_with_adj_2-v0-res.json | 2 +- ...ment_with_adj_irregular_1-v0-loglikelihood | 2 +- ...agreement_with_adj_irregular_1-v0-res.json | 2 +- ...ment_with_adj_irregular_2-v0-loglikelihood | 2 +- ...agreement_with_adj_irregular_2-v0-res.json | 2 +- ...greement_with_adjective_1-v0-loglikelihood | 2 +- ...oun_agreement_with_adjective_1-v0-res.json | 2 +- ...agreement_relational_noun-v0-loglikelihood | 2 +- ...ctor_agreement_relational_noun-v0-res.json | 2 +- ...agreement_relative_clause-v0-loglikelihood | 2 +- ...ctor_agreement_relative_clause-v0-res.json | 2 +- .../blimp_drop_argument-v0-loglikelihood | 2 +- .../testdata/blimp_drop_argument-v0-res.json | 2 +- .../blimp_ellipsis_n_bar_1-v0-loglikelihood | 2 +- .../blimp_ellipsis_n_bar_1-v0-res.json | 2 +- .../blimp_ellipsis_n_bar_2-v0-loglikelihood | 2 +- .../blimp_ellipsis_n_bar_2-v0-res.json | 2 +- ...tial_there_object_raising-v0-loglikelihood | 2 +- ...istential_there_object_raising-v0-res.json | 2 +- ...ntial_there_quantifiers_1-v0-loglikelihood | 2 +- ...xistential_there_quantifiers_1-v0-res.json | 2 +- ...ntial_there_quantifiers_2-v0-loglikelihood | 2 +- ...xistential_there_quantifiers_2-v0-res.json | 2 +- ...ial_there_subject_raising-v0-loglikelihood | 2 +- ...stential_there_subject_raising-v0-res.json | 2 +- ...pletive_it_object_raising-v0-loglikelihood | 2 +- ...mp_expletive_it_object_raising-v0-res.json | 2 +- .../blimp_inchoative-v0-loglikelihood | 2 +- .../testdata/blimp_inchoative-v0-res.json | 2 +- .../blimp_intransitive-v0-loglikelihood | 2 +- .../testdata/blimp_intransitive-v0-res.json | 2 +- ...ast_participle_adjectives-v0-loglikelihood | 2 +- ...lar_past_participle_adjectives-v0-res.json | 2 +- ...lar_past_participle_verbs-v0-loglikelihood | 2 +- ...rregular_past_participle_verbs-v0-res.json | 2 +- ..._subject_verb_agreement_1-v0-loglikelihood | 2 +- ...lural_subject_verb_agreement_1-v0-res.json | 2 +- ..._subject_verb_agreement_2-v0-loglikelihood | 2 +- ...lural_subject_verb_agreement_2-v0-res.json | 2 +- ...anch_island_echo_question-v0-loglikelihood | 2 +- ...ft_branch_island_echo_question-v0-res.json | 2 +- ...ch_island_simple_question-v0-loglikelihood | 2 +- ..._branch_island_simple_question-v0-res.json | 2 +- ...tion_npi_licensor_present-v0-loglikelihood | 2 +- ..._question_npi_licensor_present-v0-res.json | 2 +- .../blimp_npi_present_1-v0-loglikelihood | 2 +- .../testdata/blimp_npi_present_1-v0-res.json | 2 +- .../blimp_npi_present_2-v0-loglikelihood | 2 +- 
.../testdata/blimp_npi_present_2-v0-res.json | 2 +- ...only_npi_licensor_present-v0-loglikelihood | 2 +- ...limp_only_npi_licensor_present-v0-res.json | 2 +- .../blimp_only_npi_scope-v0-loglikelihood | 2 +- .../testdata/blimp_only_npi_scope-v0-res.json | 2 +- .../testdata/blimp_passive_1-v0-loglikelihood | 2 +- .../testdata/blimp_passive_1-v0-res.json | 2 +- .../testdata/blimp_passive_2-v0-loglikelihood | 2 +- .../testdata/blimp_passive_2-v0-res.json | 2 +- ...imp_principle_A_c_command-v0-loglikelihood | 2 +- .../blimp_principle_A_c_command-v0-res.json | 2 +- .../blimp_principle_A_case_1-v0-loglikelihood | 2 +- .../blimp_principle_A_case_1-v0-res.json | 2 +- .../blimp_principle_A_case_2-v0-loglikelihood | 2 +- .../blimp_principle_A_case_2-v0-res.json | 2 +- ...limp_principle_A_domain_1-v0-loglikelihood | 2 +- .../blimp_principle_A_domain_1-v0-res.json | 2 +- ...limp_principle_A_domain_2-v0-loglikelihood | 2 +- .../blimp_principle_A_domain_2-v0-res.json | 2 +- ...limp_principle_A_domain_3-v0-loglikelihood | 2 +- .../blimp_principle_A_domain_3-v0-res.json | 2 +- ...rinciple_A_reconstruction-v0-loglikelihood | 2 +- ...imp_principle_A_reconstruction-v0-res.json | 2 +- ..._subject_verb_agreement_1-v0-loglikelihood | 2 +- ...lural_subject_verb_agreement_1-v0-res.json | 2 +- ..._subject_verb_agreement_2-v0-loglikelihood | 2 +- ...lural_subject_verb_agreement_2-v0-res.json | 2 +- ...tion_npi_licensor_present-v0-loglikelihood | 2 +- ..._negation_npi_licensor_present-v0-res.json | 2 +- ...ential_negation_npi_scope-v0-loglikelihood | 2 +- ..._sentential_negation_npi_scope-v0-res.json | 2 +- ...sentential_subject_island-v0-loglikelihood | 2 +- ...limp_sentential_subject_island-v0-res.json | 2 +- ...superlative_quantifiers_1-v0-loglikelihood | 2 +- ...limp_superlative_quantifiers_1-v0-res.json | 2 +- ...superlative_quantifiers_2-v0-loglikelihood | 2 +- ...limp_superlative_quantifiers_2-v0-res.json | 2 +- .../blimp_tough_vs_raising_1-v0-loglikelihood | 2 +- .../blimp_tough_vs_raising_1-v0-res.json | 2 +- .../blimp_tough_vs_raising_2-v0-loglikelihood | 2 +- .../blimp_tough_vs_raising_2-v0-res.json | 2 +- .../blimp_transitive-v0-loglikelihood | 2 +- .../testdata/blimp_transitive-v0-res.json | 2 +- .../testdata/blimp_wh_island-v0-loglikelihood | 2 +- .../testdata/blimp_wh_island-v0-res.json | 2 +- ...p_wh_questions_object_gap-v0-loglikelihood | 2 +- .../blimp_wh_questions_object_gap-v0-res.json | 2 +- ..._wh_questions_subject_gap-v0-loglikelihood | 2 +- ...blimp_wh_questions_subject_gap-v0-res.json | 2 +- ...subject_gap_long_distance-v0-loglikelihood | 2 +- ...ions_subject_gap_long_distance-v0-res.json | 2 +- .../blimp_wh_vs_that_no_gap-v0-loglikelihood | 2 +- .../blimp_wh_vs_that_no_gap-v0-res.json | 2 +- ...that_no_gap_long_distance-v0-loglikelihood | 2 +- ...h_vs_that_no_gap_long_distance-v0-res.json | 2 +- ...blimp_wh_vs_that_with_gap-v0-loglikelihood | 2 +- .../blimp_wh_vs_that_with_gap-v0-res.json | 2 +- ...at_with_gap_long_distance-v0-loglikelihood | 2 +- ...vs_that_with_gap_long_distance-v0-res.json | 2 +- 732 files changed, 4180 insertions(+), 3430 deletions(-) create mode 100644 .flake8 create mode 100644 .github/workflows/pull_request.yml create mode 100644 .pre-commit-config.yaml diff --git a/.coveragerc b/.coveragerc index a514900778..1248476304 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,7 +1,7 @@ [run] # tasks that aren't wired up. 
-omit = +omit = lm_eval/tasks/quac.py lm_eval/tasks/storycloze.py lm_eval/tasks/cbt.py @@ -25,4 +25,4 @@ exclude_lines = # Don't complain if tests don't hit defensive assertion code: raise AssertionError raise NotImplementedError - return NotImplemented \ No newline at end of file + return NotImplemented diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000000..73f6455d13 --- /dev/null +++ b/.flake8 @@ -0,0 +1,5 @@ +[flake8] +ignore = E203, E266, E501, W503, F403, F401, C901 +max-line-length = 127 +max-complexity = 10 +select = B,C,E,F,W,T4,B9 diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml new file mode 100644 index 0000000000..a657fe68d7 --- /dev/null +++ b/.github/workflows/pull_request.yml @@ -0,0 +1,13 @@ +name: Pull Request + +on: [pull_request] + +jobs: + pre-commit: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + with: + python-version: 3.8 + - uses: pre-commit/action@v2.0.3 diff --git a/.gitignore b/.gitignore index 8eb7a357a5..2e53362283 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,4 @@ env *.pyc data/ lm_cache -.idea \ No newline at end of file +.idea diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000..23f960a58d --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,42 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.1.0 + hooks: + - id: check-added-large-files + - id: check-ast + - id: check-byte-order-marker + - id: check-case-conflict + - id: check-json + - id: check-merge-conflict + - id: check-symlinks + - id: check-yaml + - id: destroyed-symlinks + - id: detect-private-key + - id: end-of-file-fixer + - id: no-commit-to-branch + - id: requirements-txt-fixer + - id: trailing-whitespace + - id: fix-byte-order-marker + exclude: docs/CNAME + - id: fix-encoding-pragma + args: [--remove] + - id: mixed-line-ending + args: [--fix=lf] + - repo: https://gitlab.com/pycqa/flake8 + rev: 3.7.9 + hooks: + - id: flake8 + - repo: https://github.com/psf/black + rev: 22.3.0 + hooks: + - id: black + language_version: python3.8 + - repo: https://github.com/codespell-project/codespell + rev: v2.1.0 + hooks: + - id: codespell + args: [ + "--ignore-words-list=reord", # Word used in error messages that need rewording + --check-filenames, + --check-hidden, + ] diff --git a/README.md b/README.md index 828e3e3630..78466280da 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ ![](https://github.com/EleutherAI/lm-evaluation-harness/workflows/Build/badge.svg) [![codecov](https://codecov.io/gh/EleutherAI/lm-evaluation-harness/branch/master/graph/badge.svg?token=JSG3O2427J)](https://codecov.io/gh/EleutherAI/lm-evaluation-harness) -## Overview +## Overview This project provides a unified framework to test autoregressive language models (GPT-2, GPT-3, GPTNeo, etc) on a large number of different evaluation tasks. @@ -403,7 +403,7 @@ the ngram files and info.json. See the above guide for ngram generation for the python main.py \ --model gpt2 \ --device 0 \ - --tasks sciq \ + --tasks sciq \ --decontamination_ngrams_path path/containing/training/set/ngrams ``` @@ -420,9 +420,9 @@ Both LMs (`lm_eval.models`) and Tasks (`lm_eval.tasks`) are kept in a registry d The [GPT-3 Evaluations Project](https://github.com/EleutherAI/lm_evaluation_harness/projects/1) tracks our progress implementing new tasks. Right now, we are focused on getting all the datasets loaded so that we can dedupe against the training data. 
Implementing the actual evaluations is nice but not necessary at the current moment. -### Task Versioning +### Task Versioning -To help improve reproducibility, all tasks have a VERSION field. When run from the command line, this is reported in a column in the table, or in the "version" field in the evaluator return dict. The purpose of the version is so that if the task definition changes (i.e to fix a bug), then we can know exactly which metrics were computed using the old buggy implementation to avoid unfair comparisons. To enforce this, there are unit tests that make sure the behavior of all tests remains the same as when they were first implemented. Task versions start at 0, and each time a breaking change is made, the version is incremented by one. +To help improve reproducibility, all tasks have a VERSION field. When run from the command line, this is reported in a column in the table, or in the "version" field in the evaluator return dict. The purpose of the version is that, if the task definition changes (i.e., to fix a bug), we know exactly which metrics were computed using the old, buggy implementation, to avoid unfair comparisons. To enforce this, there are unit tests that make sure the behavior of all tasks remains the same as when they were first implemented. Task versions start at 0, and each time a breaking change is made, the version is incremented by one. When reporting eval harness results, please also report the version of each task. This can be done either with a separate column in the table, or by reporting the task name with the version appended as such: taskname-v0. diff --git a/docs/decontamination.md b/docs/decontamination.md index e60120f6ba..a4214af89f 100644 --- a/docs/decontamination.md +++ b/docs/decontamination.md @@ -22,14 +22,14 @@ The basis for our decontamination procedure can be found in Appendix C of "Langu ## Implementation -Contamination detection can be found in "lm_eval/decontaminate.py" with supporting code in "lm_eval/decontamination/". +Contamination detection can be found in "lm_eval/decontaminate.py" with supporting code in "lm_eval/decontamination/". decontaminate.py does the following: 1. Build dictionaries of all ngrams and their corresponding evaluation/document ids. 2. Scan through sorted files containing training set n-grams. 3. If a match is found, the corresponding evaluation/document combinations are marked as contaminated. -"lm_eval/evaluator.py" can then produce a clean version of the benchmark by excluding the results of contaminated documents. For each metric, a clean version will be shown in the results with a "decontaminate" suffix. +"lm_eval/evaluator.py" can then produce a clean version of the benchmark by excluding the results of contaminated documents. For each metric, a clean version will be shown in the results with a "decontaminate" suffix. This is disabled by default for new tasks; to support decontamination on a task, override the "should_decontaminate" and "doc_to_decontamination_query" methods. For more details see the [task guide](task_guide.md). @@ -73,4 +73,3 @@ python -m scripts/clean_training_data/compress_and_package \ ``` Congratulations! The final directory can now be passed to lm-evaluation-harness with the "--decontamination_ngrams_path" argument.
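Returning to the headline change in this patch: the `.pre-commit-config.yaml` and pull-request workflow added above mean every PR is checked by the same hooks in CI, so contributors will want to run them locally before pushing. A minimal usage sketch, assuming `pre-commit` is installed from PyPI (the repo's own dev-setup steps may differ):

```sh
# one-time setup: install the tool and register the git hook in this clone
pip install pre-commit
pre-commit install

# run every configured hook (flake8, black, codespell, ...) against all files,
# which is what the pre-commit/action step in the new workflow does in CI
pre-commit run --all-files
```

Once installed, the hooks run automatically on each `git commit` and block the commit if a check fails.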
- diff --git a/docs/task_guide.md b/docs/task_guide.md index 1d0e9478e8..223a7c3713 100644 --- a/docs/task_guide.md +++ b/docs/task_guide.md @@ -16,7 +16,7 @@ pip install -e ".[dev]" ## Creating Your Task File -From the `lm-evaluation-harness` project root, copy over the `new_task.py` template to `lm_eval/datasets`. +From the `lm-evaluation-harness` project root, copy over the `new_task.py` template to `lm_eval/tasks`. ```sh cp templates/new_task.py lm_eval/tasks/<task-name>.py ``` @@ -52,7 +52,7 @@ For example, take the QuAC dataset. We have: QuAC: Question Answering in Context https://arxiv.org/abs/1808.07036 -Question Answering in Context (QuAC) is a dataset for modeling, understanding, and +Question Answering in Context (QuAC) is a dataset for modeling, understanding, and participating in information seeking dialog. Data instances consist of an interactive dialog between two crowd workers: (1) a student who poses a sequence of freeform questions to learn as much as possible about a hidden Wikipedia text, and (2) @@ -72,7 +72,7 @@ Now let's walk through the actual implementation - from data handling to evaluat ### Downloading your Data All data downloading and management is handled through the HuggingFace (**HF**) [`datasets`](https://github.com/huggingface/datasets) API. So, the first thing you should do is check to see if your task's dataset is already provided in their catalog [here](https://huggingface.co/datasets). If it's not in there, please consider adding it to their Hub to make it accessible to a wider user base by following their [new dataset guide](https://github.com/huggingface/datasets/blob/master/ADD_NEW_DATASET.md) -. +. Now that you have your HF dataset, you need to assign its path and name to your `Task` in the following fields: ```python @@ -116,7 +116,7 @@ These should return a Python iterable (`list` or `generator`) of `dict`s that ca #### Processing Documents -At this point, you can also process each individual document to, for example, strip whitespace or "detokenize" its fields. Put the processing logic into `_process_doc` and map the functions across training/validation/test docs inside of the respective functions. +At this point, you can also process each individual document to, for example, strip whitespace or "detokenize" its fields. Put the processing logic into `_process_doc` and map it across training/validation/test docs inside of the respective functions. 🔠 If your task is **multiple-choice**, we require you to format your documents such that they contain `gold` and `choices` fields. They can also have other fields, but those will be ignored by `MultipleChoiceTask`. `choices` should be a list of possible continuations, and `gold` should be an integer specifying the index of the correct completion. See [this task](https://github.com/EleutherAI/lm-evaluation-harness/blob/6caa0afd96a7a7efb2ec4c1f24ad1756e48f3aa7/lm_eval/tasks/sat.py#L60) for an example. 🔠 @@ -154,7 +154,7 @@ Finally, be aware that the strings from `doc_to_text` and `doc_to_target` will b ### Decontamination For background on decontamination please see [this](./decontamination.md). -If you wish to support decontamination studies for your task simply override the "should_decontaminate" method and return true. +If you wish to support decontamination studies for your task, simply override the "should_decontaminate" method and return True. You also need to override "doc_to_decontamination_query" and return the data you wish to compare against the training set.
This doesn't necessarily need to be the full document or request, and we leave this up to the implementor. For a multiple-choice evaluation you could, for example, just return the question. @@ -172,7 +172,7 @@ python -m scripts.write_out \ --tasks <task-name> \ --sets <train|val|test> \ --num_fewshot K \ - --num_examples N \ + --num_examples N \ --description_dict_path <path> ``` @@ -199,9 +199,9 @@ def construct_requests(self, doc, ctx): """ return ... ``` -#### What's a `Request`? What's a `doc`? +#### What's a `Request`? What's a `doc`? To reiterate, a `doc` is just a `Dict` object that contains information about a document from your corpus. It can contain things like a prompt, question type information, answers and anything else you think will be needed in order to assess your model for a given task. Keep in mind that the fields of this can be basically whatever you want (you can sort this out in `training_docs` \ `validation_docs` \ `test_docs` if you need to customise things - see above), just remember to be consistent with them throughout the rest of the `Task` you write up. -A `Request` is an object that takes the text prompt you want to present to a model and computes one of a few different types of response. These are evaluated lazily (meaning, only when the result is actually needed). If your task requires generating text you'll need to return a `rf.greedy_until` request otherwise an `rf.loglikelihood` across all labels in a classification tasks will do. +A `Request` is an object that takes the text prompt you want to present to a model and computes one of a few different types of response. These are evaluated lazily (meaning, only when the result is actually needed). If your task requires generating text, you'll need to return an `rf.greedy_until` request; otherwise, an `rf.loglikelihood` across all labels in a classification task will do. The function `construct_requests` can return a list of `Request`s or an iterable; it's perfectly fine to `yield` them from a generator. This is particularly handy if you are creating more than one request per `doc` (usually because you're up to something like multi-task learning). The objects this function returns then get consumed one by one and turned into result objects. @@ -232,7 +232,7 @@ def aggregation(self): ``` In `process_results`, model outputs are converted into metrics. These metrics are computed per document, however; the `aggregation` function is used to work out how to combine them into a corpus-level metric. Imagine you have a bunch of documents, for each of which you have calculated an F1 score. What should that mean overall? Should they be summed, averaged, the min/max found? This function handles that problem. -The contents of the function itself are pretty straightforward; it should simply return a dict that maps from each metric label that could be returned by `process_results` to a function that can be used to aggregate that metric. That is to say, if the metrics that `process_results` could return are given by `{'a', 'b', 'c'}`, then all of these keys should be present in the dict returned by `aggregation`. +The contents of the function itself are pretty straightforward; it should simply return a dict that maps from each metric label that could be returned by `process_results` to a function that can be used to aggregate that metric. That is to say, if the metrics that `process_results` could return are given by `{'a', 'b', 'c'}`, then all of these keys should be present in the dict returned by `aggregation`.
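To make the `construct_requests` / `process_results` / `aggregation` flow concrete, here is a minimal sketch of a multiple-choice-style task. It mirrors the `MultipleChoiceTask` implementation visible in the `lm_eval/base.py` diff below; the class name `MyChoiceTask` is illustrative, and the data-loading hooks (`doc_to_text`, `doc_to_target`, `*_docs`) are omitted:

```python
import numpy as np

from lm_eval.base import Task, rf
from lm_eval.metrics import mean


class MyChoiceTask(Task):  # illustrative name; a real task also defines data hooks
    VERSION = 0

    def construct_requests(self, doc, ctx):
        # one lazily-evaluated loglikelihood Request per candidate answer
        return [rf.loglikelihood(ctx, " " + choice)[0] for choice in doc["choices"]]

    def process_results(self, doc, results):
        # per-document metric: 1.0 if the gold answer got the highest log-likelihood
        return {"acc": 1.0 if np.argmax(results) == doc["gold"] else 0.0}

    def aggregation(self):
        # corpus-level: average the per-document "acc" values
        return {"acc": mean}

    def higher_is_better(self):
        return {"acc": True}
```

In practice, a task whose docs carry `choices` and `gold` fields would simply subclass `MultipleChoiceTask` and inherit all four of these methods, as the `lm_eval/base.py` diff below shows.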
__NOTE__: See `lm_eval/metrics.py` for a few "built-in" aggregate metrics you can easily import. The standard metrics available in this package are generally based on `sklearn` functions, so if you are in any doubt about how to set things up, the documentation over there can be of assistance. If you need to write a custom metric for some reason, start by looking at the existing ones in `lm_eval/metrics.py` for an idea of what the function signature needs to be. ```python diff --git a/lm_eval/base.py b/lm_eval/base.py index 32a54a4fa8..73fcd110da 100644 --- a/lm_eval/base.py +++ b/lm_eval/base.py @@ -24,17 +24,17 @@ def __init__(self): @abstractmethod def loglikelihood(self, requests): """Compute log-likelihood of generating a continuation from a context. - Downstream tasks should attempt to use loglikelihood instead of other + Downstream tasks should attempt to use loglikelihood instead of other LM calls whenever possible. :param requests: list A list of pairs (context, continuation) context: str - Context string. Implementations of LM must be able to handle an + Context string. Implementations of LM must be able to handle an empty context string. continuation: str - The continuation over which log likelihood will be calculated. If - there is a word boundary, the space should be in the continuation. + The continuation over which log likelihood will be calculated. If + there is a word boundary, the space should be in the continuation. For example, context="hello" continuation=" world" is correct. :return: list A list of pairs (logprob, isgreedy) @@ -97,7 +97,7 @@ def greedy_until(self, requests): context: str Context string until: [str] - The string sequences to generate until. These string sequences + The string sequences to generate until. These string sequences may each span across multiple tokens, or may be part of one token.
:return: list A list of strings continuation @@ -118,7 +118,6 @@ def set_cache_hook(self, cache_hook): class BaseLM(LM): - @property @abstractmethod def eot_token_id(self): @@ -145,13 +144,16 @@ def device(self): pass @abstractmethod - def tok_encode(self, string: str): pass - + def tok_encode(self, string: str): + pass + @abstractmethod - def tok_decode(self, tokens: Iterable[int]): pass + def tok_decode(self, tokens: Iterable[int]): + pass @abstractmethod - def _model_generate(self, context, max_length, eos_token_id): pass + def _model_generate(self, context, max_length, eos_token_id): + pass @abstractmethod def _model_call(self, inps): @@ -187,23 +189,30 @@ def loglikelihood_rolling(self, requests): # TODO: automatic batch size detection for vectorization loglikelihoods = [] - for string, in tqdm(requests): - rolling_token_windows = list(map(utils.make_disjoint_window, utils.get_rolling_token_windows( - token_list=self.tok_encode(string), - prefix_token=self.eot_token_id, - max_seq_len=self.max_length, - context_len=1, - ))) + for (string,) in tqdm(requests): + rolling_token_windows = list( + map( + utils.make_disjoint_window, + utils.get_rolling_token_windows( + token_list=self.tok_encode(string), + prefix_token=self.eot_token_id, + max_seq_len=self.max_length, + context_len=1, + ), + ) + ) rolling_token_windows = [(None,) + x for x in rolling_token_windows] # TODO: extract out this call so it only gets called once and also somehow figure out partial caching for # that - string_nll = self._loglikelihood_tokens(rolling_token_windows, disable_tqdm=True) - + string_nll = self._loglikelihood_tokens( + rolling_token_windows, disable_tqdm=True + ) + # discard is_greedy string_nll = [x[0] for x in string_nll] - + string_nll = sum(string_nll) loglikelihoods.append(string_nll) @@ -223,10 +232,12 @@ def _collate(x): toks = x[1] + x[2] return -len(toks), tuple(toks) - + # TODO: automatic (variable) batch size detection for vectorization reord = utils.Reorderer(requests, _collate) - for chunk in utils.chunks(tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size): + for chunk in utils.chunks( + tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size + ): inps = [] cont_toks_list = [] inplens = [] @@ -252,44 +263,60 @@ def _collate(x): # when too long to fit in context, truncate from the left inp = torch.tensor( - (context_enc + continuation_enc)[-(self.max_length+1):][:-1], - dtype=torch.long + (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1], + dtype=torch.long, ).to(self.device) - inplen, = inp.shape + (inplen,) = inp.shape cont = continuation_enc # since in _collate we make sure length is descending, the longest is always the first one. 
- padding_length = padding_length if padding_length is not None else inplen + padding_length = ( + padding_length if padding_length is not None else inplen + ) # pad length from seq to padding_length - inp = torch.cat([ - inp, # [seq] - torch.zeros(padding_length - inplen, dtype=torch.long).to(inp.device) # [padding_length - seq] - ], dim=0) + inp = torch.cat( + [ + inp, # [seq] + torch.zeros(padding_length - inplen, dtype=torch.long).to( + inp.device + ), # [padding_length - seq] + ], + dim=0, + ) inps.append(inp.unsqueeze(0)) # [1, padding_length] cont_toks_list.append(cont) inplens.append(inplen) batched_inps = torch.cat(inps, dim=0) # [batch, padding_length - multi_logits = F.log_softmax(self._model_call(batched_inps), dim=-1).cpu() # [batch, padding_length, vocab] + multi_logits = F.log_softmax( + self._model_call(batched_inps), dim=-1 + ).cpu() # [batch, padding_length, vocab] - for (cache_key, _, _), logits, inp, inplen, cont_toks \ - in zip(chunk, multi_logits, inps, inplens, cont_toks_list): + for (cache_key, _, _), logits, inp, inplen, cont_toks in zip( + chunk, multi_logits, inps, inplens, cont_toks_list + ): # Slice to original seq length contlen = len(cont_toks) - logits = logits[inplen-contlen:inplen].unsqueeze(0) # [1, seq, vocab] + logits = logits[inplen - contlen : inplen].unsqueeze( + 0 + ) # [1, seq, vocab] # Check if per-token argmax is exactly equal to continuation greedy_tokens = logits.argmax(dim=-1) - cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(0) # [1, seq] + cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze( + 0 + ) # [1, seq] max_equal = (greedy_tokens == cont_toks).all() # Obtain log-probs at the corresponding continuation token indices # last_token_slice = logits[:, -1, :].squeeze(0).tolist() - logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [1, seq] + logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze( + -1 + ) # [1, seq] # Answer: (log prob, is-exact-match) answer = (float(logits.sum()), bool(max_equal)) @@ -301,9 +328,9 @@ def _collate(x): res.append(answer) return reord.get_original(res) - + def greedy_until(self, requests): - # TODO: implement fully general `until` that handles untils that are + # TODO: implement fully general `until` that handles untils that are # multiple tokens or that span multiple tokens correctly # TODO: extract to TokenizedLM? 
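The `_loglikelihood_tokens` hunks above are pure black re-wrapping; the scoring behavior is unchanged: log-softmax the model logits, check whether greedy argmax decoding reproduces the continuation exactly, and gather and sum the log-probabilities of the actual continuation tokens. A self-contained illustration of that step on dummy tensors (shapes follow the inline comments above; this is not the harness's own code):

```python
import torch
import torch.nn.functional as F

logits = torch.randn(1, 4, 10)            # [1, seq, vocab], stand-in for _model_call output
cont_toks = torch.tensor([[2, 5, 7, 1]])  # [1, seq], token ids of the continuation

logprobs = F.log_softmax(logits, dim=-1)  # normalize to log-probabilities
# per-token greedy match: does argmax decoding reproduce the continuation exactly?
max_equal = bool((logprobs.argmax(dim=-1) == cont_toks).all())
# log-prob assigned to each actual continuation token, summed over the sequence
ll = float(torch.gather(logprobs, 2, cont_toks.unsqueeze(-1)).squeeze(-1).sum())
answer = (ll, max_equal)  # the same (log prob, is-exact-match) pair appended to res above
```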
@@ -312,29 +339,33 @@ def greedy_until(self, requests): def _collate(x): toks = self.tok_encode(x[0]) return len(toks), x[0] - + reord = utils.Reorderer(requests, _collate) for context, until in tqdm(reord.get_reordered()): if isinstance(until, str): until = [until] - primary_until, = self.tok_encode(until[0]) - - context_enc = torch.tensor([self.tok_encode(context)[self.max_gen_toks - self.max_length:]]).to(self.device) + (primary_until,) = self.tok_encode(until[0]) - cont = self._model_generate(context_enc, context_enc.shape[1] + self.max_gen_toks, primary_until) + context_enc = torch.tensor( + [self.tok_encode(context)[self.max_gen_toks - self.max_length :]] + ).to(self.device) - s = self.tok_decode(cont[0].tolist()[context_enc.shape[1]:]) + cont = self._model_generate( + context_enc, context_enc.shape[1] + self.max_gen_toks, primary_until + ) + + s = self.tok_decode(cont[0].tolist()[context_enc.shape[1] :]) for term in until: s = s.split(term)[0] - + # partial caching self.cache_hook.add_partial("greedy_until", (context, until), s) - + res.append(s) - + return reord.get_original(res) @@ -383,7 +414,7 @@ def __init__(self, data_dir=None, cache_dir=None, download_mode=None): self._fewshot_docs = None def download(self, data_dir=None, cache_dir=None, download_mode=None): - """ Downloads and returns the task dataset. + """Downloads and returns the task dataset. Override this method to download the dataset from a custom API. :param data_dir: str @@ -412,7 +443,7 @@ def download(self, data_dir=None, cache_dir=None, download_mode=None): name=self.DATASET_NAME, data_dir=data_dir, cache_dir=cache_dir, - download_mode=download_mode + download_mode=download_mode, ) def should_decontaminate(self): @@ -473,8 +504,10 @@ def fewshot_examples(self, k, rnd): return rnd.sample(self._training_docs, k) def doc_to_decontamination_query(self, doc): - print("Override doc_to_decontamination_query with document specific decontamination query.") - assert(False) + print( + "Override doc_to_decontamination_query with document specific decontamination query." + ) + assert False @abstractmethod def doc_to_text(self, doc): @@ -486,22 +519,22 @@ def doc_to_target(self, doc): @abstractmethod def construct_requests(self, doc, ctx): - """ Uses RequestFactory to construct Requests and returns an iterable of + """Uses RequestFactory to construct Requests and returns an iterable of Requests which will be sent to the LM. :param doc: The document as returned from training_docs, validation_docs, or test_docs. :param ctx: str - The context string, generated by fewshot_context. This includes the natural + The context string, generated by fewshot_context. This includes the natural language description, as well as the few shot examples, and the question - part of the document for `doc`. + part of the document for `doc`. 
""" pass @abstractmethod def process_results(self, doc, results): - """Take a single document and the LM results and evaluates, returning a - dict where keys are the names of submetrics and values are the values of + """Take a single document and the LM results and evaluates, returning a + dict where keys are the names of submetrics and values are the values of the metric for that one document :param doc: @@ -515,7 +548,7 @@ def process_results(self, doc, results): def aggregation(self): """ :returns: {str: [metric_score] -> float} - A dictionary where keys are the names of submetrics and values are + A dictionary where keys are the names of submetrics and values are functions that aggregate a list of metric scores """ pass @@ -524,22 +557,26 @@ def aggregation(self): def higher_is_better(self): """ :returns: {str: bool} - A dictionary where keys are the names of submetrics and values are + A dictionary where keys are the names of submetrics and values are whether a higher value of the submetric is better """ pass def fewshot_description(self): import warnings + warnings.warn( "`fewshot_description` will be removed in futures versions. Pass " "any custom descriptions to the `evaluate` function instead.", - DeprecationWarning) + DeprecationWarning, + ) return "" @utils.positional_deprecated - def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None): - """ Returns a fewshot context string that is made up of a prepended description + def fewshot_context( + self, doc, num_fewshot, provide_description=None, rnd=None, description=None + ): + """Returns a fewshot context string that is made up of a prepended description (if provided), the `num_fewshot` number of examples, and an appended prompt example. :param doc: str @@ -556,7 +593,9 @@ def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, :returns: str The fewshot context. """ - assert rnd is not None, "A `random.Random` generator argument must be provided to `rnd`" + assert ( + rnd is not None + ), "A `random.Random` generator argument must be provided to `rnd`" assert not provide_description, ( "The `provide_description` arg will be removed in future versions. 
            "a custom description to the context, supply the corresponding string via the "
@@ -564,7 +603,9 @@ def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None,
         )
         if provide_description is not None:
             # nudge people to not specify it at all
-            print("WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict")
+            print(
+                "WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict"
+            )

         description = description + "\n\n" if description else ""

@@ -577,7 +618,9 @@ def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None,
         else:
             if self._fewshot_docs is None:
                 self._fewshot_docs = list(
-                    self.validation_docs() if self.has_validation_docs() else self.test_docs()
+                    self.validation_docs()
+                    if self.has_validation_docs()
+                    else self.test_docs()
                 )

             fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1)
@@ -585,23 +628,27 @@ def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None,
             # get rid of the doc that's the one we're evaluating, if it's in the fewshot
             fewshotex = [x for x in fewshotex if x != doc][:num_fewshot]

-        labeled_examples = "\n\n".join(
-            [self.doc_to_text(doc) + self.doc_to_target(doc) for doc in fewshotex]
-        ) + "\n\n"
+        labeled_examples = (
+            "\n\n".join(
+                [
+                    self.doc_to_text(doc) + self.doc_to_target(doc)
+                    for doc in fewshotex
+                ]
+            )
+            + "\n\n"
+        )

         example = self.doc_to_text(doc)
         return description + labeled_examples + example


 class MultipleChoiceTask(Task):
-
     def doc_to_target(self, doc):
-        return " " + doc['choices'][doc['gold']]
+        return " " + doc["choices"][doc["gold"]]

     def construct_requests(self, doc, ctx):
         lls = [
-            rf.loglikelihood(ctx, " {}".format(choice))[0]
-            for choice in doc['choices']
+            rf.loglikelihood(ctx, " {}".format(choice))[0] for choice in doc["choices"]
         ]

         return lls
@@ -609,21 +656,21 @@ def construct_requests(self, doc, ctx):
     def process_results(self, doc, results):
         gold = doc["gold"]

-        acc = 1. if np.argmax(results) == gold else 0.
+        acc = 1.0 if np.argmax(results) == gold else 0.0
         completion_len = np.array([float(len(i)) for i in doc["choices"]])
-        acc_norm = 1. if np.argmax(results / completion_len) == gold else 0.
+        acc_norm = 1.0 if np.argmax(results / completion_len) == gold else 0.0

         return {
             "acc": acc,
             "acc_norm": acc_norm,
         }
-
+
     def higher_is_better(self):
         return {
             "acc": True,
             "acc_norm": True,
         }
-
+
     def aggregation(self):
         return {
             "acc": mean,
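The two multiple-choice metrics above differ only in length normalization: `acc` takes the argmax of the raw summed log-likelihoods, while `acc_norm` first divides each by the character length of its choice string, so long answers are not penalized simply for having more tokens to pay for. A worked example with assumed toy numbers:

    import numpy as np

    # Assumed numbers: choice 1 is the gold answer and is four times longer.
    results = np.array([-6.0, -8.0])         # summed log-likelihood per choice
    completion_len = np.array([10.0, 40.0])  # len() of each choice string
    gold = 1

    acc = 1.0 if np.argmax(results) == gold else 0.0
    acc_norm = 1.0 if np.argmax(results / completion_len) == gold else 0.0

    # Raw scores pick choice 0 (-6.0 > -8.0), so acc == 0.0; per-character
    # scores are [-0.6, -0.2] and pick the gold choice, so acc_norm == 1.0.
    print(acc, acc_norm)  # 0.0 1.0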
@@ -632,7 +679,6 @@ def aggregation(self):


 class PerplexityTask(Task, abc.ABC):
-
     def should_decontaminate(self):
         """Whether this task supports decontamination against model training set."""
         return True
@@ -644,9 +690,15 @@ def fewshot_examples(self, k, rnd):
         assert k == 0
         return []

-    def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None):
-        assert num_fewshot == 0, "The number of fewshot examples must be 0 for perplexity tasks."
-        assert rnd is not None, "A `random.Random` generator argument must be provided to `rnd`."
+    def fewshot_context(
+        self, doc, num_fewshot, provide_description=None, rnd=None, description=None
+    ):
+        assert (
+            num_fewshot == 0
+        ), "The number of fewshot examples must be 0 for perplexity tasks."
+        assert (
+            rnd is not None
+        ), "A `random.Random` generator argument must be provided to `rnd`."
         assert not provide_description, (
             "The `provide_description` arg will be removed in future versions. To prepend "
             "a custom description to the context, supply the corresponding string via the "
@@ -654,7 +706,9 @@ def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None,
         )
         if provide_description is not None:
             # nudge people to not specify it at all
-            print("WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict")
+            print(
+                "WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict"
+            )

         return ""
@@ -680,7 +734,7 @@ def construct_requests(self, doc, ctx):
         return req

     def process_results(self, doc, results):
-        loglikelihood, = results
+        (loglikelihood,) = results
         words = self.count_words(doc)
         bytes_ = self.count_bytes(doc)
         return {
@@ -702,23 +756,23 @@ def count_bytes(cls, doc):

     @classmethod
     def count_words(cls, doc):
-        """ Downstream tasks with custom word boundaries should override this! """
+        """Downstream tasks with custom word boundaries should override this!"""
         return len(re.split(r"\s+", doc))


 def hash_args(attr, args):
     dat = json.dumps([attr] + list(args))
-    return hashlib.sha256(dat.encode('utf-8')).hexdigest()
+    return hashlib.sha256(dat.encode("utf-8")).hexdigest()


 class CacheHook:
     def __init__(self, cachinglm):
-        if cachinglm is None:
+        if cachinglm is None:
             self.dbdict = None
             return

         self.dbdict = cachinglm.dbdict
-
+
     def add_partial(self, attr, req, res):
         if self.dbdict is None:
             return
@@ -748,7 +802,7 @@ def __getattr__(self, attr):
         def fn(requests):
             res = []
             remaining_reqs = []
-
+
             # figure out which ones are cached and which ones are new
             for req in requests:
                 hsh = hash_args(attr, req)
@@ -761,7 +815,7 @@ def fn(requests):
                 else:
                     res.append(None)
                     remaining_reqs.append(req)
-
+
             # actually run the LM on the requests that do not have cached results
             rem_res = getattr(self.lm, attr)(remaining_reqs)

@@ -779,41 +833,48 @@ def fn(requests):
                     self.dbdict.commit()

             return res
+
         return fn
-
+
     def get_cache_hook(self):
         return CacheHook(self)


 REQUEST_RETURN_LENGTHS = {
-    'loglikelihood': 2,
-    'greedy_until': None,
-    'loglikelihood_rolling': None,
+    "loglikelihood": 2,
+    "greedy_until": None,
+    "loglikelihood_rolling": None,
 }


 class Request:
     def __init__(self, request_type, args, index=None):
         if request_type not in REQUEST_RETURN_LENGTHS.keys():
-            raise NotImplementedError('The request type {} is not implemented!'.format(request_type))
+            raise NotImplementedError(
+                "The request type {} is not implemented!".format(request_type)
+            )

         self.request_type = request_type
         self.args = args
         self.index = index
-
+
     def __iter__(self):
         if REQUEST_RETURN_LENGTHS[self.request_type] is None:
-            raise IndexError('This request type does not return multiple arguments!')
+            raise IndexError("This request type does not return multiple arguments!")
         for i in range(REQUEST_RETURN_LENGTHS[self.request_type]):
             yield Request(self.request_type, self.args, i)
-
+
     def __getitem__(self, i):
         if REQUEST_RETURN_LENGTHS[self.request_type] is None:
-            raise IndexError('This request type does not return multiple arguments!')
+            raise IndexError("This request type does not return multiple arguments!")
         return Request(self.request_type, self.args, i)
-
+
     def __eq__(self, other):
-        return self.request_type == other.request_type and self.args == other.args and self.index == other.index
+        return (
+            self.request_type == other.request_type
+            and self.args == other.args
+            and self.index == other.index
+        )

     def __repr__(self):
         return f"Req_{self.request_type}{self.args}[{self.index}]\n"
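Request objects above are lazy handles: `rf.loglikelihood(ctx, cont)` builds a Request, and `__getitem__`/`__iter__` hand out index-tagged copies of it, which is why `MultipleChoiceTask.construct_requests` can write `rf.loglikelihood(ctx, choice)[0]` before any model has run. A stripped-down sketch of the same idea, with illustrative names rather than the real classes:

    RETURN_LENGTHS = {"loglikelihood": 2}  # (log-prob, is-greedy-exact-match)

    class LazyRequest:
        def __init__(self, kind, args, index=None):
            self.kind, self.args, self.index = kind, args, index

        def __getitem__(self, i):
            # Select one slot of the eventual result tuple, still lazily.
            return LazyRequest(self.kind, self.args, i)

        def __iter__(self):
            for i in range(RETURN_LENGTHS[self.kind]):
                yield LazyRequest(self.kind, self.args, i)

    req = LazyRequest("loglikelihood", ("ctx", " choice"))
    ll_only = req[0]  # later resolved to just the log-prob slot
    ll, exact = req   # tuple unpacking works the same way
    print(ll_only.index, ll.index, exact.index)  # 0 0 1

The index survives until the evaluator has real model outputs, at which point it picks the matching element out of each returned tuple.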
@@ -823,6 +884,7 @@ class RequestFactory:
     def __getattr__(self, attr):
         def fn(*args):
             return Request(attr, args)
+
         return fn


diff --git a/lm_eval/datasets/README.md b/lm_eval/datasets/README.md
index 6cb009a50a..61515a31b9 100644
--- a/lm_eval/datasets/README.md
+++ b/lm_eval/datasets/README.md
@@ -2,5 +2,5 @@
 This directory contains custom EleutherAI datasets not available in the HuggingFace
 `datasets` hub.

-In the rare case that you need to add a custom dataset to this collection, follow the
-HuggingFace `datasets` guide found [here](https://huggingface.co/docs/datasets/dataset_script).
\ No newline at end of file
+In the rare case that you need to add a custom dataset to this collection, follow the
+HuggingFace `datasets` guide found [here](https://huggingface.co/docs/datasets/dataset_script).
diff --git a/lm_eval/datasets/arithmetic/arithmetic.py b/lm_eval/datasets/arithmetic/arithmetic.py
index de87e0f89f..649577fcb9 100644
--- a/lm_eval/datasets/arithmetic/arithmetic.py
+++ b/lm_eval/datasets/arithmetic/arithmetic.py
@@ -68,61 +68,111 @@ class Arithmetic(datasets.GeneratorBasedBuilder):
         ArithmeticConfig(
             name="arithmetic_2da",
             url="https://raw.githubusercontent.com/openai/gpt-3/master/data/two_digit_addition.jsonl",
-            features=datasets.Features({"context": datasets.Value("string"), "completion": datasets.Value("string")}),
+            features=datasets.Features(
+                {
+                    "context": datasets.Value("string"),
+                    "completion": datasets.Value("string"),
+                }
+            ),
             description="2-digit addition",
         ),
         ArithmeticConfig(
             name="arithmetic_2ds",
             url="https://raw.githubusercontent.com/openai/gpt-3/master/data/two_digit_subtraction.jsonl",
-            features=datasets.Features({"context": datasets.Value("string"), "completion": datasets.Value("string")}),
+            features=datasets.Features(
+                {
+                    "context": datasets.Value("string"),
+                    "completion": datasets.Value("string"),
+                }
+            ),
             description="2-digit subtraction",
         ),
         ArithmeticConfig(
             name="arithmetic_3da",
             url="https://raw.githubusercontent.com/openai/gpt-3/master/data/three_digit_addition.jsonl",
-            features=datasets.Features({"context": datasets.Value("string"), "completion": datasets.Value("string")}),
+            features=datasets.Features(
+                {
+                    "context": datasets.Value("string"),
+                    "completion": datasets.Value("string"),
+                }
+            ),
             description="3-digit addition",
         ),
         ArithmeticConfig(
             name="arithmetic_3ds",
             url="https://raw.githubusercontent.com/openai/gpt-3/master/data/three_digit_subtraction.jsonl",
-            features=datasets.Features({"context": datasets.Value("string"), "completion": datasets.Value("string")}),
+            features=datasets.Features(
+                {
+                    "context": datasets.Value("string"),
+                    "completion": datasets.Value("string"),
+                }
+            ),
             description="3-digit subtraction",
         ),
         ArithmeticConfig(
             name="arithmetic_4da",
             url="https://raw.githubusercontent.com/openai/gpt-3/master/data/four_digit_addition.jsonl",
-            features=datasets.Features({"context": datasets.Value("string"), "completion": datasets.Value("string")}),
+            features=datasets.Features(
+                {
+                    "context": datasets.Value("string"),
+                    "completion": datasets.Value("string"),
+                }
+            ),
             description="4-digit addition",
         ),
         ArithmeticConfig(
             name="arithmetic_4ds",
             url="https://raw.githubusercontent.com/openai/gpt-3/master/data/four_digit_subtraction.jsonl",
-            features=datasets.Features({"context": datasets.Value("string"), "completion": datasets.Value("string")}),
+            features=datasets.Features(
+                {
+                    "context": datasets.Value("string"),
+                    "completion": datasets.Value("string"),
+                }
+            ),
             description="4-digit subtraction",
         ),
         ArithmeticConfig(
             name="arithmetic_5da",
url="https://raw.githubusercontent.com/openai/gpt-3/master/data/five_digit_addition.jsonl", - features=datasets.Features({"context": datasets.Value("string"), "completion": datasets.Value("string")}), + features=datasets.Features( + { + "context": datasets.Value("string"), + "completion": datasets.Value("string"), + } + ), description="5-digit addition", ), ArithmeticConfig( name="arithmetic_5ds", url="https://raw.githubusercontent.com/openai/gpt-3/master/data/five_digit_subtraction.jsonl", - features=datasets.Features({"context": datasets.Value("string"), "completion": datasets.Value("string")}), + features=datasets.Features( + { + "context": datasets.Value("string"), + "completion": datasets.Value("string"), + } + ), description="5-digit subtraction", ), ArithmeticConfig( name="arithmetic_2dm", url="https://raw.githubusercontent.com/openai/gpt-3/master/data/two_digit_multiplication.jsonl", - features=datasets.Features({"context": datasets.Value("string"), "completion": datasets.Value("string")}), + features=datasets.Features( + { + "context": datasets.Value("string"), + "completion": datasets.Value("string"), + } + ), description="2-digit multiplication", ), ArithmeticConfig( name="arithmetic_1dc", url="https://raw.githubusercontent.com/openai/gpt-3/master/data/single_digit_three_ops.jsonl", - features=datasets.Features({"context": datasets.Value("string"), "completion": datasets.Value("string")}), + features=datasets.Features( + { + "context": datasets.Value("string"), + "completion": datasets.Value("string"), + } + ), description="Single digit 3 operations", ), ] @@ -155,9 +205,12 @@ def _generate_examples(self, filepath, split): with open(filepath, encoding="utf-8") as f: for key, row in enumerate(f): data = json.loads(row) - context = data['context'].strip() \ - .replace('\n\n', '\n') \ - .replace('Q:', 'Question:') \ - .replace('A:', 'Answer:') - completion = data['completion'] - yield key, {'context': context, 'completion': completion} + context = ( + data["context"] + .strip() + .replace("\n\n", "\n") + .replace("Q:", "Question:") + .replace("A:", "Answer:") + ) + completion = data["completion"] + yield key, {"context": context, "completion": completion} diff --git a/lm_eval/datasets/arithmetic/dataset_infos.json b/lm_eval/datasets/arithmetic/dataset_infos.json index dcf85ceb61..fedef8de28 100644 --- a/lm_eval/datasets/arithmetic/dataset_infos.json +++ b/lm_eval/datasets/arithmetic/dataset_infos.json @@ -1 +1 @@ -{"arithmetic_2da": {"description": "A small battery of 10 tests that involve asking language models a simple arithmetic\nproblem in natural language.\n\n2-digit addition", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arithmetic", "config_name": "arithmetic_2da", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 96624, "num_examples": 2000, "dataset_name": "arithmetic"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/two_digit_addition.jsonl": {"num_bytes": 138624, "checksum": "75a54b7a3db3b23369df74fe440c23025f3d3c51f664300bd3d56632b2617b3d"}}, "download_size": 138624, "post_processing_size": null, "dataset_size": 96624, "size_in_bytes": 235248}, "arithmetic_2ds": {"description": "A small battery of 10 tests that involve asking language models a simple arithmetic\nproblem in natural language.\n\n2-digit subtraction", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arithmetic", "config_name": "arithmetic_2ds", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 98216, "num_examples": 2000, "dataset_name": "arithmetic"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/two_digit_subtraction.jsonl": {"num_bytes": 140216, "checksum": "da956066ff108c00b341d360567472784f5fd872d6465071b44a14291205bc03"}}, "download_size": 140216, "post_processing_size": null, "dataset_size": 98216, "size_in_bytes": 238432}, "arithmetic_3da": {"description": "A small battery of 10 tests that involve asking language models a simple arithmetic\nproblem in natural language.\n\n3-digit addition", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arithmetic", "config_name": "arithmetic_3da", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 102612, "num_examples": 2000, "dataset_name": "arithmetic"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/three_digit_addition.jsonl": {"num_bytes": 144612, "checksum": "124865e30efd2abfbc1855dd34c218fc02d32d780ace970ab9b4ea3fa74c798b"}}, "download_size": 144612, "post_processing_size": null, "dataset_size": 102612, "size_in_bytes": 247224}, "arithmetic_3ds": {"description": "A small battery of 10 tests that involve asking language models a simple arithmetic\nproblem in natural language.\n\n3-digit subtraction", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arithmetic", "config_name": "arithmetic_3ds", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 104150, "num_examples": 2000, "dataset_name": "arithmetic"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/three_digit_subtraction.jsonl": {"num_bytes": 146150, "checksum": "7fc6aaedcb0e2bd17c398dd4147c5585b1e608278a8e98b914e69656707d6a29"}}, "download_size": 146150, "post_processing_size": null, "dataset_size": 104150, "size_in_bytes": 250300}, "arithmetic_4da": {"description": "A small battery of 10 tests that involve asking language models a simple arithmetic\nproblem in natural language.\n\n4-digit addition", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arithmetic", "config_name": "arithmetic_4da", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 108570, "num_examples": 2000, "dataset_name": "arithmetic"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/four_digit_addition.jsonl": {"num_bytes": 150570, "checksum": "459c6f75baa2e8d7cf50bdd07db6d0ca9133a6b137d95d09267db85b6e07f391"}}, "download_size": 150570, "post_processing_size": null, "dataset_size": 108570, "size_in_bytes": 259140}, "arithmetic_4ds": {"description": "A small battery of 10 tests that involve asking language models a simple arithmetic\nproblem in natural language.\n\n4-digit subtraction", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arithmetic", "config_name": "arithmetic_4ds", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 110150, "num_examples": 2000, "dataset_name": "arithmetic"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/four_digit_subtraction.jsonl": {"num_bytes": 152150, "checksum": "0c47db40a10c052ef0cf732a9ef2edaa53d66377d43eb47a9c382d33a8af7102"}}, "download_size": 152150, "post_processing_size": null, "dataset_size": 110150, "size_in_bytes": 262300}, "arithmetic_5da": {"description": "A small battery of 10 tests that involve asking language models a simple arithmetic\nproblem in natural language.\n\n5-digit addition", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arithmetic", "config_name": "arithmetic_5da", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 114476, "num_examples": 2000, "dataset_name": "arithmetic"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/five_digit_addition.jsonl": {"num_bytes": 156476, "checksum": "30ada42efe315b958c6e9649274005d3b720e50298e92c3a2d321f8996e58f54"}}, "download_size": 156476, "post_processing_size": null, "dataset_size": 114476, "size_in_bytes": 270952}, "arithmetic_5ds": {"description": "A small battery of 10 tests that involve asking language models a simple arithmetic\nproblem in natural language.\n\n5-digit subtraction", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arithmetic", "config_name": "arithmetic_5ds", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 116119, "num_examples": 2000, "dataset_name": "arithmetic"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/five_digit_subtraction.jsonl": {"num_bytes": 158119, "checksum": "8b98ccfc943cbf9193bcf1984954aa0b1a4527016072d972a2b055cc1482ca3c"}}, "download_size": 158119, "post_processing_size": null, "dataset_size": 116119, "size_in_bytes": 274238}, "arithmetic_2dm": {"description": "A small battery of 10 tests that involve asking language models a simple arithmetic\nproblem in natural language.\n\n2-digit multiplication", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arithmetic", "config_name": "arithmetic_2dm", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 100685, "num_examples": 2000, "dataset_name": "arithmetic"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/two_digit_multiplication.jsonl": {"num_bytes": 142685, "checksum": "5613d1d1cc3b2c03edc1990252247d34c10ec82944b2cdeb19e71b00f237f431"}}, "download_size": 142685, "post_processing_size": null, "dataset_size": 100685, "size_in_bytes": 243370}, "arithmetic_1dc": {"description": "A small battery of 10 tests that involve asking language models a simple arithmetic\nproblem in natural language.\n\nSingle digit 3 operations", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arithmetic", "config_name": "arithmetic_1dc", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 97651, "num_examples": 2000, "dataset_name": "arithmetic"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/single_digit_three_ops.jsonl": {"num_bytes": 139651, "checksum": "08b34e3272a8ff1d4932d63f251519d14c485c38d582366e1e323d0b859c3925"}}, "download_size": 139651, "post_processing_size": null, "dataset_size": 97651, "size_in_bytes": 237302}} \ No newline at end of file +{"arithmetic_2da": {"description": "A small battery of 10 tests that involve asking language models a simple arithmetic\nproblem in natural language.\n\n2-digit addition", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arithmetic", "config_name": "arithmetic_2da", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 96624, "num_examples": 2000, "dataset_name": "arithmetic"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/two_digit_addition.jsonl": {"num_bytes": 138624, "checksum": "75a54b7a3db3b23369df74fe440c23025f3d3c51f664300bd3d56632b2617b3d"}}, "download_size": 138624, "post_processing_size": null, "dataset_size": 96624, "size_in_bytes": 235248}, "arithmetic_2ds": {"description": "A small battery of 10 tests that involve asking language models a simple arithmetic\nproblem in natural language.\n\n2-digit subtraction", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arithmetic", "config_name": "arithmetic_2ds", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 98216, "num_examples": 2000, "dataset_name": "arithmetic"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/two_digit_subtraction.jsonl": {"num_bytes": 140216, "checksum": "da956066ff108c00b341d360567472784f5fd872d6465071b44a14291205bc03"}}, "download_size": 140216, "post_processing_size": null, "dataset_size": 98216, "size_in_bytes": 238432}, "arithmetic_3da": {"description": "A small battery of 10 tests that involve asking language models a simple arithmetic\nproblem in natural language.\n\n3-digit addition", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arithmetic", "config_name": "arithmetic_3da", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 102612, "num_examples": 2000, "dataset_name": "arithmetic"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/three_digit_addition.jsonl": {"num_bytes": 144612, "checksum": "124865e30efd2abfbc1855dd34c218fc02d32d780ace970ab9b4ea3fa74c798b"}}, "download_size": 144612, "post_processing_size": null, "dataset_size": 102612, "size_in_bytes": 247224}, "arithmetic_3ds": {"description": "A small battery of 10 tests that involve asking language models a simple arithmetic\nproblem in natural language.\n\n3-digit subtraction", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arithmetic", "config_name": "arithmetic_3ds", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 104150, "num_examples": 2000, "dataset_name": "arithmetic"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/three_digit_subtraction.jsonl": {"num_bytes": 146150, "checksum": "7fc6aaedcb0e2bd17c398dd4147c5585b1e608278a8e98b914e69656707d6a29"}}, "download_size": 146150, "post_processing_size": null, "dataset_size": 104150, "size_in_bytes": 250300}, "arithmetic_4da": {"description": "A small battery of 10 tests that involve asking language models a simple arithmetic\nproblem in natural language.\n\n4-digit addition", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arithmetic", "config_name": "arithmetic_4da", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 108570, "num_examples": 2000, "dataset_name": "arithmetic"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/four_digit_addition.jsonl": {"num_bytes": 150570, "checksum": "459c6f75baa2e8d7cf50bdd07db6d0ca9133a6b137d95d09267db85b6e07f391"}}, "download_size": 150570, "post_processing_size": null, "dataset_size": 108570, "size_in_bytes": 259140}, "arithmetic_4ds": {"description": "A small battery of 10 tests that involve asking language models a simple arithmetic\nproblem in natural language.\n\n4-digit subtraction", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arithmetic", "config_name": "arithmetic_4ds", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 110150, "num_examples": 2000, "dataset_name": "arithmetic"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/four_digit_subtraction.jsonl": {"num_bytes": 152150, "checksum": "0c47db40a10c052ef0cf732a9ef2edaa53d66377d43eb47a9c382d33a8af7102"}}, "download_size": 152150, "post_processing_size": null, "dataset_size": 110150, "size_in_bytes": 262300}, "arithmetic_5da": {"description": "A small battery of 10 tests that involve asking language models a simple arithmetic\nproblem in natural language.\n\n5-digit addition", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arithmetic", "config_name": "arithmetic_5da", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 114476, "num_examples": 2000, "dataset_name": "arithmetic"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/five_digit_addition.jsonl": {"num_bytes": 156476, "checksum": "30ada42efe315b958c6e9649274005d3b720e50298e92c3a2d321f8996e58f54"}}, "download_size": 156476, "post_processing_size": null, "dataset_size": 114476, "size_in_bytes": 270952}, "arithmetic_5ds": {"description": "A small battery of 10 tests that involve asking language models a simple arithmetic\nproblem in natural language.\n\n5-digit subtraction", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arithmetic", "config_name": "arithmetic_5ds", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 116119, "num_examples": 2000, "dataset_name": "arithmetic"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/five_digit_subtraction.jsonl": {"num_bytes": 158119, "checksum": "8b98ccfc943cbf9193bcf1984954aa0b1a4527016072d972a2b055cc1482ca3c"}}, "download_size": 158119, "post_processing_size": null, "dataset_size": 116119, "size_in_bytes": 274238}, "arithmetic_2dm": {"description": "A small battery of 10 tests that involve asking language models a simple arithmetic\nproblem in natural language.\n\n2-digit multiplication", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arithmetic", "config_name": "arithmetic_2dm", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 100685, "num_examples": 2000, "dataset_name": "arithmetic"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/two_digit_multiplication.jsonl": {"num_bytes": 142685, "checksum": "5613d1d1cc3b2c03edc1990252247d34c10ec82944b2cdeb19e71b00f237f431"}}, "download_size": 142685, "post_processing_size": null, "dataset_size": 100685, "size_in_bytes": 243370}, "arithmetic_1dc": {"description": "A small battery of 10 tests that involve asking language models a simple arithmetic\nproblem in natural language.\n\nSingle digit 3 operations", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arithmetic", "config_name": "arithmetic_1dc", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 97651, "num_examples": 2000, "dataset_name": "arithmetic"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/single_digit_three_ops.jsonl": {"num_bytes": 139651, "checksum": "08b34e3272a8ff1d4932d63f251519d14c485c38d582366e1e323d0b859c3925"}}, "download_size": 139651, "post_processing_size": null, "dataset_size": 97651, "size_in_bytes": 237302}}
diff --git a/lm_eval/datasets/asdiv/asdiv.py b/lm_eval/datasets/asdiv/asdiv.py
index 9e7da76472..927de50c2b 100644
--- a/lm_eval/datasets/asdiv/asdiv.py
+++ b/lm_eval/datasets/asdiv/asdiv.py
@@ -50,13 +50,16 @@
 class ASDiv(datasets.GeneratorBasedBuilder):
-    """ ASDiv: A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers """
+    """ASDiv: A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers"""

     VERSION = datasets.Version("0.0.1")

     BUILDER_CONFIGS = [
-        datasets.BuilderConfig(name="asdiv", version=VERSION,
-                               description="A diverse corpus for evaluating and developing english math word problem solvers")
+        datasets.BuilderConfig(
+            name="asdiv",
+            version=VERSION,
+            description="A diverse corpus for evaluating and developing English math word problem solvers",
+        )
     ]

     def _info(self):
@@ -86,7 +89,9 @@ def _split_generators(self, dl_manager):
                 name=datasets.Split.VALIDATION,
                 # These kwargs will be passed to _generate_examples
                 gen_kwargs={
-                    "filepath": os.path.join(data_dir, base_filepath, "dataset", "ASDiv.xml"),
+                    "filepath": os.path.join(
+                        data_dir, base_filepath, "dataset", "ASDiv.xml"
+                    ),
                     "split": datasets.Split.VALIDATION,
                 },
             ),
diff --git a/lm_eval/datasets/asdiv/dataset_infos.json b/lm_eval/datasets/asdiv/dataset_infos.json
index f6757fbcdc..cfeea0d389 100644
--- a/lm_eval/datasets/asdiv/dataset_infos.json
+++ b/lm_eval/datasets/asdiv/dataset_infos.json
@@ -1 +1 @@
-{"asdiv": {"description": "ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language\npatterns and problem types) English math word problem (MWP) corpus for evaluating\nthe capability of various MWP solvers. Existing MWP corpora for studying AI progress\nremain limited either in language usage patterns or in problem types. We thus present\na new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem\ntypes taught in elementary school.
Each MWP is annotated with its problem type and grade\nlevel (for indicating the level of difficulty).\n", "citation": "@misc{miao2021diverse,\n title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},\n author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},\n year={2021},\n eprint={2106.15772},\n archivePrefix={arXiv},\n primaryClass={cs.AI}\n}\n", "homepage": "https://github.com/chaochun/nlu-asdiv-dataset", "license": "", "features": {"body": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "solution_type": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "formula": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "as_div", "config_name": "asdiv", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 501489, "num_examples": 2305, "dataset_name": "as_div"}}, "download_checksums": {"https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip": {"num_bytes": 440966, "checksum": "8f1fe4f6d5f170ec1e24ab78c244153c14c568b1bb2b1dad0324e71f37939a2d"}}, "download_size": 440966, "post_processing_size": null, "dataset_size": 501489, "size_in_bytes": 942455}} \ No newline at end of file +{"asdiv": {"description": "ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language\npatterns and problem types) English math word problem (MWP) corpus for evaluating\nthe capability of various MWP solvers. Existing MWP corpora for studying AI progress\nremain limited either in language usage patterns or in problem types. We thus present\na new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem\ntypes taught in elementary school. 
Each MWP is annotated with its problem type and grade\nlevel (for indicating the level of difficulty).\n", "citation": "@misc{miao2021diverse,\n title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},\n author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},\n year={2021},\n eprint={2106.15772},\n archivePrefix={arXiv},\n primaryClass={cs.AI}\n}\n", "homepage": "https://github.com/chaochun/nlu-asdiv-dataset", "license": "", "features": {"body": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "solution_type": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "formula": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "as_div", "config_name": "asdiv", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 501489, "num_examples": 2305, "dataset_name": "as_div"}}, "download_checksums": {"https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip": {"num_bytes": 440966, "checksum": "8f1fe4f6d5f170ec1e24ab78c244153c14c568b1bb2b1dad0324e71f37939a2d"}}, "download_size": 440966, "post_processing_size": null, "dataset_size": 501489, "size_in_bytes": 942455}} diff --git a/lm_eval/datasets/coqa/coqa.py b/lm_eval/datasets/coqa/coqa.py index cc50f20f38..0f0983e172 100644 --- a/lm_eval/datasets/coqa/coqa.py +++ b/lm_eval/datasets/coqa/coqa.py @@ -61,7 +61,7 @@ "span_end": -1, "span_text": "", "input_text": "", - "turn_id": -1 + "turn_id": -1, } ], "1": [ @@ -70,7 +70,7 @@ "span_end": -1, "span_text": "", "input_text": "", - "turn_id": -1 + "turn_id": -1, } ], "2": [ @@ -79,7 +79,7 @@ "span_end": -1, "span_text": "", "input_text": "", - "turn_id": -1 + "turn_id": -1, } ], } @@ -91,8 +91,9 @@ class Coqa(datasets.GeneratorBasedBuilder): VERSION = datasets.Version("0.0.1") BUILDER_CONFIGS = [ - datasets.BuilderConfig(name="coqa", version=VERSION, - description="The CoQA dataset."), + datasets.BuilderConfig( + name="coqa", version=VERSION, description="The CoQA dataset." 
+ ), ] def _info(self): @@ -101,41 +102,52 @@ def _info(self): "id": datasets.Value("string"), "source": datasets.Value("string"), "story": datasets.Value("string"), - "questions": datasets.features.Sequence({ - "input_text": datasets.Value("string"), - "turn_id": datasets.Value("int32"), - }), - "answers": datasets.features.Sequence({ - "span_start": datasets.Value("int32"), - "span_end": datasets.Value("int32"), - "span_text": datasets.Value("string"), - "input_text": datasets.Value("string"), - "turn_id": datasets.Value("int32"), - }), - "additional_answers": { - "0": datasets.features.Sequence({ - "span_start": datasets.Value("int32"), - "span_end": datasets.Value("int32"), - "span_text": datasets.Value("string"), - "input_text": datasets.Value("string"), - "turn_id": datasets.Value("int32"), - }), - "1": datasets.features.Sequence({ - "span_start": datasets.Value("int32"), - "span_end": datasets.Value("int32"), - "span_text": datasets.Value("string"), + "questions": datasets.features.Sequence( + { "input_text": datasets.Value("string"), "turn_id": datasets.Value("int32"), - }), - "2": datasets.features.Sequence({ + } + ), + "answers": datasets.features.Sequence( + { "span_start": datasets.Value("int32"), "span_end": datasets.Value("int32"), "span_text": datasets.Value("string"), "input_text": datasets.Value("string"), "turn_id": datasets.Value("int32"), - }), - } - }) + } + ), + "additional_answers": { + "0": datasets.features.Sequence( + { + "span_start": datasets.Value("int32"), + "span_end": datasets.Value("int32"), + "span_text": datasets.Value("string"), + "input_text": datasets.Value("string"), + "turn_id": datasets.Value("int32"), + } + ), + "1": datasets.features.Sequence( + { + "span_start": datasets.Value("int32"), + "span_end": datasets.Value("int32"), + "span_text": datasets.Value("string"), + "input_text": datasets.Value("string"), + "turn_id": datasets.Value("int32"), + } + ), + "2": datasets.features.Sequence( + { + "span_start": datasets.Value("int32"), + "span_end": datasets.Value("int32"), + "span_text": datasets.Value("string"), + "input_text": datasets.Value("string"), + "turn_id": datasets.Value("int32"), + } + ), + }, + } + ) return datasets.DatasetInfo( description=_DESCRIPTION, features=features, @@ -175,10 +187,7 @@ def _generate_examples(self, filepath, split): source = row["source"] story = row["story"] questions = [ - { - "input_text": q["input_text"], - "turn_id": q["turn_id"] - } + {"input_text": q["input_text"], "turn_id": q["turn_id"]} for q in row["questions"] ] answers = [ @@ -187,7 +196,7 @@ def _generate_examples(self, filepath, split): "span_end": a["span_end"], "span_text": a["span_text"], "input_text": a["input_text"], - "turn_id": a["turn_id"] + "turn_id": a["turn_id"], } for a in row["answers"] ] @@ -201,7 +210,7 @@ def _generate_examples(self, filepath, split): "span_end": a0["span_end"], "span_text": a0["span_text"], "input_text": a0["input_text"], - "turn_id": a0["turn_id"] + "turn_id": a0["turn_id"], } for a0 in row["additional_answers"]["0"] ], @@ -211,7 +220,7 @@ def _generate_examples(self, filepath, split): "span_end": a1["span_end"], "span_text": a1["span_text"], "input_text": a1["input_text"], - "turn_id": a1["turn_id"] + "turn_id": a1["turn_id"], } for a1 in row["additional_answers"]["1"] ], @@ -221,7 +230,7 @@ def _generate_examples(self, filepath, split): "span_end": a2["span_end"], "span_text": a2["span_text"], "input_text": a2["input_text"], - "turn_id": a2["turn_id"] + "turn_id": a2["turn_id"], } for a2 in 
row["additional_answers"]["2"] ], @@ -232,5 +241,5 @@ def _generate_examples(self, filepath, split): "source": source, "questions": questions, "answers": answers, - "additional_answers": additional_answers + "additional_answers": additional_answers, } diff --git a/lm_eval/datasets/coqa/dataset_infos.json b/lm_eval/datasets/coqa/dataset_infos.json index 87cdb48586..ff8ab4a731 100644 --- a/lm_eval/datasets/coqa/dataset_infos.json +++ b/lm_eval/datasets/coqa/dataset_infos.json @@ -1 +1 @@ -{"coqa": {"description": "CoQA is a large-scale dataset for building Conversational Question Answering\nsystems. The goal of the CoQA challenge is to measure the ability of machines to\nunderstand a text passage and answer a series of interconnected questions that\nappear in a conversation.\n", "citation": "@misc{reddy2018coqa,\n title={CoQA: A Conversational Question Answering Challenge},\n author={Siva Reddy and Danqi Chen and Christopher D. Manning},\n year={2018},\n eprint={1808.07042},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://stanfordnlp.github.io/coqa/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}, "story": {"dtype": "string", "id": null, "_type": "Value"}, "questions": {"feature": {"input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "answers": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "additional_answers": {"0": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "1": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "2": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "coqa", "config_name": "coqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 26250528, "num_examples": 7199, "dataset_name": "coqa"}, "validation": {"name": "validation", "num_bytes": 3765933, "num_examples": 500, "dataset_name": "coqa"}}, "download_checksums": 
{"https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json": {"num_bytes": 49001836, "checksum": "b0fdb2bc1bd38dd3ca2ce5fa2ac3e02c6288ac914f241ac409a655ffb6619fa6"}, "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json": {"num_bytes": 9090845, "checksum": "dfa367a9733ce53222918d0231d9b3bedc2b8ee831a2845f62dfc70701f2540a"}}, "download_size": 58092681, "post_processing_size": null, "dataset_size": 30016461, "size_in_bytes": 88109142}} \ No newline at end of file +{"coqa": {"description": "CoQA is a large-scale dataset for building Conversational Question Answering\nsystems. The goal of the CoQA challenge is to measure the ability of machines to\nunderstand a text passage and answer a series of interconnected questions that\nappear in a conversation.\n", "citation": "@misc{reddy2018coqa,\n title={CoQA: A Conversational Question Answering Challenge},\n author={Siva Reddy and Danqi Chen and Christopher D. Manning},\n year={2018},\n eprint={1808.07042},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://stanfordnlp.github.io/coqa/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}, "story": {"dtype": "string", "id": null, "_type": "Value"}, "questions": {"feature": {"input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "answers": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "additional_answers": {"0": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "1": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "2": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "coqa", "config_name": "coqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 26250528, "num_examples": 7199, "dataset_name": "coqa"}, "validation": {"name": "validation", "num_bytes": 3765933, "num_examples": 500, "dataset_name": "coqa"}}, "download_checksums": {"https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json": 
{"num_bytes": 49001836, "checksum": "b0fdb2bc1bd38dd3ca2ce5fa2ac3e02c6288ac914f241ac409a655ffb6619fa6"}, "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json": {"num_bytes": 9090845, "checksum": "dfa367a9733ce53222918d0231d9b3bedc2b8ee831a2845f62dfc70701f2540a"}}, "download_size": 58092681, "post_processing_size": null, "dataset_size": 30016461, "size_in_bytes": 88109142}} diff --git a/lm_eval/datasets/drop/dataset_infos.json b/lm_eval/datasets/drop/dataset_infos.json index b4e4a96a09..f155e7720d 100644 --- a/lm_eval/datasets/drop/dataset_infos.json +++ b/lm_eval/datasets/drop/dataset_infos.json @@ -1 +1 @@ -{"drop": {"description": "DROP is a QA dataset which tests comprehensive understanding of paragraphs. In \nthis crowdsourced, adversarially-created, 96k question-answering benchmark, a \nsystem must resolve multiple references in a question, map them onto a paragraph,\nand perform discrete operations over them (such as addition, counting, or sorting).\n", "citation": "@misc{dua2019drop,\n title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs}, \n author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner},\n year={2019},\n eprint={1903.00161},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://allenai.org/data/drop", "license": "", "features": {"section_id": {"dtype": "string", "id": null, "_type": "Value"}, "passage": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "query_id": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"number": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"day": {"dtype": "string", "id": null, "_type": "Value"}, "month": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}}, "spans": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "worker_id": {"dtype": "string", "id": null, "_type": "Value"}, "hit_id": {"dtype": "string", "id": null, "_type": "Value"}}, "validated_answers": {"feature": {"number": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"day": {"dtype": "string", "id": null, "_type": "Value"}, "month": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}}, "spans": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "worker_id": {"dtype": "string", "id": null, "_type": "Value"}, "hit_id": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "drop", "config_name": "drop", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 108858121, "num_examples": 77409, "dataset_name": "drop"}, "validation": {"name": "validation", "num_bytes": 12560739, "num_examples": 9536, "dataset_name": "drop"}}, "download_checksums": {"https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip": {"num_bytes": 8308692, "checksum": "39d2278a29fd729de301b111a45f434c24834f40df8f4ff116d864589e3249d6"}}, "download_size": 8308692, "post_processing_size": null, "dataset_size": 121418860, "size_in_bytes": 129727552}} \ No newline at end of file +{"drop": {"description": "DROP is a QA dataset which 
tests comprehensive understanding of paragraphs. In \nthis crowdsourced, adversarially-created, 96k question-answering benchmark, a \nsystem must resolve multiple references in a question, map them onto a paragraph,\nand perform discrete operations over them (such as addition, counting, or sorting).\n", "citation": "@misc{dua2019drop,\n title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs}, \n author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner},\n year={2019},\n eprint={1903.00161},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://allenai.org/data/drop", "license": "", "features": {"section_id": {"dtype": "string", "id": null, "_type": "Value"}, "passage": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "query_id": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"number": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"day": {"dtype": "string", "id": null, "_type": "Value"}, "month": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}}, "spans": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "worker_id": {"dtype": "string", "id": null, "_type": "Value"}, "hit_id": {"dtype": "string", "id": null, "_type": "Value"}}, "validated_answers": {"feature": {"number": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"day": {"dtype": "string", "id": null, "_type": "Value"}, "month": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}}, "spans": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "worker_id": {"dtype": "string", "id": null, "_type": "Value"}, "hit_id": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "drop", "config_name": "drop", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 108858121, "num_examples": 77409, "dataset_name": "drop"}, "validation": {"name": "validation", "num_bytes": 12560739, "num_examples": 9536, "dataset_name": "drop"}}, "download_checksums": {"https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip": {"num_bytes": 8308692, "checksum": "39d2278a29fd729de301b111a45f434c24834f40df8f4ff116d864589e3249d6"}}, "download_size": 8308692, "post_processing_size": null, "dataset_size": 121418860, "size_in_bytes": 129727552}} diff --git a/lm_eval/datasets/drop/drop.py b/lm_eval/datasets/drop/drop.py index b72f129593..8d3bae0010 100644 --- a/lm_eval/datasets/drop/drop.py +++ b/lm_eval/datasets/drop/drop.py @@ -25,7 +25,7 @@ _CITATION = """\ @misc{dua2019drop, - title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs}, + title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs}, author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner}, year={2019}, eprint={1903.00161}, @@ -35,8 +35,8 @@ """ _DESCRIPTION = """\ -DROP is a QA dataset which tests comprehensive understanding of paragraphs. 
In -this crowdsourced, adversarially-created, 96k question-answering benchmark, a +DROP is a QA dataset which tests comprehensive understanding of paragraphs. In +this crowdsourced, adversarially-created, 96k question-answering benchmark, a system must resolve multiple references in a question, map them onto a paragraph, and perform discrete operations over them (such as addition, counting, or sorting). """ @@ -50,17 +50,19 @@ "drop": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip", } -_EMPTY_VALIDATED_ANSWER = [{ - "number": "", - "date": { - "day": "", - "month": "", - "year": "", - }, - "spans": [], - "worker_id": "", - "hit_id": "" -}] +_EMPTY_VALIDATED_ANSWER = [ + { + "number": "", + "date": { + "day": "", + "month": "", + "year": "", + }, + "spans": [], + "worker_id": "", + "hit_id": "", + } +] class Drop(datasets.GeneratorBasedBuilder): @@ -69,39 +71,44 @@ class Drop(datasets.GeneratorBasedBuilder): VERSION = datasets.Version("0.0.1") BUILDER_CONFIGS = [ - datasets.BuilderConfig(name="drop", version=VERSION, - description="The DROP dataset."), + datasets.BuilderConfig( + name="drop", version=VERSION, description="The DROP dataset." + ), ] def _info(self): - features = datasets.Features({ - "section_id": datasets.Value("string"), - "passage": datasets.Value("string"), - "question": datasets.Value("string"), - "query_id": datasets.Value("string"), - "answer": { - "number": datasets.Value("string"), - "date": { - "day": datasets.Value("string"), - "month": datasets.Value("string"), - "year": datasets.Value("string"), + features = datasets.Features( + { + "section_id": datasets.Value("string"), + "passage": datasets.Value("string"), + "question": datasets.Value("string"), + "query_id": datasets.Value("string"), + "answer": { + "number": datasets.Value("string"), + "date": { + "day": datasets.Value("string"), + "month": datasets.Value("string"), + "year": datasets.Value("string"), + }, + "spans": datasets.features.Sequence(datasets.Value("string")), + "worker_id": datasets.Value("string"), + "hit_id": datasets.Value("string"), }, - "spans": datasets.features.Sequence(datasets.Value("string")), - "worker_id": datasets.Value("string"), - "hit_id": datasets.Value("string"), - }, - "validated_answers": datasets.features.Sequence({ - "number": datasets.Value("string"), - "date": { - "day": datasets.Value("string"), - "month": datasets.Value("string"), - "year": datasets.Value("string"), - }, - "spans": datasets.features.Sequence(datasets.Value("string")), - "worker_id": datasets.Value("string"), - "hit_id": datasets.Value("string"), - }), - }) + "validated_answers": datasets.features.Sequence( + { + "number": datasets.Value("string"), + "date": { + "day": datasets.Value("string"), + "month": datasets.Value("string"), + "year": datasets.Value("string"), + }, + "spans": datasets.features.Sequence(datasets.Value("string")), + "worker_id": datasets.Value("string"), + "hit_id": datasets.Value("string"), + } + ), + } + ) return datasets.DatasetInfo( description=_DESCRIPTION, features=features, @@ -118,7 +125,9 @@ def _split_generators(self, dl_manager): name=datasets.Split.TRAIN, # These kwargs will be passed to _generate_examples gen_kwargs={ - "filepath": os.path.join(data_dir, "drop_dataset", "drop_dataset_train.json"), + "filepath": os.path.join( + data_dir, "drop_dataset", "drop_dataset_train.json" + ), "split": "train", }, ), @@ -126,7 +135,9 @@ def _split_generators(self, dl_manager): name=datasets.Split.VALIDATION, # These kwargs will be passed to 
_generate_examples gen_kwargs={ - "filepath": os.path.join(data_dir, "drop_dataset", "drop_dataset_dev.json"), + "filepath": os.path.join( + data_dir, "drop_dataset", "drop_dataset_dev.json" + ), "split": "validation", }, ), diff --git a/lm_eval/datasets/gsm8k/dataset_infos.json b/lm_eval/datasets/gsm8k/dataset_infos.json index ea0f68371c..8984ea99b2 100644 --- a/lm_eval/datasets/gsm8k/dataset_infos.json +++ b/lm_eval/datasets/gsm8k/dataset_infos.json @@ -1 +1 @@ -{"gsm8k": {"description": "State-of-the-art language models can match human performance on many tasks, but \nthey still struggle to robustly perform multi-step mathematical reasoning. To \ndiagnose the failures of current models and support research, we introduce GSM8K,\na dataset of 8.5K high quality linguistically diverse grade school math word problems.\nWe find that even the largest transformer models fail to achieve high test performance, \ndespite the conceptual simplicity of this problem distribution.\n", "citation": "@misc{cobbe2021training,\n title={Training Verifiers to Solve Math Word Problems},\n author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},\n year={2021},\n eprint={2110.14168},\n archivePrefix={arXiv},\n primaryClass={cs.LG}\n}\n", "homepage": "https://github.com/openai/grade-school-math", "license": "", "features": {"question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "gsm8_k", "config_name": "gsm8k", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 3963202, "num_examples": 7473, "dataset_name": "gsm8_k"}, "test": {"name": "test", "num_bytes": 713732, "num_examples": 1319, "dataset_name": "gsm8_k"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/train.jsonl": {"num_bytes": 4166206, "checksum": "17f347dc51477c50d4efb83959dbb7c56297aba886e5544ee2aaed3024813465"}, "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl": {"num_bytes": 749738, "checksum": "3730d312f6e3440559ace48831e51066acaca737f6eabec99bccb9e4b3c39d14"}}, "download_size": 4915944, "post_processing_size": null, "dataset_size": 4676934, "size_in_bytes": 9592878}} \ No newline at end of file +{"gsm8k": {"description": "State-of-the-art language models can match human performance on many tasks, but \nthey still struggle to robustly perform multi-step mathematical reasoning. 
To \ndiagnose the failures of current models and support research, we introduce GSM8K,\na dataset of 8.5K high quality linguistically diverse grade school math word problems.\nWe find that even the largest transformer models fail to achieve high test performance, \ndespite the conceptual simplicity of this problem distribution.\n", "citation": "@misc{cobbe2021training,\n title={Training Verifiers to Solve Math Word Problems},\n author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},\n year={2021},\n eprint={2110.14168},\n archivePrefix={arXiv},\n primaryClass={cs.LG}\n}\n", "homepage": "https://github.com/openai/grade-school-math", "license": "", "features": {"question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "gsm8_k", "config_name": "gsm8k", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 3963202, "num_examples": 7473, "dataset_name": "gsm8_k"}, "test": {"name": "test", "num_bytes": 713732, "num_examples": 1319, "dataset_name": "gsm8_k"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/train.jsonl": {"num_bytes": 4166206, "checksum": "17f347dc51477c50d4efb83959dbb7c56297aba886e5544ee2aaed3024813465"}, "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl": {"num_bytes": 749738, "checksum": "3730d312f6e3440559ace48831e51066acaca737f6eabec99bccb9e4b3c39d14"}}, "download_size": 4915944, "post_processing_size": null, "dataset_size": 4676934, "size_in_bytes": 9592878}} diff --git a/lm_eval/datasets/gsm8k/gsm8k.py b/lm_eval/datasets/gsm8k/gsm8k.py index 94cb161c86..02c981e654 100644 --- a/lm_eval/datasets/gsm8k/gsm8k.py +++ b/lm_eval/datasets/gsm8k/gsm8k.py @@ -31,11 +31,11 @@ """ _DESCRIPTION = """\ -State-of-the-art language models can match human performance on many tasks, but -they still struggle to robustly perform multi-step mathematical reasoning. To +State-of-the-art language models can match human performance on many tasks, but +they still struggle to robustly perform multi-step mathematical reasoning. To diagnose the failures of current models and support research, we introduce GSM8K, a dataset of 8.5K high quality linguistically diverse grade school math word problems. -We find that even the largest transformer models fail to achieve high test performance, +We find that even the largest transformer models fail to achieve high test performance, despite the conceptual simplicity of this problem distribution. 
""" @@ -56,8 +56,11 @@ class GSM8K(datasets.GeneratorBasedBuilder): VERSION = datasets.Version("0.0.1") BUILDER_CONFIGS = [ - datasets.BuilderConfig(name="gsm8k", version=VERSION, - description="The Grade School Math 8k dataset."), + datasets.BuilderConfig( + name="gsm8k", + version=VERSION, + description="The Grade School Math 8k dataset.", + ), ] def _info(self): @@ -90,10 +93,7 @@ def _split_generators(self, dl_manager): datasets.SplitGenerator( name=datasets.Split.TEST, # These kwargs will be passed to _generate_examples - gen_kwargs={ - "filepath": data_dir["test"], - "split": "test" - }, + gen_kwargs={"filepath": data_dir["test"], "split": "test"}, ), ] diff --git a/lm_eval/datasets/headqa/dataset_infos.json b/lm_eval/datasets/headqa/dataset_infos.json index f32cb50ece..47d6707dbd 100644 --- a/lm_eval/datasets/headqa/dataset_infos.json +++ b/lm_eval/datasets/headqa/dataset_infos.json @@ -1 +1 @@ -{"es": {"description": "HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.\n", "citation": "@inproceedings{vilares-gomez-rodriguez-2019-head,\n title = \"{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning\",\n author = \"Vilares, David and\n G{'o}mez-Rodr{'i}guez, Carlos\",\n booktitle = \"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics\",\n month = jul,\n year = \"2019\",\n address = \"Florence, Italy\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/P19-1092\",\n doi = \"10.18653/v1/P19-1092\",\n pages = \"960--966\",\n abstract = \"We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. 
We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance, demonstrating its usefulness as a benchmark for future work.\",\n}\n", "homepage": "https://aghie.github.io/head-qa/", "license": "MIT License", "features": {"name": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}, "category": {"dtype": "string", "id": null, "_type": "Value"}, "qid": {"dtype": "int32", "id": null, "_type": "Value"}, "qtext": {"dtype": "string", "id": null, "_type": "Value"}, "ra": {"dtype": "int32", "id": null, "_type": "Value"}, "answers": [{"aid": {"dtype": "int32", "id": null, "_type": "Value"}, "atext": {"dtype": "string", "id": null, "_type": "Value"}}]}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "head_qa", "config_name": "es", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1196021, "num_examples": 2657, "dataset_name": "head_qa"}, "test": {"name": "test", "num_bytes": 1169819, "num_examples": 2742, "dataset_name": "head_qa"}, "validation": {"name": "validation", "num_bytes": 556924, "num_examples": 1366, "dataset_name": "head_qa"}}, "download_checksums": {"https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t": {"num_bytes": 79365502, "checksum": "6ec29a3f55153d167f0bdf05395558919ba0b1df9c63e79ffceda2a09884ad8b"}}, "download_size": 79365502, "post_processing_size": null, "dataset_size": 2922764, "size_in_bytes": 82288266}, "en": {"description": "HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.\n", "citation": "@inproceedings{vilares-gomez-rodriguez-2019-head,\n title = \"{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning\",\n author = \"Vilares, David and\n G{'o}mez-Rodr{'i}guez, Carlos\",\n booktitle = \"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics\",\n month = jul,\n year = \"2019\",\n address = \"Florence, Italy\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/P19-1092\",\n doi = \"10.18653/v1/P19-1092\",\n pages = \"960--966\",\n abstract = \"We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. 
We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance, demonstrating its usefulness as a benchmark for future work.\",\n}\n", "homepage": "https://aghie.github.io/head-qa/", "license": "MIT License", "features": {"name": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}, "category": {"dtype": "string", "id": null, "_type": "Value"}, "qid": {"dtype": "int32", "id": null, "_type": "Value"}, "qtext": {"dtype": "string", "id": null, "_type": "Value"}, "ra": {"dtype": "int32", "id": null, "_type": "Value"}, "answers": [{"aid": {"dtype": "int32", "id": null, "_type": "Value"}, "atext": {"dtype": "string", "id": null, "_type": "Value"}}]}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "head_qa", "config_name": "en", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1123151, "num_examples": 2657, "dataset_name": "head_qa"}, "test": {"name": "test", "num_bytes": 1097349, "num_examples": 2742, "dataset_name": "head_qa"}, "validation": {"name": "validation", "num_bytes": 523462, "num_examples": 1366, "dataset_name": "head_qa"}}, "download_checksums": {"https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t": {"num_bytes": 79365502, "checksum": "6ec29a3f55153d167f0bdf05395558919ba0b1df9c63e79ffceda2a09884ad8b"}}, "download_size": 79365502, "post_processing_size": null, "dataset_size": 2743962, "size_in_bytes": 82109464}} \ No newline at end of file +{"es": {"description": "HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.\n", "citation": "@inproceedings{vilares-gomez-rodriguez-2019-head,\n title = \"{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning\",\n author = \"Vilares, David and\n G{'o}mez-Rodr{'i}guez, Carlos\",\n booktitle = \"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics\",\n month = jul,\n year = \"2019\",\n address = \"Florence, Italy\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/P19-1092\",\n doi = \"10.18653/v1/P19-1092\",\n pages = \"960--966\",\n abstract = \"We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. 
We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance, demonstrating its usefulness as a benchmark for future work.\",\n}\n", "homepage": "https://aghie.github.io/head-qa/", "license": "MIT License", "features": {"name": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}, "category": {"dtype": "string", "id": null, "_type": "Value"}, "qid": {"dtype": "int32", "id": null, "_type": "Value"}, "qtext": {"dtype": "string", "id": null, "_type": "Value"}, "ra": {"dtype": "int32", "id": null, "_type": "Value"}, "answers": [{"aid": {"dtype": "int32", "id": null, "_type": "Value"}, "atext": {"dtype": "string", "id": null, "_type": "Value"}}]}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "head_qa", "config_name": "es", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1196021, "num_examples": 2657, "dataset_name": "head_qa"}, "test": {"name": "test", "num_bytes": 1169819, "num_examples": 2742, "dataset_name": "head_qa"}, "validation": {"name": "validation", "num_bytes": 556924, "num_examples": 1366, "dataset_name": "head_qa"}}, "download_checksums": {"https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t": {"num_bytes": 79365502, "checksum": "6ec29a3f55153d167f0bdf05395558919ba0b1df9c63e79ffceda2a09884ad8b"}}, "download_size": 79365502, "post_processing_size": null, "dataset_size": 2922764, "size_in_bytes": 82288266}, "en": {"description": "HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.\n", "citation": "@inproceedings{vilares-gomez-rodriguez-2019-head,\n title = \"{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning\",\n author = \"Vilares, David and\n G{'o}mez-Rodr{'i}guez, Carlos\",\n booktitle = \"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics\",\n month = jul,\n year = \"2019\",\n address = \"Florence, Italy\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/P19-1092\",\n doi = \"10.18653/v1/P19-1092\",\n pages = \"960--966\",\n abstract = \"We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. 
We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance, demonstrating its usefulness as a benchmark for future work.\",\n}\n", "homepage": "https://aghie.github.io/head-qa/", "license": "MIT License", "features": {"name": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}, "category": {"dtype": "string", "id": null, "_type": "Value"}, "qid": {"dtype": "int32", "id": null, "_type": "Value"}, "qtext": {"dtype": "string", "id": null, "_type": "Value"}, "ra": {"dtype": "int32", "id": null, "_type": "Value"}, "answers": [{"aid": {"dtype": "int32", "id": null, "_type": "Value"}, "atext": {"dtype": "string", "id": null, "_type": "Value"}}]}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "head_qa", "config_name": "en", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1123151, "num_examples": 2657, "dataset_name": "head_qa"}, "test": {"name": "test", "num_bytes": 1097349, "num_examples": 2742, "dataset_name": "head_qa"}, "validation": {"name": "validation", "num_bytes": 523462, "num_examples": 1366, "dataset_name": "head_qa"}}, "download_checksums": {"https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t": {"num_bytes": 79365502, "checksum": "6ec29a3f55153d167f0bdf05395558919ba0b1df9c63e79ffceda2a09884ad8b"}}, "download_size": 79365502, "post_processing_size": null, "dataset_size": 2743962, "size_in_bytes": 82109464}} diff --git a/lm_eval/datasets/headqa/headqa.py b/lm_eval/datasets/headqa/headqa.py index 8e5a4d9c14..73be342b98 100644 --- a/lm_eval/datasets/headqa/headqa.py +++ b/lm_eval/datasets/headqa/headqa.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -14,7 +13,7 @@ # limitations under the License. # # NOTE: This is an exact copy of -# https://github.com/huggingface/datasets/blob/3804442bb7cfcb9d52044d92688115cfdc69c2da/datasets/head_qa/head_qa.py +# https://github.com/huggingface/datasets/blob/3804442bb7cfcb9d52044d92688115cfdc69c2da/datasets/head_qa/head_qa.py # with the exception of the `image` feature. This is to avoid adding `Pillow` # as a dependency. 
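The headqa.py builder in this diff follows the usual Hugging Face `datasets` script pattern: one `BuilderConfig` per language ("es", "en"), `_split_generators` wiring the downloaded archive to train/test/validation splits, and `_generate_examples` yielding one question per example. As a minimal sketch of how such a local script is typically exercised (the script path, config names, and feature names are taken from this patch; the snippet is illustrative and not part of the change):

    import datasets

    # Load the English config of the local HEAD-QA script. "es" is the
    # builder's DEFAULT_CONFIG_NAME, so "en" must be requested explicitly.
    head_qa = datasets.load_dataset(
        "lm_eval/datasets/headqa/headqa.py", "en", split="validation"
    )

    # Each example exposes the features declared in _info(): qtext, ra
    # (the gold answer id), and a list of {aid, atext} answer options.
    example = head_qa[0]
    print(example["qtext"])
    print([answer["atext"] for answer in example["answers"]])
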
"""HEAD-QA: A Healthcare Dataset for Complex Reasoning.""" @@ -65,8 +64,12 @@ class HeadQA(datasets.GeneratorBasedBuilder): VERSION = datasets.Version("1.1.0") BUILDER_CONFIGS = [ - datasets.BuilderConfig(name="es", version=VERSION, description="Spanish HEAD dataset"), - datasets.BuilderConfig(name="en", version=VERSION, description="English HEAD dataset"), + datasets.BuilderConfig( + name="es", version=VERSION, description="Spanish HEAD dataset" + ), + datasets.BuilderConfig( + name="en", version=VERSION, description="English HEAD dataset" + ), ] DEFAULT_CONFIG_NAME = "es" @@ -106,15 +109,24 @@ def _split_generators(self, dl_manager): return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, - gen_kwargs={"data_dir": data_dir, "filepath": os.path.join(data_lang_dir, f"train_{dir}.json")}, + gen_kwargs={ + "data_dir": data_dir, + "filepath": os.path.join(data_lang_dir, f"train_{dir}.json"), + }, ), datasets.SplitGenerator( name=datasets.Split.TEST, - gen_kwargs={"data_dir": data_dir, "filepath": os.path.join(data_lang_dir, f"test_{dir}.json")}, + gen_kwargs={ + "data_dir": data_dir, + "filepath": os.path.join(data_lang_dir, f"test_{dir}.json"), + }, ), datasets.SplitGenerator( name=datasets.Split.VALIDATION, - gen_kwargs={"data_dir": data_dir, "filepath": os.path.join(data_lang_dir, f"dev_{dir}.json")}, + gen_kwargs={ + "data_dir": data_dir, + "filepath": os.path.join(data_lang_dir, f"dev_{dir}.json"), + }, ), ] @@ -134,7 +146,9 @@ def _generate_examples(self, data_dir, filepath): aids = [answer["aid"] for answer in question["answers"]] atexts = [answer["atext"].strip() for answer in question["answers"]] - answers = [{"aid": aid, "atext": atext} for aid, atext in zip(aids, atexts)] + answers = [ + {"aid": aid, "atext": atext} for aid, atext in zip(aids, atexts) + ] id_ = f"{exam_id}_{qid}" yield id_, { diff --git a/lm_eval/datasets/hendrycks_ethics/dataset_infos.json b/lm_eval/datasets/hendrycks_ethics/dataset_infos.json index 2a7dad72dd..54aecc3bed 100644 --- a/lm_eval/datasets/hendrycks_ethics/dataset_infos.json +++ b/lm_eval/datasets/hendrycks_ethics/dataset_infos.json @@ -1 +1 @@ -{"commonsense": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Commonsense subset contains examples focusing on moral standards and principles that most people intuitively accept.", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"label": {"dtype": "int32", "id": null, "_type": "Value"}, "input": {"dtype": "string", "id": null, "_type": "Value"}, "is_short": {"dtype": "bool", "id": null, "_type": "Value"}, "edited": {"dtype": "bool", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "commonsense", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 14435215, "num_examples": 13910, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 3150094, "num_examples": 3885, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 17585309, "size_in_bytes": 53170333}, "deontology": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Deontology subset contains examples focusing on whether an act is required, permitted, or forbidden according to a set of rules or constraints", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"group_id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "int32", "id": null, "_type": "Value"}, "scenario": {"dtype": "string", "id": null, "_type": "Value"}, "excuse": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "deontology", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1931475, "num_examples": 18164, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 384602, "num_examples": 3596, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 2316077, "size_in_bytes": 37901101}, "justice": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Justice subset contains examples focusing on how a character treats another person", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"group_id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "int32", "id": null, "_type": "Value"}, "scenario": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "justice", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 2516501, "num_examples": 21791, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 309427, "num_examples": 2704, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 2825928, "size_in_bytes": 38410952}, "utilitarianism": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Utilitarianism subset contains scenarios that should be ranked from most pleasant to least pleasant for the person in the scenario", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"activity": {"dtype": "string", "id": null, "_type": "Value"}, "baseline": {"dtype": "string", "id": null, "_type": "Value"}, "rating": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "utilitarianism", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 2241770, "num_examples": 13738, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 749768, "num_examples": 4808, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 2991538, "size_in_bytes": 38576562}, "virtue": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Virtue subset contains scenarios focusing on whether virtues or vices are being exemplified", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"group_id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "int32", "id": null, "_type": "Value"}, "scenario": {"dtype": "string", "id": null, "_type": "Value"}, "trait": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "virtue", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 2640328, "num_examples": 28245, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 473473, "num_examples": 4975, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 3113801, "size_in_bytes": 38698825}} \ No newline at end of file +{"commonsense": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Commonsense subset contains examples focusing on moral standards and principles that most people intuitively accept.", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"label": {"dtype": "int32", "id": null, "_type": "Value"}, "input": {"dtype": "string", "id": null, "_type": "Value"}, "is_short": {"dtype": "bool", "id": null, "_type": "Value"}, "edited": {"dtype": "bool", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "commonsense", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 14435215, "num_examples": 13910, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 3150094, "num_examples": 3885, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 17585309, "size_in_bytes": 53170333}, "deontology": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Deontology subset contains examples focusing on whether an act is required, permitted, or forbidden according to a set of rules or constraints", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"group_id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "int32", "id": null, "_type": "Value"}, "scenario": {"dtype": "string", "id": null, "_type": "Value"}, "excuse": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "deontology", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1931475, "num_examples": 18164, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 384602, "num_examples": 3596, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 2316077, "size_in_bytes": 37901101}, "justice": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Justice subset contains examples focusing on how a character treats another person", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"group_id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "int32", "id": null, "_type": "Value"}, "scenario": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "justice", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 2516501, "num_examples": 21791, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 309427, "num_examples": 2704, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 2825928, "size_in_bytes": 38410952}, "utilitarianism": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Utilitarianism subset contains scenarios that should be ranked from most pleasant to least pleasant for the person in the scenario", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"activity": {"dtype": "string", "id": null, "_type": "Value"}, "baseline": {"dtype": "string", "id": null, "_type": "Value"}, "rating": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "utilitarianism", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 2241770, "num_examples": 13738, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 749768, "num_examples": 4808, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 2991538, "size_in_bytes": 38576562}, "virtue": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Virtue subset contains scenarios focusing on whether virtues or vices are being exemplified", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"group_id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "int32", "id": null, "_type": "Value"}, "scenario": {"dtype": "string", "id": null, "_type": "Value"}, "trait": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "virtue", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 2640328, "num_examples": 28245, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 473473, "num_examples": 4975, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 3113801, "size_in_bytes": 38698825}} diff --git a/lm_eval/datasets/hendrycks_ethics/hendrycks_ethics.py b/lm_eval/datasets/hendrycks_ethics/hendrycks_ethics.py index ca516c35a7..520f912e27 100644 --- a/lm_eval/datasets/hendrycks_ethics/hendrycks_ethics.py +++ b/lm_eval/datasets/hendrycks_ethics/hendrycks_ethics.py @@ -71,54 +71,64 @@ class HendrycksEthics(datasets.GeneratorBasedBuilder): EthicsConfig( name="commonsense", prefix="cm", - features=datasets.Features({ - "label": datasets.Value("int32"), - "input": datasets.Value("string"), - "is_short": datasets.Value("bool"), - "edited": datasets.Value("bool"), - }), - description="The Commonsense subset contains examples focusing on moral standards and principles that most people intuitively accept." 
+ features=datasets.Features( + { + "label": datasets.Value("int32"), + "input": datasets.Value("string"), + "is_short": datasets.Value("bool"), + "edited": datasets.Value("bool"), + } + ), + description="The Commonsense subset contains examples focusing on moral standards and principles that most people intuitively accept.", ), EthicsConfig( name="deontology", prefix="deontology", - features=datasets.Features({ - "group_id": datasets.Value("int32"), - "label": datasets.Value("int32"), - "scenario": datasets.Value("string"), - "excuse": datasets.Value("string"), - }), + features=datasets.Features( + { + "group_id": datasets.Value("int32"), + "label": datasets.Value("int32"), + "scenario": datasets.Value("string"), + "excuse": datasets.Value("string"), + } + ), description="The Deontology subset contains examples focusing on whether an act is required, permitted, or forbidden according to a set of rules or constraints", ), EthicsConfig( name="justice", prefix="justice", - features=datasets.Features({ - "group_id": datasets.Value("int32"), - "label": datasets.Value("int32"), - "scenario": datasets.Value("string"), - }), + features=datasets.Features( + { + "group_id": datasets.Value("int32"), + "label": datasets.Value("int32"), + "scenario": datasets.Value("string"), + } + ), description="The Justice subset contains examples focusing on how a character treats another person", ), EthicsConfig( name="utilitarianism", prefix="util", - features=datasets.Features({ - "activity": datasets.Value("string"), - "baseline": datasets.Value("string"), - "rating": datasets.Value("string"), # Empty rating. - }), + features=datasets.Features( + { + "activity": datasets.Value("string"), + "baseline": datasets.Value("string"), + "rating": datasets.Value("string"), # Empty rating. 
+ } + ), description="The Utilitarianism subset contains scenarios that should be ranked from most pleasant to least pleasant for the person in the scenario", ), EthicsConfig( name="virtue", prefix="virtue", - features=datasets.Features({ - "group_id": datasets.Value("int32"), - "label": datasets.Value("int32"), - "scenario": datasets.Value("string"), - "trait": datasets.Value("string"), - }), + features=datasets.Features( + { + "group_id": datasets.Value("int32"), + "label": datasets.Value("int32"), + "scenario": datasets.Value("string"), + "trait": datasets.Value("string"), + } + ), description="The Virtue subset contains scenarios focusing on whether virtues or vices are being exemplified", ), ] @@ -140,7 +150,12 @@ def _split_generators(self, dl_manager): name=datasets.Split.TRAIN, # These kwargs will be passed to _generate_examples gen_kwargs={ - "filepath": os.path.join(data_dir, "ethics", self.config.name, f"{self.config.prefix}_train.csv"), + "filepath": os.path.join( + data_dir, + "ethics", + self.config.name, + f"{self.config.prefix}_train.csv", + ), "split": "train", }, ), @@ -148,18 +163,22 @@ def _split_generators(self, dl_manager): name=datasets.Split.TEST, # These kwargs will be passed to _generate_examples gen_kwargs={ - "filepath": os.path.join(data_dir, "ethics", self.config.name, f"{self.config.prefix}_test.csv"), - "split": "test" + "filepath": os.path.join( + data_dir, + "ethics", + self.config.name, + f"{self.config.prefix}_test.csv", + ), + "split": "test", }, - ) + ), ] # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` def _generate_examples(self, filepath, split): - with open(filepath, newline='') as f: + with open(filepath, newline="") as f: if self.config.name == "utilitarianism": - contents = csv.DictReader( - f, fieldnames=['activity', "baseline"]) + contents = csv.DictReader(f, fieldnames=["activity", "baseline"]) else: contents = csv.DictReader(f) # For subsets with grouped scenarios, tag them with an id. diff --git a/lm_eval/datasets/hendrycks_math/dataset_infos.json b/lm_eval/datasets/hendrycks_math/dataset_infos.json index 0f85c51a39..27d154efa5 100644 --- a/lm_eval/datasets/hendrycks_math/dataset_infos.json +++ b/lm_eval/datasets/hendrycks_math/dataset_infos.json @@ -1 +1 @@ -{"algebra": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. 
Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "algebra", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 955021, "num_examples": 1744, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 648291, "num_examples": 1187, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1603312, "size_in_bytes": 21931248}, "counting_and_probability": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "counting_and_probability", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 667385, "num_examples": 771, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 353803, "num_examples": 474, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1021188, "size_in_bytes": 21349124}, "geometry": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. 
Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "geometry", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1077241, "num_examples": 870, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 523126, "num_examples": 479, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1600367, "size_in_bytes": 21928303}, "intermediate_algebra": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "intermediate_algebra", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1157476, "num_examples": 1295, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 795070, "num_examples": 903, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1952546, "size_in_bytes": 22280482}, "number_theory": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. 
Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "number_theory", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 595793, "num_examples": 869, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 349455, "num_examples": 540, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 945248, "size_in_bytes": 21273184}, "prealgebra": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "prealgebra", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 715611, "num_examples": 1205, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 510195, "num_examples": 871, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1225806, "size_in_bytes": 21553742}, "precalculus": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. 
Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "precalculus", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 816245, "num_examples": 746, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 552893, "num_examples": 546, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1369138, "size_in_bytes": 21697074}} \ No newline at end of file +{"algebra": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "algebra", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 955021, "num_examples": 1744, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 648291, "num_examples": 1187, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1603312, "size_in_bytes": 21931248}, "counting_and_probability": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. 
Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "counting_and_probability", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 667385, "num_examples": 771, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 353803, "num_examples": 474, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1021188, "size_in_bytes": 21349124}, "geometry": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "geometry", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1077241, "num_examples": 870, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 523126, "num_examples": 479, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1600367, "size_in_bytes": 21928303}, "intermediate_algebra": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. 
Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "intermediate_algebra", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1157476, "num_examples": 1295, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 795070, "num_examples": 903, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1952546, "size_in_bytes": 22280482}, "number_theory": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "number_theory", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 595793, "num_examples": 869, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 349455, "num_examples": 540, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 945248, "size_in_bytes": 21273184}, "prealgebra": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. 
Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "prealgebra", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 715611, "num_examples": 1205, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 510195, "num_examples": 871, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1225806, "size_in_bytes": 21553742}, "precalculus": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "precalculus", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 816245, "num_examples": 746, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 552893, "num_examples": 546, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1369138, "size_in_bytes": 21697074}} diff --git a/lm_eval/datasets/hendrycks_math/hendrycks_math.py b/lm_eval/datasets/hendrycks_math/hendrycks_math.py index 4d48ccfdcf..043adeeed6 100644 --- a/lm_eval/datasets/hendrycks_math/hendrycks_math.py +++ b/lm_eval/datasets/hendrycks_math/hendrycks_math.py @@ -44,13 +44,13 @@ _URLS = "https://people.eecs.berkeley.edu/~hendrycks/MATH.tar" _NAMES = [ - 'algebra', - 'counting_and_probability', - 'geometry', - 
'intermediate_algebra', - 'number_theory', - 'prealgebra', - 'precalculus', + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus", ] @@ -89,7 +89,9 @@ def _split_generators(self, dl_manager): name=datasets.Split.TRAIN, # These kwargs will be passed to _generate_examples gen_kwargs={ - "basepath": os.path.join(data_dir, "MATH", "train", self.config.name), + "basepath": os.path.join( + data_dir, "MATH", "train", self.config.name + ), "split": "train", }, ), @@ -97,8 +99,10 @@ def _split_generators(self, dl_manager): name=datasets.Split.TEST, # These kwargs will be passed to _generate_examples gen_kwargs={ - "basepath": os.path.join(data_dir, "MATH", "test", self.config.name), - "split": "test" + "basepath": os.path.join( + data_dir, "MATH", "test", self.config.name + ), + "split": "test", }, ), ] @@ -107,7 +111,7 @@ def _split_generators(self, dl_manager): def _generate_examples(self, basepath, split): key = 0 for file in sorted(pathlib.Path(basepath).iterdir()): - with open(file, "r", encoding='utf-8') as f: + with open(file, "r", encoding="utf-8") as f: data = json.load(f) yield key, { "problem": data["problem"], diff --git a/lm_eval/datasets/lambada/dataset_infos.json b/lm_eval/datasets/lambada/dataset_infos.json index 99f1a9dc40..855c6aef53 100644 --- a/lm_eval/datasets/lambada/dataset_infos.json +++ b/lm_eval/datasets/lambada/dataset_infos.json @@ -1 +1 @@ -{"original": {"description": "LAMBADA is a dataset to evaluate the capabilities of computational models for text\nunderstanding by means of a word prediction task. LAMBADA is a collection of narrative\ntexts sharing the characteristic that human subjects are able to guess their last\nword if they are exposed to the whole text, but not if they only see the last\nsentence preceding the target word. To succeed on LAMBADA, computational models\ncannot simply rely on local context, but must be able to keep track of information\nin the broader discourse.\n\nThe LAMBADA dataset", "citation": "@misc{\n author={Paperno, Denis and Kruszewski, Germ\u00e1n and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fern\u00e1ndez, Raquel}, \n title={The LAMBADA dataset},\n DOI={10.5281/zenodo.2630551},\n publisher={Zenodo},\n year={2016},\n month={Aug}\n}\n", "homepage": "https://zenodo.org/record/2630551#.X4Xzn5NKjUI", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "lambada", "config_name": "original", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 1709449, "num_examples": 5153, "dataset_name": "lambada"}}, "download_checksums": {"http://eaidata.bmk.sh/data/lambada_test.jsonl": {"num_bytes": 1819752, "checksum": "4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226"}}, "download_size": 1819752, "post_processing_size": null, "dataset_size": 1709449, "size_in_bytes": 3529201}, "en": {"description": "LAMBADA is a dataset to evaluate the capabilities of computational models for text\nunderstanding by means of a word prediction task. 
LAMBADA is a collection of narrative\ntexts sharing the characteristic that human subjects are able to guess their last\nword if they are exposed to the whole text, but not if they only see the last\nsentence preceding the target word. To succeed on LAMBADA, computational models\ncannot simply rely on local context, but must be able to keep track of information\nin the broader discourse.\n\nThe English translated LAMBADA dataset", "citation": "@misc{\n author={Paperno, Denis and Kruszewski, Germ\u00e1n and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fern\u00e1ndez, Raquel}, \n title={The LAMBADA dataset},\n DOI={10.5281/zenodo.2630551},\n publisher={Zenodo},\n year={2016},\n month={Aug}\n}\n", "homepage": "https://zenodo.org/record/2630551#.X4Xzn5NKjUI", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "lambada", "config_name": "en", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 1709449, "num_examples": 5153, "dataset_name": "lambada"}}, "download_checksums": {"http://eaidata.bmk.sh/data/lambada_test_en.jsonl": {"num_bytes": 1819752, "checksum": "4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226"}}, "download_size": 1819752, "post_processing_size": null, "dataset_size": 1709449, "size_in_bytes": 3529201}, "fr": {"description": "LAMBADA is a dataset to evaluate the capabilities of computational models for text\nunderstanding by means of a word prediction task. LAMBADA is a collection of narrative\ntexts sharing the characteristic that human subjects are able to guess their last\nword if they are exposed to the whole text, but not if they only see the last\nsentence preceding the target word. To succeed on LAMBADA, computational models\ncannot simply rely on local context, but must be able to keep track of information\nin the broader discourse.\n\nThe French translated LAMBADA dataset", "citation": "@misc{\n author={Paperno, Denis and Kruszewski, Germ\u00e1n and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fern\u00e1ndez, Raquel}, \n title={The LAMBADA dataset},\n DOI={10.5281/zenodo.2630551},\n publisher={Zenodo},\n year={2016},\n month={Aug}\n}\n", "homepage": "https://zenodo.org/record/2630551#.X4Xzn5NKjUI", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "lambada", "config_name": "fr", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 1948795, "num_examples": 5153, "dataset_name": "lambada"}}, "download_checksums": {"http://eaidata.bmk.sh/data/lambada_test_fr.jsonl": {"num_bytes": 2028703, "checksum": "941ec6a73dba7dc91c860bf493eb66a527cd430148827a4753a4535a046bf362"}}, "download_size": 2028703, "post_processing_size": null, "dataset_size": 1948795, "size_in_bytes": 3977498}, "de": {"description": "LAMBADA is a dataset to evaluate the capabilities of computational models for text\nunderstanding by means of a word prediction task. 
LAMBADA is a collection of narrative\ntexts sharing the characteristic that human subjects are able to guess their last\nword if they are exposed to the whole text, but not if they only see the last\nsentence preceding the target word. To succeed on LAMBADA, computational models\ncannot simply rely on local context, but must be able to keep track of information\nin the broader discourse.\n\nThe German translated LAMBADA dataset", "citation": "@misc{\n author={Paperno, Denis and Kruszewski, Germ\u00e1n and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fern\u00e1ndez, Raquel}, \n title={The LAMBADA dataset},\n DOI={10.5281/zenodo.2630551},\n publisher={Zenodo},\n year={2016},\n month={Aug}\n}\n", "homepage": "https://zenodo.org/record/2630551#.X4Xzn5NKjUI", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "lambada", "config_name": "de", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 1904576, "num_examples": 5153, "dataset_name": "lambada"}}, "download_checksums": {"http://eaidata.bmk.sh/data/lambada_test_de.jsonl": {"num_bytes": 1985231, "checksum": "51c6c1795894c46e88e4c104b5667f488efe79081fb34d746b82b8caa663865e"}}, "download_size": 1985231, "post_processing_size": null, "dataset_size": 1904576, "size_in_bytes": 3889807}, "it": {"description": "LAMBADA is a dataset to evaluate the capabilities of computational models for text\nunderstanding by means of a word prediction task. LAMBADA is a collection of narrative\ntexts sharing the characteristic that human subjects are able to guess their last\nword if they are exposed to the whole text, but not if they only see the last\nsentence preceding the target word. To succeed on LAMBADA, computational models\ncannot simply rely on local context, but must be able to keep track of information\nin the broader discourse.\n\nThe Italian translated LAMBADA dataset", "citation": "@misc{\n author={Paperno, Denis and Kruszewski, Germ\u00e1n and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fern\u00e1ndez, Raquel}, \n title={The LAMBADA dataset},\n DOI={10.5281/zenodo.2630551},\n publisher={Zenodo},\n year={2016},\n month={Aug}\n}\n", "homepage": "https://zenodo.org/record/2630551#.X4Xzn5NKjUI", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "lambada", "config_name": "it", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 1813420, "num_examples": 5153, "dataset_name": "lambada"}}, "download_checksums": {"http://eaidata.bmk.sh/data/lambada_test_it.jsonl": {"num_bytes": 1894613, "checksum": "86654237716702ab74f42855ae5a78455c1b0e50054a4593fb9c6fcf7fad0850"}}, "download_size": 1894613, "post_processing_size": null, "dataset_size": 1813420, "size_in_bytes": 3708033}, "es": {"description": "LAMBADA is a dataset to evaluate the capabilities of computational models for text\nunderstanding by means of a word prediction task. 
LAMBADA is a collection of narrative\ntexts sharing the characteristic that human subjects are able to guess their last\nword if they are exposed to the whole text, but not if they only see the last\nsentence preceding the target word. To succeed on LAMBADA, computational models\ncannot simply rely on local context, but must be able to keep track of information\nin the broader discourse.\n\nThe Spanish translated LAMBADA dataset", "citation": "@misc{\n author={Paperno, Denis and Kruszewski, Germ\u00e1n and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fern\u00e1ndez, Raquel}, \n title={The LAMBADA dataset},\n DOI={10.5281/zenodo.2630551},\n publisher={Zenodo},\n year={2016},\n month={Aug}\n}\n", "homepage": "https://zenodo.org/record/2630551#.X4Xzn5NKjUI", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "lambada", "config_name": "es", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 1821735, "num_examples": 5153, "dataset_name": "lambada"}}, "download_checksums": {"http://eaidata.bmk.sh/data/lambada_test_es.jsonl": {"num_bytes": 1902349, "checksum": "ffd760026c647fb43c67ce1bc56fd527937304b348712dce33190ea6caba6f9c"}}, "download_size": 1902349, "post_processing_size": null, "dataset_size": 1821735, "size_in_bytes": 3724084}} \ No newline at end of file +{"original": {"description": "LAMBADA is a dataset to evaluate the capabilities of computational models for text\nunderstanding by means of a word prediction task. LAMBADA is a collection of narrative\ntexts sharing the characteristic that human subjects are able to guess their last\nword if they are exposed to the whole text, but not if they only see the last\nsentence preceding the target word. To succeed on LAMBADA, computational models\ncannot simply rely on local context, but must be able to keep track of information\nin the broader discourse.\n\nThe LAMBADA dataset", "citation": "@misc{\n author={Paperno, Denis and Kruszewski, Germ\u00e1n and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fern\u00e1ndez, Raquel}, \n title={The LAMBADA dataset},\n DOI={10.5281/zenodo.2630551},\n publisher={Zenodo},\n year={2016},\n month={Aug}\n}\n", "homepage": "https://zenodo.org/record/2630551#.X4Xzn5NKjUI", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "lambada", "config_name": "original", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 1709449, "num_examples": 5153, "dataset_name": "lambada"}}, "download_checksums": {"http://eaidata.bmk.sh/data/lambada_test.jsonl": {"num_bytes": 1819752, "checksum": "4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226"}}, "download_size": 1819752, "post_processing_size": null, "dataset_size": 1709449, "size_in_bytes": 3529201}, "en": {"description": "LAMBADA is a dataset to evaluate the capabilities of computational models for text\nunderstanding by means of a word prediction task. 
LAMBADA is a collection of narrative\ntexts sharing the characteristic that human subjects are able to guess their last\nword if they are exposed to the whole text, but not if they only see the last\nsentence preceding the target word. To succeed on LAMBADA, computational models\ncannot simply rely on local context, but must be able to keep track of information\nin the broader discourse.\n\nThe English translated LAMBADA dataset", "citation": "@misc{\n author={Paperno, Denis and Kruszewski, Germ\u00e1n and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fern\u00e1ndez, Raquel}, \n title={The LAMBADA dataset},\n DOI={10.5281/zenodo.2630551},\n publisher={Zenodo},\n year={2016},\n month={Aug}\n}\n", "homepage": "https://zenodo.org/record/2630551#.X4Xzn5NKjUI", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "lambada", "config_name": "en", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 1709449, "num_examples": 5153, "dataset_name": "lambada"}}, "download_checksums": {"http://eaidata.bmk.sh/data/lambada_test_en.jsonl": {"num_bytes": 1819752, "checksum": "4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226"}}, "download_size": 1819752, "post_processing_size": null, "dataset_size": 1709449, "size_in_bytes": 3529201}, "fr": {"description": "LAMBADA is a dataset to evaluate the capabilities of computational models for text\nunderstanding by means of a word prediction task. LAMBADA is a collection of narrative\ntexts sharing the characteristic that human subjects are able to guess their last\nword if they are exposed to the whole text, but not if they only see the last\nsentence preceding the target word. To succeed on LAMBADA, computational models\ncannot simply rely on local context, but must be able to keep track of information\nin the broader discourse.\n\nThe French translated LAMBADA dataset", "citation": "@misc{\n author={Paperno, Denis and Kruszewski, Germ\u00e1n and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fern\u00e1ndez, Raquel}, \n title={The LAMBADA dataset},\n DOI={10.5281/zenodo.2630551},\n publisher={Zenodo},\n year={2016},\n month={Aug}\n}\n", "homepage": "https://zenodo.org/record/2630551#.X4Xzn5NKjUI", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "lambada", "config_name": "fr", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 1948795, "num_examples": 5153, "dataset_name": "lambada"}}, "download_checksums": {"http://eaidata.bmk.sh/data/lambada_test_fr.jsonl": {"num_bytes": 2028703, "checksum": "941ec6a73dba7dc91c860bf493eb66a527cd430148827a4753a4535a046bf362"}}, "download_size": 2028703, "post_processing_size": null, "dataset_size": 1948795, "size_in_bytes": 3977498}, "de": {"description": "LAMBADA is a dataset to evaluate the capabilities of computational models for text\nunderstanding by means of a word prediction task. 
LAMBADA is a collection of narrative\ntexts sharing the characteristic that human subjects are able to guess their last\nword if they are exposed to the whole text, but not if they only see the last\nsentence preceding the target word. To succeed on LAMBADA, computational models\ncannot simply rely on local context, but must be able to keep track of information\nin the broader discourse.\n\nThe German translated LAMBADA dataset", "citation": "@misc{\n author={Paperno, Denis and Kruszewski, Germ\u00e1n and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fern\u00e1ndez, Raquel}, \n title={The LAMBADA dataset},\n DOI={10.5281/zenodo.2630551},\n publisher={Zenodo},\n year={2016},\n month={Aug}\n}\n", "homepage": "https://zenodo.org/record/2630551#.X4Xzn5NKjUI", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "lambada", "config_name": "de", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 1904576, "num_examples": 5153, "dataset_name": "lambada"}}, "download_checksums": {"http://eaidata.bmk.sh/data/lambada_test_de.jsonl": {"num_bytes": 1985231, "checksum": "51c6c1795894c46e88e4c104b5667f488efe79081fb34d746b82b8caa663865e"}}, "download_size": 1985231, "post_processing_size": null, "dataset_size": 1904576, "size_in_bytes": 3889807}, "it": {"description": "LAMBADA is a dataset to evaluate the capabilities of computational models for text\nunderstanding by means of a word prediction task. LAMBADA is a collection of narrative\ntexts sharing the characteristic that human subjects are able to guess their last\nword if they are exposed to the whole text, but not if they only see the last\nsentence preceding the target word. To succeed on LAMBADA, computational models\ncannot simply rely on local context, but must be able to keep track of information\nin the broader discourse.\n\nThe Italian translated LAMBADA dataset", "citation": "@misc{\n author={Paperno, Denis and Kruszewski, Germ\u00e1n and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fern\u00e1ndez, Raquel}, \n title={The LAMBADA dataset},\n DOI={10.5281/zenodo.2630551},\n publisher={Zenodo},\n year={2016},\n month={Aug}\n}\n", "homepage": "https://zenodo.org/record/2630551#.X4Xzn5NKjUI", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "lambada", "config_name": "it", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 1813420, "num_examples": 5153, "dataset_name": "lambada"}}, "download_checksums": {"http://eaidata.bmk.sh/data/lambada_test_it.jsonl": {"num_bytes": 1894613, "checksum": "86654237716702ab74f42855ae5a78455c1b0e50054a4593fb9c6fcf7fad0850"}}, "download_size": 1894613, "post_processing_size": null, "dataset_size": 1813420, "size_in_bytes": 3708033}, "es": {"description": "LAMBADA is a dataset to evaluate the capabilities of computational models for text\nunderstanding by means of a word prediction task. 
LAMBADA is a collection of narrative\ntexts sharing the characteristic that human subjects are able to guess their last\nword if they are exposed to the whole text, but not if they only see the last\nsentence preceding the target word. To succeed on LAMBADA, computational models\ncannot simply rely on local context, but must be able to keep track of information\nin the broader discourse.\n\nThe Spanish translated LAMBADA dataset", "citation": "@misc{\n author={Paperno, Denis and Kruszewski, Germ\u00e1n and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fern\u00e1ndez, Raquel}, \n title={The LAMBADA dataset},\n DOI={10.5281/zenodo.2630551},\n publisher={Zenodo},\n year={2016},\n month={Aug}\n}\n", "homepage": "https://zenodo.org/record/2630551#.X4Xzn5NKjUI", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "lambada", "config_name": "es", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 1821735, "num_examples": 5153, "dataset_name": "lambada"}}, "download_checksums": {"http://eaidata.bmk.sh/data/lambada_test_es.jsonl": {"num_bytes": 1902349, "checksum": "ffd760026c647fb43c67ce1bc56fd527937304b348712dce33190ea6caba6f9c"}}, "download_size": 1902349, "post_processing_size": null, "dataset_size": 1821735, "size_in_bytes": 3724084}} diff --git a/lm_eval/datasets/lambada/lambada.py b/lm_eval/datasets/lambada/lambada.py index 37debf2a88..506bef8279 100644 --- a/lm_eval/datasets/lambada/lambada.py +++ b/lm_eval/datasets/lambada/lambada.py @@ -22,7 +22,7 @@ _CITATION = """\ @misc{ - author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel}, + author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel}, title={The LAMBADA dataset}, DOI={10.5281/zenodo.2630551}, publisher={Zenodo}, @@ -62,12 +62,34 @@ class Lambada(datasets.GeneratorBasedBuilder): VERSION = datasets.Version("0.0.1") BUILDER_CONFIGS = [ - datasets.BuilderConfig(name="original", version=VERSION, description="The LAMBADA dataset"), - datasets.BuilderConfig(name="en", version=VERSION, description="The English translated LAMBADA dataset"), - datasets.BuilderConfig(name="fr", version=VERSION, description="The French translated LAMBADA dataset"), - datasets.BuilderConfig(name="de", version=VERSION, description="The German translated LAMBADA dataset"), - datasets.BuilderConfig(name="it", version=VERSION, description="The Italian translated LAMBADA dataset"), - datasets.BuilderConfig(name="es", version=VERSION, description="The Spanish translated LAMBADA dataset"), + datasets.BuilderConfig( + name="original", version=VERSION, description="The LAMBADA dataset" + ), + datasets.BuilderConfig( + name="en", + version=VERSION, + description="The English translated LAMBADA dataset", + ), + datasets.BuilderConfig( + name="fr", + version=VERSION, + description="The French translated LAMBADA dataset", + ), + datasets.BuilderConfig( + name="de", + version=VERSION, + description="The German translated LAMBADA dataset", + ), + datasets.BuilderConfig( + name="it", + version=VERSION, + 
description="The Italian translated LAMBADA dataset", + ), + datasets.BuilderConfig( + name="es", + version=VERSION, + description="The Spanish translated LAMBADA dataset", + ), ] DEFAULT_CONFIG_NAME = "original" @@ -105,6 +127,4 @@ def _generate_examples(self, filepath, split): with open(filepath, encoding="utf-8") as f: for key, row in enumerate(f): data = json.loads(row) - yield key, { - "text": data["text"] - } + yield key, {"text": data["text"]} diff --git a/lm_eval/datasets/logiqa/dataset_infos.json b/lm_eval/datasets/logiqa/dataset_infos.json index ddf1853590..12a203cb05 100644 --- a/lm_eval/datasets/logiqa/dataset_infos.json +++ b/lm_eval/datasets/logiqa/dataset_infos.json @@ -1 +1 @@ -{"logiqa": {"description": "LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA\ninstances, covering multiple types of deductive reasoning. Results show that state-\nof-the-art neural models perform by far worse than human ceiling. The dataset can\nalso serve as a benchmark for reinvestigating logical AI under the deep learning\nNLP setting.\n", "citation": "@misc{liu2020logiqa,\n title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, \n author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},\n year={2020},\n eprint={2007.08124},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/lgw863/LogiQA-dataset", "license": "", "features": {"label": {"dtype": "string", "id": null, "_type": "Value"}, "context": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "logiqa", "config_name": "logiqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 6419852, "num_examples": 7376, "dataset_name": "logiqa"}, "test": {"name": "test", "num_bytes": 571705, "num_examples": 651, "dataset_name": "logiqa"}, "validation": {"name": "validation", "num_bytes": 562437, "num_examples": 651, "dataset_name": "logiqa"}}, "download_checksums": {"https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Train.txt": {"num_bytes": 6281272, "checksum": "7d5bb1f58278e33b395744cd2ad8d7600faa0b3c4d615c659a44ec1181d759fa"}, "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Test.txt": {"num_bytes": 559060, "checksum": "359acb78c37802208f7fde9e2f6574b8526527c63d6a336f90a53f1932cb4701"}, "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Eval.txt": {"num_bytes": 550021, "checksum": "4c49e6753b7262c001506b9151135abf722247035ab075dad93acdea5789c01f"}}, "download_size": 7390353, "post_processing_size": null, "dataset_size": 7553994, "size_in_bytes": 14944347}} \ No newline at end of file +{"logiqa": {"description": "LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA\ninstances, covering multiple types of deductive reasoning. Results show that state-\nof-the-art neural models perform by far worse than human ceiling. 
The dataset can\nalso serve as a benchmark for reinvestigating logical AI under the deep learning\nNLP setting.\n", "citation": "@misc{liu2020logiqa,\n title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, \n author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},\n year={2020},\n eprint={2007.08124},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/lgw863/LogiQA-dataset", "license": "", "features": {"label": {"dtype": "string", "id": null, "_type": "Value"}, "context": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "logiqa", "config_name": "logiqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 6419852, "num_examples": 7376, "dataset_name": "logiqa"}, "test": {"name": "test", "num_bytes": 571705, "num_examples": 651, "dataset_name": "logiqa"}, "validation": {"name": "validation", "num_bytes": 562437, "num_examples": 651, "dataset_name": "logiqa"}}, "download_checksums": {"https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Train.txt": {"num_bytes": 6281272, "checksum": "7d5bb1f58278e33b395744cd2ad8d7600faa0b3c4d615c659a44ec1181d759fa"}, "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Test.txt": {"num_bytes": 559060, "checksum": "359acb78c37802208f7fde9e2f6574b8526527c63d6a336f90a53f1932cb4701"}, "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Eval.txt": {"num_bytes": 550021, "checksum": "4c49e6753b7262c001506b9151135abf722247035ab075dad93acdea5789c01f"}}, "download_size": 7390353, "post_processing_size": null, "dataset_size": 7553994, "size_in_bytes": 14944347}} diff --git a/lm_eval/datasets/logiqa/logiqa.py b/lm_eval/datasets/logiqa/logiqa.py index a8d04f84dd..b1f5521596 100644 --- a/lm_eval/datasets/logiqa/logiqa.py +++ b/lm_eval/datasets/logiqa/logiqa.py @@ -19,7 +19,7 @@ _CITATION = """\ @misc{liu2020logiqa, - title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, + title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang}, year={2020}, eprint={2007.08124}, @@ -54,7 +54,9 @@ class Logiqa(datasets.GeneratorBasedBuilder): VERSION = datasets.Version("0.0.1") BUILDER_CONFIGS = [ - datasets.BuilderConfig(name="logiqa", version=VERSION, description="The LogiQA dataset."), + datasets.BuilderConfig( + name="logiqa", version=VERSION, description="The LogiQA dataset." 
+ ), ] def _info(self): @@ -63,9 +65,7 @@ def _info(self): "label": datasets.Value("string"), "context": datasets.Value("string"), "question": datasets.Value("string"), - "options": datasets.features.Sequence( - datasets.Value("string") - ), + "options": datasets.features.Sequence(datasets.Value("string")), } ) return datasets.DatasetInfo( @@ -77,7 +77,11 @@ def _info(self): ) def _split_generators(self, dl_manager): - urls = {"train": _URLS["train"], "test": _URLS["test"], "validation": _URLS["validation"]} + urls = { + "train": _URLS["train"], + "test": _URLS["test"], + "validation": _URLS["validation"], + } data_dir = dl_manager.download_and_extract(urls) return [ datasets.SplitGenerator( @@ -91,10 +95,7 @@ def _split_generators(self, dl_manager): datasets.SplitGenerator( name=datasets.Split.TEST, # These kwargs will be passed to _generate_examples - gen_kwargs={ - "filepath": data_dir["test"], - "split": "test" - }, + gen_kwargs={"filepath": data_dir["test"], "split": "test"}, ), datasets.SplitGenerator( name=datasets.Split.VALIDATION, @@ -110,6 +111,7 @@ def _split_generators(self, dl_manager): def _generate_examples(self, filepath, split): def normalize(text): return text.replace(".", ". ").strip() + with open(filepath, encoding="utf-8") as f: data = f.read().strip().split("\n\n") for key, row in enumerate(data): diff --git a/lm_eval/datasets/mutual/dataset_infos.json b/lm_eval/datasets/mutual/dataset_infos.json index 9ea2ac2de8..f8c438b3f8 100644 --- a/lm_eval/datasets/mutual/dataset_infos.json +++ b/lm_eval/datasets/mutual/dataset_infos.json @@ -1 +1 @@ -{"mutual": {"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nThe MuTual dataset.", "citation": "@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mutual", "config_name": "mutual", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 5141602, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 634396, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 624271, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6400269, "size_in_bytes": 17398147}, "mutual_plus": {"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening 
comprehension test data.\n\nMuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses.", "citation": "@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mutual", "config_name": "mutual_plus", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 4921179, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 606620, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 597340, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6125139, "size_in_bytes": 17123017}} \ No newline at end of file +{"mutual": {"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nThe MuTual dataset.", "citation": "@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mutual", "config_name": "mutual", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 5141602, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 634396, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 624271, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6400269, "size_in_bytes": 17398147}, "mutual_plus": 
{"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nMuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses.", "citation": "@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mutual", "config_name": "mutual_plus", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 4921179, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 606620, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 597340, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6125139, "size_in_bytes": 17123017}} diff --git a/lm_eval/datasets/mutual/mutual.py b/lm_eval/datasets/mutual/mutual.py index 87453985b7..c519e663be 100644 --- a/lm_eval/datasets/mutual/mutual.py +++ b/lm_eval/datasets/mutual/mutual.py @@ -50,8 +50,14 @@ class Mutual(datasets.GeneratorBasedBuilder): VERSION = datasets.Version("0.0.1") BUILDER_CONFIGS = [ - datasets.BuilderConfig(name="mutual", version=VERSION, description="The MuTual dataset."), - datasets.BuilderConfig(name="mutual_plus", version=VERSION, description="MuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses."), + datasets.BuilderConfig( + name="mutual", version=VERSION, description="The MuTual dataset." 
+ ), + datasets.BuilderConfig( + name="mutual_plus", + version=VERSION, + description="MuTualPlus is a more difficult MuTual that replaces positive responses with a safe response.", + ), ] def _info(self): @@ -79,7 +85,9 @@ def _split_generators(self, dl_manager): name=datasets.Split.TRAIN, # These kwargs will be passed to _generate_examples gen_kwargs={ - "basepath": os.path.join(data_dir, "MuTual-master", "data", self.config.name, "train"), + "basepath": os.path.join( + data_dir, "MuTual-master", "data", self.config.name, "train" + ), "split": "train", }, ), @@ -87,7 +95,9 @@ name=datasets.Split.TEST, # These kwargs will be passed to _generate_examples gen_kwargs={ - "basepath": os.path.join(data_dir, "MuTual-master", "data", self.config.name, "test"), + "basepath": os.path.join( + data_dir, "MuTual-master", "data", self.config.name, "test" + ), "split": "test", }, ), @@ -95,7 +105,9 @@ name=datasets.Split.VALIDATION, # These kwargs will be passed to _generate_examples gen_kwargs={ - "basepath": os.path.join(data_dir, "MuTual-master", "data", self.config.name, "dev"), + "basepath": os.path.join( + data_dir, "MuTual-master", "data", self.config.name, "dev" + ), "split": "dev", }, ), @@ -109,7 +121,7 @@ def _generate_examples(self, basepath, split): for file in sorted(Path(basepath).iterdir()): if file.suffix != ".txt": continue - with open(file, "r", encoding='utf-8') as f: + with open(file, "r", encoding="utf-8") as f: data_str = f.read() # Ignore the occasional empty file. if not data_str: diff --git a/lm_eval/datasets/pile/dataset_infos.json b/lm_eval/datasets/pile/dataset_infos.json index edd9885712..d91e9b5704 100644 --- a/lm_eval/datasets/pile/dataset_infos.json +++ b/lm_eval/datasets/pile/dataset_infos.json @@ -1 +1 @@ -{"pile_arxiv": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nArXiv", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_arxiv", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 113218251, "num_examples": 2407, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 115653720, "num_examples": 2434, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 228871971, "size_in_bytes": 1160030307}, "pile_books3": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nBooks3", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_books3", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 150095743, "num_examples": 269, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 177359876, "num_examples": 301, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 327455619, "size_in_bytes": 1258613955}, "pile_bookcorpus2": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nBookCorpus2", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_bookcorpus2", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 9680652, "num_examples": 28, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 9776271, "num_examples": 26, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 19456923, "size_in_bytes": 950615259}, "pile_dm-mathematics": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nDM Mathematics", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_dm-mathematics", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 15756556, "num_examples": 1922, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 16453386, "num_examples": 2007, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 32209942, "size_in_bytes": 963368278}, "pile_enron": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nEnron Emails", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_enron", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 1638859, "num_examples": 1010, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 1556487, "num_examples": 947, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 3195346, "size_in_bytes": 934353682}, "pile_europarl": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nEuroParl", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_europarl", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 8789652, "num_examples": 157, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 9111791, "num_examples": 133, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 17901443, "size_in_bytes": 949059779}, "pile_freelaw": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nFreeLaw", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_freelaw", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 80808693, "num_examples": 5101, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 80363814, "num_examples": 5094, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 161172507, "size_in_bytes": 1092330843}, "pile_github": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nGithub", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_github", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 95654706, "num_examples": 18195, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 97179576, "num_examples": 18337, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 192834282, "size_in_bytes": 1123992618}, "pile_gutenberg": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nGutenberg (PG-19)", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_gutenberg", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 30243176, "num_examples": 80, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 24685980, "num_examples": 60, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 54929156, "size_in_bytes": 986087492}, "pile_hackernews": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nHackerNews", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_hackernews", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 8124255, "num_examples": 1632, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 9803822, "num_examples": 1619, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 17928077, "size_in_bytes": 949086413}, "pile_nih-exporter": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nNIH ExPorter", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_nih-exporter", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 3928804, "num_examples": 1884, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 3927967, "num_examples": 1825, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 7856771, "size_in_bytes": 939015107}, "pile_opensubtitles": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nOpenSubtitles", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_opensubtitles", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 21008996, "num_examples": 642, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 19622904, "num_examples": 621, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 40631900, "size_in_bytes": 971790236}, "pile_openwebtext2": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nOpenWebText2", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_openwebtext2", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 128624303, "num_examples": 32925, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 131554302, "num_examples": 33400, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 260178605, "size_in_bytes": 1191336941}, "pile_philpapers": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPhilPapers", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_philpapers", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 5090158, "num_examples": 68, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 6499078, "num_examples": 64, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 11589236, "size_in_bytes": 942747572}, "pile_pile-cc": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPile-CC", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_pile-cc", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 235004043, "num_examples": 52790, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 233535650, "num_examples": 52792, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 468539693, "size_in_bytes": 1399698029}, "pile_pubmed-abstracts": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPubMed Abstracts", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_pubmed-abstracts", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 39908950, "num_examples": 29895, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 40008336, "num_examples": 29871, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 79917286, "size_in_bytes": 1011075622}, "pile_pubmed-central": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPubMed Central", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_pubmed-central", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 187251519, "num_examples": 5911, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 184791818, "num_examples": 5977, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 372043337, "size_in_bytes": 1303201673}, "pile_stackexchange": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nStackExchange", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_stackexchange", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 66441557, "num_examples": 30378, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 66011397, "num_examples": 29950, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 132452954, "size_in_bytes": 1063611290}, "pile_upsto": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nUSPTO Backgrounds", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_upsto", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 47345405, "num_examples": 11415, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 48122320, "num_examples": 11387, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 95467725, "size_in_bytes": 1026626061}, "pile_ubuntu-irc": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nUbuntu IRC", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_ubuntu-irc", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 5694218, "num_examples": 22, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 7410104, "num_examples": 21, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 13104322, "size_in_bytes": 944262658}, "pile_wikipedia": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nWikipedia (en)", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_wikipedia", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 52166968, "num_examples": 17511, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 53186137, "num_examples": 17478, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 105353105, "size_in_bytes": 1036511441}, "pile_youtubesubtitles": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nYoutubeSubtitles", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_youtubesubtitles", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 7377448, "num_examples": 342, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 8937546, "num_examples": 326, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 16314994, "size_in_bytes": 947473330}} \ No newline at end of file +{"pile_arxiv": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nArXiv", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_arxiv", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 113218251, "num_examples": 2407, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 115653720, "num_examples": 2434, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 228871971, "size_in_bytes": 1160030307}, "pile_books3": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nBooks3", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_books3", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 150095743, "num_examples": 269, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 177359876, "num_examples": 301, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 327455619, "size_in_bytes": 1258613955}, "pile_bookcorpus2": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nBookCorpus2", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_bookcorpus2", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 9680652, "num_examples": 28, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 9776271, "num_examples": 26, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 19456923, "size_in_bytes": 950615259}, "pile_dm-mathematics": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nDM Mathematics", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_dm-mathematics", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 15756556, "num_examples": 1922, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 16453386, "num_examples": 2007, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 32209942, "size_in_bytes": 963368278}, "pile_enron": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nEnron Emails", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_enron", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 1638859, "num_examples": 1010, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 1556487, "num_examples": 947, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 3195346, "size_in_bytes": 934353682}, "pile_europarl": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nEuroParl", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_europarl", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 8789652, "num_examples": 157, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 9111791, "num_examples": 133, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 17901443, "size_in_bytes": 949059779}, "pile_freelaw": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nFreeLaw", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_freelaw", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 80808693, "num_examples": 5101, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 80363814, "num_examples": 5094, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 161172507, "size_in_bytes": 1092330843}, "pile_github": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nGithub", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_github", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 95654706, "num_examples": 18195, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 97179576, "num_examples": 18337, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 192834282, "size_in_bytes": 1123992618}, "pile_gutenberg": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nGutenberg (PG-19)", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_gutenberg", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 30243176, "num_examples": 80, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 24685980, "num_examples": 60, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 54929156, "size_in_bytes": 986087492}, "pile_hackernews": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nHackerNews", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_hackernews", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 8124255, "num_examples": 1632, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 9803822, "num_examples": 1619, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 17928077, "size_in_bytes": 949086413}, "pile_nih-exporter": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nNIH ExPorter", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_nih-exporter", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 3928804, "num_examples": 1884, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 3927967, "num_examples": 1825, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 7856771, "size_in_bytes": 939015107}, "pile_opensubtitles": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nOpenSubtitles", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_opensubtitles", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 21008996, "num_examples": 642, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 19622904, "num_examples": 621, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 40631900, "size_in_bytes": 971790236}, "pile_openwebtext2": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nOpenWebText2", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_openwebtext2", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 128624303, "num_examples": 32925, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 131554302, "num_examples": 33400, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 260178605, "size_in_bytes": 1191336941}, "pile_philpapers": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPhilPapers", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_philpapers", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 5090158, "num_examples": 68, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 6499078, "num_examples": 64, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 11589236, "size_in_bytes": 942747572}, "pile_pile-cc": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPile-CC", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_pile-cc", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 235004043, "num_examples": 52790, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 233535650, "num_examples": 52792, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 468539693, "size_in_bytes": 1399698029}, "pile_pubmed-abstracts": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPubMed Abstracts", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_pubmed-abstracts", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 39908950, "num_examples": 29895, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 40008336, "num_examples": 29871, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 79917286, "size_in_bytes": 1011075622}, "pile_pubmed-central": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPubMed Central", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_pubmed-central", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 187251519, "num_examples": 5911, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 184791818, "num_examples": 5977, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 372043337, "size_in_bytes": 1303201673}, "pile_stackexchange": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nStackExchange", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_stackexchange", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 66441557, "num_examples": 30378, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 66011397, "num_examples": 29950, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 132452954, "size_in_bytes": 1063611290}, "pile_upsto": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nUSPTO Backgrounds", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_upsto", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 47345405, "num_examples": 11415, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 48122320, "num_examples": 11387, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 95467725, "size_in_bytes": 1026626061}, "pile_ubuntu-irc": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nUbuntu IRC", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_ubuntu-irc", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 5694218, "num_examples": 22, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 7410104, "num_examples": 21, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 13104322, "size_in_bytes": 944262658}, "pile_wikipedia": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nWikipedia (en)", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_wikipedia", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 52166968, "num_examples": 17511, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 53186137, "num_examples": 17478, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 105353105, "size_in_bytes": 1036511441}, "pile_youtubesubtitles": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nYoutubeSubtitles", "citation": "@article{pile,\n    title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n    author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n    journal={arXiv preprint arXiv:2101.00027},\n    year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_youtubesubtitles", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 7377448, "num_examples": 342, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 8937546, "num_examples": 326, "dataset_name": "pile"}}, "download_checksums": {"http://eaidata.bmk.sh/data/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "http://eaidata.bmk.sh/data/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 16314994, "size_in_bytes": 947473330}}
diff --git a/lm_eval/datasets/pile/pile.py b/lm_eval/datasets/pile/pile.py
index 728ffb2930..208f453656 100644
--- a/lm_eval/datasets/pile/pile.py
+++ b/lm_eval/datasets/pile/pile.py
@@ -103,10 +103,7 @@ def _split_generators(self, dl_manager):
             datasets.SplitGenerator(
                 name=datasets.Split.TEST,
                 # These kwargs will be passed to _generate_examples
-                gen_kwargs={
-                    "filepath": data_dir["test"],
-                    "split": "test"
-                },
+                gen_kwargs={"filepath": data_dir["test"], "split": "test"},
             ),
             datasets.SplitGenerator(
                 name=datasets.Split.VALIDATION,
diff --git a/lm_eval/datasets/quac/dataset_infos.json b/lm_eval/datasets/quac/dataset_infos.json
index 56ebe7a2f5..86fe853167 100644
--- a/lm_eval/datasets/quac/dataset_infos.json
+++ b/lm_eval/datasets/quac/dataset_infos.json
@@ -1 +1 @@
-{"quac": {"description": "Question Answering in Context (QuAC) is a dataset for modeling, understanding, and \nparticipating in information seeking dialog. Data instances consist of an interactive\ndialog between two crowd workers: (1) a student who poses a sequence of freeform\nquestions to learn as much as possible about a hidden Wikipedia text, and (2)\na teacher who answers the questions by providing short excerpts (spans) from the text.\n", "citation": "@article{choi2018quac,\n  title={Quac: Question answering in context},\n  author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke},\n  journal={arXiv preprint arXiv:1808.07036},\n  year={2018}\n}\n", "homepage": "https://quac.ai/", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "section_title": {"dtype": "string", "id": null, "_type": "Value"}, "paragraph": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "quac", "config_name": "quac", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 212391958, "num_examples": 83568, "dataset_name": "quac"}, "validation": {"name": "validation", "num_bytes": 20678483, "num_examples": 7354, "dataset_name": "quac"}}, "download_checksums": {"https://s3.amazonaws.com/my89public/quac/train_v0.2.json": {"num_bytes": 68114819, "checksum": "ff5cca5a2e4b4d1cb5b5ced68b9fce88394ef6d93117426d6d4baafbcc05c56a"}, "https://s3.amazonaws.com/my89public/quac/val_v0.2.json": {"num_bytes": 8929167, "checksum": "09e622916280ba04c9352acb1bc5bbe80f11a2598f6f34e934c51d9e6570f378"}}, "download_size": 77043986, "post_processing_size": null, "dataset_size": 233070441, "size_in_bytes": 310114427}}
\ No newline at end of file
+{"quac": {"description": "Question Answering in Context (QuAC) is a dataset for modeling, understanding, and \nparticipating in information seeking dialog. Data instances consist of an interactive\ndialog between two crowd workers: (1) a student who poses a sequence of freeform\nquestions to learn as much as possible about a hidden Wikipedia text, and (2)\na teacher who answers the questions by providing short excerpts (spans) from the text.\n", "citation": "@article{choi2018quac,\n  title={Quac: Question answering in context},\n  author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke},\n  journal={arXiv preprint arXiv:1808.07036},\n  year={2018}\n}\n", "homepage": "https://quac.ai/", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "section_title": {"dtype": "string", "id": null, "_type": "Value"}, "paragraph": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "quac", "config_name": "quac", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 212391958, "num_examples": 83568, "dataset_name": "quac"}, "validation": {"name": "validation", "num_bytes": 20678483, "num_examples": 7354, "dataset_name": "quac"}}, "download_checksums": {"https://s3.amazonaws.com/my89public/quac/train_v0.2.json": {"num_bytes": 68114819, "checksum": "ff5cca5a2e4b4d1cb5b5ced68b9fce88394ef6d93117426d6d4baafbcc05c56a"}, "https://s3.amazonaws.com/my89public/quac/val_v0.2.json": {"num_bytes": 8929167, "checksum": "09e622916280ba04c9352acb1bc5bbe80f11a2598f6f34e934c51d9e6570f378"}}, "download_size": 77043986, "post_processing_size": null, "dataset_size": 233070441, "size_in_bytes": 310114427}}
diff --git a/lm_eval/datasets/quac/quac.py b/lm_eval/datasets/quac/quac.py
index e54b645ccf..4328ec89eb 100644
--- a/lm_eval/datasets/quac/quac.py
+++ b/lm_eval/datasets/quac/quac.py
@@ -30,7 +30,7 @@
 """

 _DESCRIPTION = """\
-Question Answering in Context (QuAC) is a dataset for modeling, understanding, and 
+Question Answering in Context (QuAC) is a dataset for modeling, understanding, and
 participating in information seeking dialog. Data instances consist of an interactive
 dialog between two crowd workers: (1) a student who poses a sequence of freeform
 questions to learn as much as possible about a hidden Wikipedia text, and (2)
@@ -54,7 +54,9 @@ class Quac(datasets.GeneratorBasedBuilder):
     VERSION = datasets.Version("1.1.0")

     BUILDER_CONFIGS = [
-        datasets.BuilderConfig(name="quac", version=VERSION, description="The QuAC dataset"),
+        datasets.BuilderConfig(
+            name="quac", version=VERSION, description="The QuAC dataset"
+        ),
     ]

     def _info(self):
@@ -90,10 +92,7 @@ def _split_generators(self, dl_manager):
             datasets.SplitGenerator(
                 name=datasets.Split.VALIDATION,
                 # These kwargs will be passed to _generate_examples
-                gen_kwargs={
-                    "filepath": data_dir["validation"],
-                    "split": "validation"
-                },
+                gen_kwargs={"filepath": data_dir["validation"], "split": "validation"},
             ),
         ]

@@ -105,7 +104,7 @@ def _generate_examples(self, filepath, split):
         for row in data:
             paragraph = row["paragraphs"][0]["context"].replace("CANNOTANSWER", "")
             qas = row["paragraphs"][0]["qas"]
-            qa_pairs = [(qa['question'], qa['answers'][0]['text']) for qa in qas]
+            qa_pairs = [(qa["question"], qa["answers"][0]["text"]) for qa in qas]
             for (question, answer) in qa_pairs:
                 # Yields examples as (key, example) tuples
                 yield key, {
diff --git a/lm_eval/datasets/sat_analogies/sat_analogies.py b/lm_eval/datasets/sat_analogies/sat_analogies.py
index 73cc5a68d4..7bfdf096b7 100644
--- a/lm_eval/datasets/sat_analogies/sat_analogies.py
+++ b/lm_eval/datasets/sat_analogies/sat_analogies.py
@@ -44,13 +44,16 @@


 class SatAnalogies(datasets.GeneratorBasedBuilder):
-    """ SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374 multiple-choice analogy questions. """
+    """SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374 multiple-choice analogy questions."""

     VERSION = datasets.Version("0.0.1")

     BUILDER_CONFIGS = [
-        datasets.BuilderConfig(name="sat_analogies", version=VERSION,
-                               description="The SAT Analogy Questions dataset"),
+        datasets.BuilderConfig(
+            name="sat_analogies",
+            version=VERSION,
+            description="The SAT Analogy Questions dataset",
+        ),
     ]

     @property
@@ -68,9 +71,7 @@ def _info(self):
             {
                 "source": datasets.Value("string"),
                 "stem": datasets.Value("string"),
-                "choices": datasets.features.Sequence(
-                    datasets.Value("string")
-                ),
+                "choices": datasets.features.Sequence(datasets.Value("string")),
                 "solution": datasets.Value("string"),
             }
         )
@@ -108,7 +109,7 @@ def _generate_examples(self, filepath):
             if len(line) == 0 and record:
                 data.append(record)
                 record = []
-            elif len(line) > 0 and line[0] == '#':
+            elif len(line) > 0 and line[0] == "#":
                 # Skip comments.
                 continue
             else:
@@ -120,8 +121,8 @@
         choices = record[-6:-1]
         solution = record[-1]
         yield key, {
-            'source': source,
-            'stem': stem,
-            'choices': choices,
-            'solution': solution,
+            "source": source,
+            "stem": stem,
+            "choices": choices,
+            "solution": solution,
         }
diff --git a/lm_eval/datasets/triviaqa/dataset_infos.json b/lm_eval/datasets/triviaqa/dataset_infos.json
index 441ed40746..87f4e064cf 100644
--- a/lm_eval/datasets/triviaqa/dataset_infos.json
+++ b/lm_eval/datasets/triviaqa/dataset_infos.json
@@ -1 +1 @@
-{"triviaqa": {"description": "TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence\ntriples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts\nand independently gathered evidence documents, six per question on average, that provide\nhigh quality distant supervision for answering the questions.\n", "citation": "@InProceedings{JoshiTriviaQA2017,\n    author    = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},\n    title     = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},\n    booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},\n    month     = {July},\n    year      = {2017},\n    address   = {Vancouver, Canada},\n    publisher = {Association for Computational Linguistics},\n}\n", "homepage": "https://nlp.cs.washington.edu/triviaqa/", "license": "Apache License 2.0", "features": {"question_id": {"dtype": "string", "id": null, "_type": "Value"}, "question_source": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"aliases": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "value": {"dtype": "string", "id": null, "_type": "Value"}}, "search_results": {"feature": {"description": {"dtype": "string", "id": null, "_type": "Value"}, "filename": {"dtype": "string", "id": null, "_type": "Value"}, "rank": {"dtype": "int32", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "url": {"dtype": "string", "id": null, "_type": "Value"}, "search_context": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "triviaqa", "config_name": "triviaqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1271393601, "num_examples": 87622, "dataset_name": "triviaqa"}, "validation": {"name": "validation", "num_bytes": 163819509, "num_examples": 11313, "dataset_name": "triviaqa"}}, "download_checksums": {"http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz": {"num_bytes": 546481381, "checksum": "adc19b42769062d241a8fbe834c56e58598d9322eb6c614e9f33a68a2cf5523e"}}, "download_size": 546481381, "post_processing_size": null, "dataset_size": 1435213110, "size_in_bytes": 1981694491}}
\ No newline at end of file
+{"triviaqa": {"description": "TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence\ntriples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts\nand independently gathered evidence documents, six per question on average, that provide\nhigh quality distant supervision for answering the questions.\n", "citation": "@InProceedings{JoshiTriviaQA2017,\n    author    = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},\n    title     = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},\n    booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},\n    month     = {July},\n    year      = {2017},\n    address   = {Vancouver, Canada},\n    publisher = {Association for Computational Linguistics},\n}\n", "homepage": "https://nlp.cs.washington.edu/triviaqa/", "license": "Apache License 2.0", "features": {"question_id": {"dtype": "string", "id": null, "_type": "Value"}, "question_source": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"aliases": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "value": {"dtype": "string", "id": null, "_type": "Value"}}, "search_results": {"feature": {"description": {"dtype": "string", "id": null, "_type": "Value"}, "filename": {"dtype": "string", "id": null, "_type": "Value"}, "rank": {"dtype": "int32", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "url": {"dtype": "string", "id": null, "_type": "Value"}, "search_context": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "triviaqa", "config_name": "triviaqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1271393601, "num_examples": 87622, "dataset_name": "triviaqa"}, "validation": {"name": "validation", "num_bytes": 163819509, "num_examples": 11313, "dataset_name": "triviaqa"}}, "download_checksums": {"http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz": {"num_bytes": 546481381, "checksum": "adc19b42769062d241a8fbe834c56e58598d9322eb6c614e9f33a68a2cf5523e"}}, "download_size": 546481381, "post_processing_size": null, "dataset_size": 1435213110, "size_in_bytes": 1981694491}}
diff --git a/lm_eval/datasets/triviaqa/triviaqa.py b/lm_eval/datasets/triviaqa/triviaqa.py
index 4ef84885a2..a1c179ca20 100644
--- a/lm_eval/datasets/triviaqa/triviaqa.py
+++ b/lm_eval/datasets/triviaqa/triviaqa.py
@@ -50,13 +50,14 @@


 class Triviaqa(datasets.GeneratorBasedBuilder):
-    """ TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence triples """
+    """TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence triples"""

     VERSION = datasets.Version("0.0.1")

     BUILDER_CONFIGS = [
         datasets.BuilderConfig(
-            name="triviaqa", version=VERSION, description="The TriviaQA dataset"),
+            name="triviaqa", version=VERSION, description="The TriviaQA dataset"
+        ),
     ]

     def _info(self):
@@ -66,10 +67,10 @@ def _info(self):
                 "question_source": datasets.Value("string"),
                 "question": datasets.Value("string"),
                 "answer": {
-                    "aliases": datasets.features.Sequence( 
+                    "aliases": datasets.features.Sequence(
                         datasets.Value("string"),
                     ),
-                    "value": datasets.Value("string")
+                    "value": datasets.Value("string"),
                 },
                 "search_results": datasets.features.Sequence(
                     {
"title": search_result["Title"] if "Title" in search_result else "", - "url": search_result["Url"] if "Url" in search_result else "", - "search_context": search_result["SearchContext"] if "SearchContext" in search_result else "", + "description": search_result["Description"] + if "Description" in search_result + else "", + "filename": search_result["Filename"] + if "Filename" in search_result + else "", + "rank": search_result["Rank"] + if "Rank" in search_result + else -1, + "title": search_result["Title"] + if "Title" in search_result + else "", + "url": search_result["Url"] + if "Url" in search_result + else "", + "search_context": search_result["SearchContext"] + if "SearchContext" in search_result + else "", } ) yield key, { diff --git a/lm_eval/datasets/truthfulqa/dataset_infos.json b/lm_eval/datasets/truthfulqa/dataset_infos.json index 6b4f3f6fc0..18d52feaa8 100644 --- a/lm_eval/datasets/truthfulqa/dataset_infos.json +++ b/lm_eval/datasets/truthfulqa/dataset_infos.json @@ -1 +1 @@ -{"multiple_choice": {"description": "TruthfulQA is a benchmark to measure whether a language model is truthful in\ngenerating answers to questions. The benchmark comprises 817 questions that\nspan 38 categories, including health, law, finance and politics. Questions are\ncrafted so that some humans would answer falsely due to a false belief or\nmisconception. To perform well, models must avoid generating false answers\nlearned from imitating human texts.\n\nThe multiple choice TruthfulQA task", "citation": "@misc{lin2021truthfulqa,\n title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},\n author={Stephanie Lin and Jacob Hilton and Owain Evans},\n year={2021},\n eprint={2109.07958},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/sylinrl/TruthfulQA", "license": "", "features": {"question": {"dtype": "string", "id": null, "_type": "Value"}, "mc1_targets": {"choices": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "labels": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "mc2_targets": {"choices": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "labels": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "truthfulqa", "config_name": "multiple_choice", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 610333, "num_examples": 817, "dataset_name": "truthfulqa"}}, "download_checksums": {"https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/data/mc_task.json": {"num_bytes": 710607, "checksum": "6eb4125d25750c0145c4be2dce00440736684ab6f74ce6bff2139571cc758954"}}, "download_size": 710607, "post_processing_size": null, "dataset_size": 610333, "size_in_bytes": 1320940}, "generation": {"description": "TruthfulQA is a benchmark to measure whether a language model is truthful in\ngenerating answers to questions. The benchmark comprises 817 questions that\nspan 38 categories, including health, law, finance and politics. Questions are\ncrafted so that some humans would answer falsely due to a false belief or\nmisconception. 
To perform well, models must avoid generating false answers\nlearned from imitating human texts.\n\nThe generative TruthfulQA task", "citation": "@misc{lin2021truthfulqa,\n title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},\n author={Stephanie Lin and Jacob Hilton and Owain Evans},\n year={2021},\n eprint={2109.07958},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/sylinrl/TruthfulQA", "license": "", "features": {"category": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "best_answer": {"dtype": "string", "id": null, "_type": "Value"}, "correct_answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "incorrect_answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "truthfulqa", "config_name": "generation", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 463860, "num_examples": 817, "dataset_name": "truthfulqa"}}, "download_checksums": {"https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/TruthfulQA.csv": {"num_bytes": 443723, "checksum": "8d7dd15f033196140f032d97d30f037da7a7b1192c3f36f9937c1850925335a2"}}, "download_size": 443723, "post_processing_size": null, "dataset_size": 463860, "size_in_bytes": 907583}} \ No newline at end of file +{"multiple_choice": {"description": "TruthfulQA is a benchmark to measure whether a language model is truthful in\ngenerating answers to questions. The benchmark comprises 817 questions that\nspan 38 categories, including health, law, finance and politics. Questions are\ncrafted so that some humans would answer falsely due to a false belief or\nmisconception. 
To perform well, models must avoid generating false answers\nlearned from imitating human texts.\n\nThe multiple choice TruthfulQA task", "citation": "@misc{lin2021truthfulqa,\n title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},\n author={Stephanie Lin and Jacob Hilton and Owain Evans},\n year={2021},\n eprint={2109.07958},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/sylinrl/TruthfulQA", "license": "", "features": {"question": {"dtype": "string", "id": null, "_type": "Value"}, "mc1_targets": {"choices": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "labels": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "mc2_targets": {"choices": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "labels": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "truthfulqa", "config_name": "multiple_choice", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 610333, "num_examples": 817, "dataset_name": "truthfulqa"}}, "download_checksums": {"https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/data/mc_task.json": {"num_bytes": 710607, "checksum": "6eb4125d25750c0145c4be2dce00440736684ab6f74ce6bff2139571cc758954"}}, "download_size": 710607, "post_processing_size": null, "dataset_size": 610333, "size_in_bytes": 1320940}, "generation": {"description": "TruthfulQA is a benchmark to measure whether a language model is truthful in\ngenerating answers to questions. The benchmark comprises 817 questions that\nspan 38 categories, including health, law, finance and politics. Questions are\ncrafted so that some humans would answer falsely due to a false belief or\nmisconception. 
To perform well, models must avoid generating false answers\nlearned from imitating human texts.\n\nThe generative TruthfulQA task", "citation": "@misc{lin2021truthfulqa,\n title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},\n author={Stephanie Lin and Jacob Hilton and Owain Evans},\n year={2021},\n eprint={2109.07958},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/sylinrl/TruthfulQA", "license": "", "features": {"category": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "best_answer": {"dtype": "string", "id": null, "_type": "Value"}, "correct_answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "incorrect_answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "truthfulqa", "config_name": "generation", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 463860, "num_examples": 817, "dataset_name": "truthfulqa"}}, "download_checksums": {"https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/TruthfulQA.csv": {"num_bytes": 443723, "checksum": "8d7dd15f033196140f032d97d30f037da7a7b1192c3f36f9937c1850925335a2"}}, "download_size": 443723, "post_processing_size": null, "dataset_size": 463860, "size_in_bytes": 907583}} diff --git a/lm_eval/datasets/truthfulqa/truthfulqa.py b/lm_eval/datasets/truthfulqa/truthfulqa.py index 296c87ac34..af1eb896f6 100644 --- a/lm_eval/datasets/truthfulqa/truthfulqa.py +++ b/lm_eval/datasets/truthfulqa/truthfulqa.py @@ -65,38 +65,46 @@ def __init__(self, url, features, **kwargs): class Truthfulqa(datasets.GeneratorBasedBuilder): """TruthfulQA is a benchmark to measure whether a language model is truthful in -generating answers to questions.""" + generating answers to questions.""" BUILDER_CONFIGS = [ TruthfulqaConfig( name="multiple_choice", url="https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/data/mc_task.json", - features=datasets.Features({ - "question": datasets.Value("string"), - "mc1_targets": { - "choices": datasets.features.Sequence(datasets.Value("string")), - "labels": datasets.features.Sequence(datasets.Value("int32")), - }, - "mc2_targets": { - "choices": datasets.features.Sequence(datasets.Value("string")), - "labels": datasets.features.Sequence(datasets.Value("int32")), + features=datasets.Features( + { + "question": datasets.Value("string"), + "mc1_targets": { + "choices": datasets.features.Sequence(datasets.Value("string")), + "labels": datasets.features.Sequence(datasets.Value("int32")), + }, + "mc2_targets": { + "choices": datasets.features.Sequence(datasets.Value("string")), + "labels": datasets.features.Sequence(datasets.Value("int32")), + }, } - }), - description="The multiple choice TruthfulQA task" + ), + description="The multiple choice TruthfulQA task", ), TruthfulqaConfig( name="generation", url="https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/TruthfulQA.csv", - features=datasets.Features({ - "category": datasets.Value("string"), - "question": datasets.Value("string"), - "best_answer": datasets.Value("string"), - 
"correct_answers": datasets.features.Sequence(datasets.Value("string")), - "incorrect_answers": datasets.features.Sequence(datasets.Value("string")), - "source": datasets.Value("string"), - }), - description="The generative TruthfulQA task" - ) + features=datasets.Features( + { + "category": datasets.Value("string"), + "question": datasets.Value("string"), + "best_answer": datasets.Value("string"), + "correct_answers": datasets.features.Sequence( + datasets.Value("string") + ), + "incorrect_answers": datasets.features.Sequence( + datasets.Value("string") + ), + "source": datasets.Value("string"), + } + ), + description="The generative TruthfulQA task", + ), ] def _info(self): @@ -138,15 +146,15 @@ def _generate_examples(self, filepath, split): "mc2_targets": { "choices": row["mc2_targets"].keys(), "labels": row["mc2_targets"].values(), - } + }, } else: # Generation data is in a `CSV` file. - with open(filepath, newline='') as f: + with open(filepath, newline="") as f: contents = csv.DictReader(f) for key, row in enumerate(contents): # Ensure that references exist. - if not row['Correct Answers'] or not row['Incorrect Answers']: + if not row["Correct Answers"] or not row["Incorrect Answers"]: continue yield key, { "category": row["Category"], @@ -154,6 +162,8 @@ def _generate_examples(self, filepath, split): "best_answer": row["Best Answer"], # split on ";" "correct_answers": row["Correct Answers"].strip().split(";"), - "incorrect_answers": row["Incorrect Answers"].strip().split(";"), + "incorrect_answers": row["Incorrect Answers"] + .strip() + .split(";"), "source": row["Source"], } diff --git a/lm_eval/datasets/unscramble/dataset_infos.json b/lm_eval/datasets/unscramble/dataset_infos.json index a9ff54bbe4..bae29209da 100644 --- a/lm_eval/datasets/unscramble/dataset_infos.json +++ b/lm_eval/datasets/unscramble/dataset_infos.json @@ -1 +1 @@ -{"mid_word_1_anagrams": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "mid_word_1_anagrams", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 271516, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/mid_word_1_anagrams.jsonl.gz": {"num_bytes": 106533, "checksum": "6768a86896083199de4815d4964cb2f6f1046476cfd80c2a562784f182905979"}}, "download_size": 106533, "post_processing_size": null, "dataset_size": 271516, "size_in_bytes": 378049}, "mid_word_2_anagrams": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "mid_word_2_anagrams", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 282654, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/mid_word_2_anagrams.jsonl.gz": {"num_bytes": 109091, "checksum": "c3d839d09a7954b78a27cd2cd75d4ed0488656c56ef4dbd741a005343826cb01"}}, "download_size": 109091, "post_processing_size": null, "dataset_size": 282654, "size_in_bytes": 391745}, "cycle_letters_in_word": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. 
Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "cycle_letters_in_word", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 282654, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/cycle_letters_in_word.jsonl.gz": {"num_bytes": 98451, "checksum": "1689c9002bb8c5988bf5f05e977c9db92f57932c1b5a38998c29ac0dd71e1d42"}}, "download_size": 98451, "post_processing_size": null, "dataset_size": 282654, "size_in_bytes": 381105}, "random_insertion_in_word": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "random_insertion_in_word", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 353981, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/random_insertion_in_word.jsonl.gz": {"num_bytes": 143626, "checksum": "72e65d83da53d15752ee0c47379509de149ddbad32d61184e5991df29616b78a"}}, "download_size": 143626, "post_processing_size": null, "dataset_size": 353981, "size_in_bytes": 497607}, "reversed_words": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "reversed_words", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 282654, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/reversed_words.jsonl.gz": {"num_bytes": 91917, "checksum": "133a08f875cd6c1ef8608a3233571a773881cc27b1c707de738cc6543439332a"}}, "download_size": 91917, "post_processing_size": null, "dataset_size": 282654, "size_in_bytes": 374571}} \ No newline at end of file +{"mid_word_1_anagrams": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "mid_word_1_anagrams", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 271516, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/mid_word_1_anagrams.jsonl.gz": {"num_bytes": 106533, "checksum": "6768a86896083199de4815d4964cb2f6f1046476cfd80c2a562784f182905979"}}, "download_size": 106533, "post_processing_size": null, "dataset_size": 271516, "size_in_bytes": 378049}, "mid_word_2_anagrams": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "mid_word_2_anagrams", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 282654, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/mid_word_2_anagrams.jsonl.gz": {"num_bytes": 109091, "checksum": "c3d839d09a7954b78a27cd2cd75d4ed0488656c56ef4dbd741a005343826cb01"}}, "download_size": 109091, "post_processing_size": null, "dataset_size": 282654, "size_in_bytes": 391745}, "cycle_letters_in_word": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. 
Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "cycle_letters_in_word", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 282654, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/cycle_letters_in_word.jsonl.gz": {"num_bytes": 98451, "checksum": "1689c9002bb8c5988bf5f05e977c9db92f57932c1b5a38998c29ac0dd71e1d42"}}, "download_size": 98451, "post_processing_size": null, "dataset_size": 282654, "size_in_bytes": 381105}, "random_insertion_in_word": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "random_insertion_in_word", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 353981, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/random_insertion_in_word.jsonl.gz": {"num_bytes": 143626, "checksum": "72e65d83da53d15752ee0c47379509de149ddbad32d61184e5991df29616b78a"}}, "download_size": 143626, "post_processing_size": null, "dataset_size": 353981, "size_in_bytes": 497607}, "reversed_words": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "reversed_words", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 282654, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/reversed_words.jsonl.gz": {"num_bytes": 91917, "checksum": "133a08f875cd6c1ef8608a3233571a773881cc27b1c707de738cc6543439332a"}}, "download_size": 91917, "post_processing_size": null, "dataset_size": 282654, "size_in_bytes": 374571}} diff --git a/lm_eval/datasets/unscramble/unscramble.py b/lm_eval/datasets/unscramble/unscramble.py index e16aa2a850..86f1e1ef5c 100644 --- a/lm_eval/datasets/unscramble/unscramble.py +++ b/lm_eval/datasets/unscramble/unscramble.py @@ -64,8 +64,9 @@ class Unscramble(datasets.GeneratorBasedBuilder): VERSION = datasets.Version("0.0.1") BUILDER_CONFIGS = [ - datasets.BuilderConfig(name=name, version=version, - description=_DESCRIPTIONS[name]) + datasets.BuilderConfig( + name=name, version=version, description=_DESCRIPTIONS[name] + ) for name, version in zip(_NAMES, [VERSION] * len(_NAMES)) ] diff --git a/lm_eval/datasets/wikitext/dataset_infos.json b/lm_eval/datasets/wikitext/dataset_infos.json index 4313e9543d..db8249c1b1 100644 --- a/lm_eval/datasets/wikitext/dataset_infos.json +++ b/lm_eval/datasets/wikitext/dataset_infos.json @@ -1 +1 @@ -{"wikitext-103-v1": {"description": " The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. 
The dataset is available under the Creative Commons Attribution-ShareAlike\n License.\n", "citation": "@misc{merity2016pointer,\n title={Pointer Sentinel Mixture Models},\n author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},\n year={2016},\n eprint={1609.07843},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/", "license": "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)", "features": {"page": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "wikitext", "config_name": "wikitext-103-v1", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 1281262, "num_examples": 62, "dataset_name": "wikitext"}, "train": {"name": "train", "num_bytes": 539297488, "num_examples": 29444, "dataset_name": "wikitext"}, "validation": {"name": "validation", "num_bytes": 1142488, "num_examples": 60, "dataset_name": "wikitext"}}, "download_checksums": {"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip": {"num_bytes": 190229076, "checksum": "242ba0f20b329cfdf1ccc61e9e9e5b59becf189db7f7a81cd2a0e2fc31539590"}}, "download_size": 190229076, "post_processing_size": null, "dataset_size": 541721238, "size_in_bytes": 731950314}, "wikitext-2-v1": {"description": " The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike\n License.\n", "citation": "@misc{merity2016pointer,\n title={Pointer Sentinel Mixture Models},\n author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},\n year={2016},\n eprint={1609.07843},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/", "license": "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)", "features": {"page": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "wikitext", "config_name": "wikitext-2-v1", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 1256634, "num_examples": 62, "dataset_name": "wikitext"}, "train": {"name": "train", "num_bytes": 10799034, "num_examples": 629, "dataset_name": "wikitext"}, "validation": {"name": "validation", "num_bytes": 1121860, "num_examples": 60, "dataset_name": "wikitext"}}, "download_checksums": {"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip": {"num_bytes": 4475746, "checksum": "92675f1d63015c1c8b51f1656a52d5bdbc33aafa60cc47a218a66e7ee817488c"}}, "download_size": 4475746, "post_processing_size": null, "dataset_size": 13177528, "size_in_bytes": 17653274}, "wikitext-103-raw-v1": {"description": " The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. 
The dataset is available under the Creative Commons Attribution-ShareAlike\n License.\n", "citation": "@misc{merity2016pointer,\n title={Pointer Sentinel Mixture Models},\n author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},\n year={2016},\n eprint={1609.07843},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/", "license": "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)", "features": {"page": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "wikitext", "config_name": "wikitext-103-raw-v1", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 1290775, "num_examples": 62, "dataset_name": "wikitext"}, "train": {"name": "train", "num_bytes": 540656522, "num_examples": 29444, "dataset_name": "wikitext"}, "validation": {"name": "validation", "num_bytes": 1147025, "num_examples": 60, "dataset_name": "wikitext"}}, "download_checksums": {"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip": {"num_bytes": 191984949, "checksum": "91c00ae287f0d699e18605c84afc9e45c192bc6b7797ff8837e5474655a33794"}}, "download_size": 191984949, "post_processing_size": null, "dataset_size": 543094322, "size_in_bytes": 735079271}, "wikitext-2-raw-v1": {"description": " The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike\n License.\n", "citation": "@misc{merity2016pointer,\n title={Pointer Sentinel Mixture Models},\n author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},\n year={2016},\n eprint={1609.07843},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/", "license": "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)", "features": {"page": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "wikitext", "config_name": "wikitext-2-raw-v1", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 1290775, "num_examples": 62, "dataset_name": "wikitext"}, "train": {"name": "train", "num_bytes": 10942633, "num_examples": 629, "dataset_name": "wikitext"}, "validation": {"name": "validation", "num_bytes": 1147025, "num_examples": 60, "dataset_name": "wikitext"}}, "download_checksums": {"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip": {"num_bytes": 4721645, "checksum": "ef7edb566e3e2b2d31b29c1fdb0c89a4cc683597484c3dc2517919c615435a11"}}, "download_size": 4721645, "post_processing_size": null, "dataset_size": 13380433, "size_in_bytes": 18102078}} \ No newline at end of file +{"wikitext-103-v1": {"description": " The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. 
The dataset is available under the Creative Commons Attribution-ShareAlike\n License.\n", "citation": "@misc{merity2016pointer,\n title={Pointer Sentinel Mixture Models},\n author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},\n year={2016},\n eprint={1609.07843},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/", "license": "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)", "features": {"page": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "wikitext", "config_name": "wikitext-103-v1", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 1281262, "num_examples": 62, "dataset_name": "wikitext"}, "train": {"name": "train", "num_bytes": 539297488, "num_examples": 29444, "dataset_name": "wikitext"}, "validation": {"name": "validation", "num_bytes": 1142488, "num_examples": 60, "dataset_name": "wikitext"}}, "download_checksums": {"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip": {"num_bytes": 190229076, "checksum": "242ba0f20b329cfdf1ccc61e9e9e5b59becf189db7f7a81cd2a0e2fc31539590"}}, "download_size": 190229076, "post_processing_size": null, "dataset_size": 541721238, "size_in_bytes": 731950314}, "wikitext-2-v1": {"description": " The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike\n License.\n", "citation": "@misc{merity2016pointer,\n title={Pointer Sentinel Mixture Models},\n author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},\n year={2016},\n eprint={1609.07843},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/", "license": "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)", "features": {"page": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "wikitext", "config_name": "wikitext-2-v1", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 1256634, "num_examples": 62, "dataset_name": "wikitext"}, "train": {"name": "train", "num_bytes": 10799034, "num_examples": 629, "dataset_name": "wikitext"}, "validation": {"name": "validation", "num_bytes": 1121860, "num_examples": 60, "dataset_name": "wikitext"}}, "download_checksums": {"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip": {"num_bytes": 4475746, "checksum": "92675f1d63015c1c8b51f1656a52d5bdbc33aafa60cc47a218a66e7ee817488c"}}, "download_size": 4475746, "post_processing_size": null, "dataset_size": 13177528, "size_in_bytes": 17653274}, "wikitext-103-raw-v1": {"description": " The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. 
The dataset is available under the Creative Commons Attribution-ShareAlike\n License.\n", "citation": "@misc{merity2016pointer,\n title={Pointer Sentinel Mixture Models},\n author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},\n year={2016},\n eprint={1609.07843},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/", "license": "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)", "features": {"page": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "wikitext", "config_name": "wikitext-103-raw-v1", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 1290775, "num_examples": 62, "dataset_name": "wikitext"}, "train": {"name": "train", "num_bytes": 540656522, "num_examples": 29444, "dataset_name": "wikitext"}, "validation": {"name": "validation", "num_bytes": 1147025, "num_examples": 60, "dataset_name": "wikitext"}}, "download_checksums": {"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip": {"num_bytes": 191984949, "checksum": "91c00ae287f0d699e18605c84afc9e45c192bc6b7797ff8837e5474655a33794"}}, "download_size": 191984949, "post_processing_size": null, "dataset_size": 543094322, "size_in_bytes": 735079271}, "wikitext-2-raw-v1": {"description": " The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike\n License.\n", "citation": "@misc{merity2016pointer,\n title={Pointer Sentinel Mixture Models},\n author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},\n year={2016},\n eprint={1609.07843},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/", "license": "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)", "features": {"page": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "wikitext", "config_name": "wikitext-2-raw-v1", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 1290775, "num_examples": 62, "dataset_name": "wikitext"}, "train": {"name": "train", "num_bytes": 10942633, "num_examples": 629, "dataset_name": "wikitext"}, "validation": {"name": "validation", "num_bytes": 1147025, "num_examples": 60, "dataset_name": "wikitext"}}, "download_checksums": {"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip": {"num_bytes": 4721645, "checksum": "ef7edb566e3e2b2d31b29c1fdb0c89a4cc683597484c3dc2517919c615435a11"}}, "download_size": 4721645, "post_processing_size": null, "dataset_size": 13380433, "size_in_bytes": 18102078}} diff --git a/lm_eval/datasets/wikitext/wikitext.py b/lm_eval/datasets/wikitext/wikitext.py index 0c81f1f264..a7d305303b 100644 --- a/lm_eval/datasets/wikitext/wikitext.py +++ b/lm_eval/datasets/wikitext/wikitext.py @@ -123,86 +123,111 @@ def _split_generators(self, dl_manager): return [ datasets.SplitGenerator( name=datasets.Split.TEST, - gen_kwargs={"data_file": os.path.join( - data_dir, 
"wiki.test.tokens"), "split": "test"}, + gen_kwargs={ + "data_file": os.path.join(data_dir, "wiki.test.tokens"), + "split": "test", + }, ), datasets.SplitGenerator( name=datasets.Split.TRAIN, - gen_kwargs={"data_file": os.path.join( - data_dir, "wiki.train.tokens"), "split": "train"}, + gen_kwargs={ + "data_file": os.path.join(data_dir, "wiki.train.tokens"), + "split": "train", + }, ), datasets.SplitGenerator( name=datasets.Split.VALIDATION, - gen_kwargs={"data_file": os.path.join( - data_dir, "wiki.valid.tokens"), "split": "valid"}, + gen_kwargs={ + "data_file": os.path.join(data_dir, "wiki.valid.tokens"), + "split": "valid", + }, ), ] else: if self.config.name == "wikitext-103-raw-v1": - data_file = dl_manager.download_and_extract( - self.config.data_url) + data_file = dl_manager.download_and_extract(self.config.data_url) data_dir = os.path.join(data_file, "wikitext-103-raw") return [ datasets.SplitGenerator( name=datasets.Split.TEST, - gen_kwargs={"data_file": os.path.join( - data_dir, "wiki.test.raw"), "split": "test"}, + gen_kwargs={ + "data_file": os.path.join(data_dir, "wiki.test.raw"), + "split": "test", + }, ), datasets.SplitGenerator( name=datasets.Split.TRAIN, - gen_kwargs={"data_file": os.path.join( - data_dir, "wiki.train.raw"), "split": "train"}, + gen_kwargs={ + "data_file": os.path.join(data_dir, "wiki.train.raw"), + "split": "train", + }, ), datasets.SplitGenerator( name=datasets.Split.VALIDATION, - gen_kwargs={"data_file": os.path.join( - data_dir, "wiki.valid.raw"), "split": "valid"}, + gen_kwargs={ + "data_file": os.path.join(data_dir, "wiki.valid.raw"), + "split": "valid", + }, ), ] else: if self.config.name == "wikitext-2-raw-v1": - data_file = dl_manager.download_and_extract( - self.config.data_url) + data_file = dl_manager.download_and_extract(self.config.data_url) data_dir = os.path.join(data_file, "wikitext-2-raw") return [ datasets.SplitGenerator( name=datasets.Split.TEST, - gen_kwargs={"data_file": os.path.join( - data_dir, "wiki.test.raw"), "split": "test"}, + gen_kwargs={ + "data_file": os.path.join(data_dir, "wiki.test.raw"), + "split": "test", + }, ), datasets.SplitGenerator( name=datasets.Split.TRAIN, - gen_kwargs={"data_file": os.path.join( - data_dir, "wiki.train.raw"), "split": "train"}, + gen_kwargs={ + "data_file": os.path.join(data_dir, "wiki.train.raw"), + "split": "train", + }, ), datasets.SplitGenerator( name=datasets.Split.VALIDATION, - gen_kwargs={"data_file": os.path.join( - data_dir, "wiki.valid.raw"), "split": "valid"}, + gen_kwargs={ + "data_file": os.path.join(data_dir, "wiki.valid.raw"), + "split": "valid", + }, ), ] else: if self.config.name == "wikitext-2-v1": data_file = dl_manager.download_and_extract( - self.config.data_url) + self.config.data_url + ) data_dir = os.path.join(data_file, "wikitext-2") return [ datasets.SplitGenerator( name=datasets.Split.TEST, - gen_kwargs={"data_file": os.path.join( - data_dir, "wiki.test.tokens"), "split": "test"}, + gen_kwargs={ + "data_file": os.path.join( + data_dir, "wiki.test.tokens" + ), + "split": "test", + }, ), datasets.SplitGenerator( name=datasets.Split.TRAIN, gen_kwargs={ - "data_file": os.path.join(data_dir, "wiki.train.tokens"), + "data_file": os.path.join( + data_dir, "wiki.train.tokens" + ), "split": "train", }, ), datasets.SplitGenerator( name=datasets.Split.VALIDATION, gen_kwargs={ - "data_file": os.path.join(data_dir, "wiki.valid.tokens"), + "data_file": os.path.join( + data_dir, "wiki.valid.tokens" + ), "split": "valid", }, ), @@ -216,12 +241,12 @@ def _generate_examples(self, 
data_file, split): data = f.read().split("\n") for line in data: rline = line.replace("= = =", "===").replace("= =", "==").strip() - if rline.startswith('= ') and rline.strip().endswith(' ='): - page = '\n'.join(ret) + if rline.startswith("= ") and rline.strip().endswith(" ="): + page = "\n".join(ret) if page.strip(): yield key, {"page": page} key += 1 ret = [] ret.append(line) - page = '\n'.join(ret) + page = "\n".join(ret) yield key, {"page": page} diff --git a/lm_eval/decontamination/archiver.py b/lm_eval/decontamination/archiver.py index fd0ef68807..488a55dd73 100644 --- a/lm_eval/decontamination/archiver.py +++ b/lm_eval/decontamination/archiver.py @@ -8,12 +8,14 @@ import tqdm from pathlib import Path + def json_serial(obj): """JSON serializer for objects not serializable by default json code""" if isinstance(obj, (datetime.datetime,)): return obj.isoformat() - raise TypeError ("Type %s not serializable" % type(obj)) + raise TypeError("Type %s not serializable" % type(obj)) + # Modified version of lm_dataformat Archive for single file. class Archive: @@ -21,26 +23,32 @@ def __init__(self, file_path, compression_level=3): self.file_path = file_path dir_name = os.path.dirname(file_path) if dir_name: - os.makedirs(dir_name, exist_ok=True) - self.fh = open(self.file_path, 'wb') + os.makedirs(dir_name, exist_ok=True) + self.fh = open(self.file_path, "wb") self.cctx = zstandard.ZstdCompressor(level=compression_level) - self.compressor = self.cctx.stream_writer(self.fh) - + self.compressor = self.cctx.stream_writer(self.fh) + def add_data(self, data, meta={}): - self.compressor.write(json.dumps({'text': data, 'meta': meta}, default=json_serial).encode('UTF-8') + b'\n') - + self.compressor.write( + json.dumps({"text": data, "meta": meta}, default=json_serial).encode( + "UTF-8" + ) + + b"\n" + ) + def commit(self): - self.compressor.flush(zstandard.FLUSH_FRAME) + self.compressor.flush(zstandard.FLUSH_FRAME) self.fh.flush() self.fh.close() + # Modified version of lm_dataformat Reader with self.fh set, allowing peeking for tqdm. class Reader: def __init__(self): pass - def read(self, file, get_meta=False, autojoin_paragraphs=True, para_joiner='\n\n'): - with open(file, 'rb') as fh: + def read(self, file, get_meta=False, autojoin_paragraphs=True, para_joiner="\n\n"): + with open(file, "rb") as fh: self.fh = fh cctx = zstandard.ZstdDecompressor() reader = io.BufferedReader(cctx.stream_reader(fh)) @@ -52,53 +60,58 @@ def read(self, file, get_meta=False, autojoin_paragraphs=True, para_joiner='\n\n yield ob continue - text = ob['text'] + text = ob["text"] if autojoin_paragraphs and isinstance(text, list): text = para_joiner.join(text) if get_meta: - yield text, (ob['meta'] if 'meta' in ob else {}) + yield text, (ob["meta"] if "meta" in ob else {}) else: yield text + class TextArchive: def __init__(self, file_path, mode="rb+"): self.file_path = file_path dir_name = os.path.dirname(file_path) if dir_name: - os.makedirs(dir_name, exist_ok=True) + os.makedirs(dir_name, exist_ok=True) if not os.path.exists(file_path): Path(file_path).touch() - - self.fh = open(self.file_path, mode) - + + self.fh = open(self.file_path, mode) + def add_data(self, data): - self.fh.write(data.encode('UTF-8') + b'\n') - + self.fh.write(data.encode("UTF-8") + b"\n") + def commit(self): self.fh.flush() self.fh.close() + class TextReader: def __init__(self, file_path): self.file_path = file_path # Optimized mmap read with infrequent tqdm updates to maintain speed - # Tested up to 250MB/s. + # Tested up to 250MB/s. 
def read_tqdm(self, update_frequency=10000): current_file_position = 0 line_counter = 0 - with open(self.file_path, 'r') as fh, \ - tqdm.tqdm(total=os.path.getsize(self.file_path), dynamic_ncols=True, - unit="byte", unit_scale=1) as progress: + with open(self.file_path, "r") as fh, tqdm.tqdm( + total=os.path.getsize(self.file_path), + dynamic_ncols=True, + unit="byte", + unit_scale=1, + ) as progress: with mmap.mmap(fh.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj: for line in iter(mmap_obj.readline, b""): line = line.decode("utf-8") line_counter += 1 if line_counter == update_frequency: - new_file_pos = mmap_obj.tell() + new_file_pos = mmap_obj.tell() bytes_read = new_file_pos - current_file_position current_file_position = new_file_pos progress.update(bytes_read) @@ -107,24 +120,24 @@ def read_tqdm(self, update_frequency=10000): def read_and_tell(self): current_file_position = 0 - with open(self.file_path, 'r', encoding="utf8") as fh: + with open(self.file_path, "r", encoding="utf8") as fh: with mmap.mmap(fh.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj: for line in iter(mmap_obj.readline, b""): - line = line.decode("utf-8") - new_file_pos = mmap_obj.tell() + line = line.decode("utf-8") + new_file_pos = mmap_obj.tell() raw_bytes_read = new_file_pos - current_file_position current_file_position = new_file_pos yield line[:-1], raw_bytes_read def read(self): - with open(self.file_path, 'r', encoding="utf8") as fh: + with open(self.file_path, "r", encoding="utf8") as fh: with mmap.mmap(fh.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj: for line in iter(mmap_obj.readline, b""): - line = line.decode("utf-8") + line = line.decode("utf-8") yield line[:-1] def read_slow(self): - with open(self.file_path, 'r', encoding="utf8") as fh: + with open(self.file_path, "r", encoding="utf8") as fh: while True: line = fh.readline() if line == -1 or line == "": @@ -132,16 +145,17 @@ def read_slow(self): else: yield line[:-1] + # Optimized for speed. Decompresses the archive in shell before # using the mmap'd TextReader. 
class ZStdTextReader: def __init__(self, file): - self.file = file + self.file = file - def read_tqdm(self): + def read_tqdm(self): decompressed_file = self.file[:-4] print("Decompressing file, please wait...") - os.system(f"zstd -d {self.file}") # linux decompress is faster + os.system(f"zstd -d {self.file}") # linux decompress is faster reader = TextReader(decompressed_file) yield from reader.read_tqdm() - os.remove(decompressed_file) \ No newline at end of file + os.remove(decompressed_file) diff --git a/lm_eval/decontamination/decontaminate.py b/lm_eval/decontamination/decontaminate.py index a08b9cbcfc..c78d4a6977 100644 --- a/lm_eval/decontamination/decontaminate.py +++ b/lm_eval/decontamination/decontaminate.py @@ -49,7 +49,7 @@ def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit): return f"data/{task_name}/{task_set}_{ngrams_n_size}grams_limit{limit}.overlaps" lookups = {} - duplicates = {} # (task_name, task_set): set(doc_ids)} + duplicates = {} # (task_name, task_set): set(doc_ids)} sets_to_decontaminate = len(docs_by_task_set.keys()) for (task_name, task_set), docs in docs_by_task_set.items(): @@ -57,19 +57,27 @@ def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit): os.mkdir(f"data/{task_name}") # Check if we've decontaminated this combination before - overlaps_dump_path = get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit) + overlaps_dump_path = get_overlaps_dump_path( + task_name, task_set, ngrams_n_size, limit + ) if os.path.exists(overlaps_dump_path): - duplicates[(task_name, task_set)] = pickle.load(open(overlaps_dump_path, "rb")) + duplicates[(task_name, task_set)] = pickle.load( + open(overlaps_dump_path, "rb") + ) sets_to_decontaminate -= 1 continue else: duplicates[(task_name, task_set)] = set() # Build/load the task lookup {ngram: set(documents)}. 
- task_set_lookup_path = f"data/{task_name}/{task_set}_{ngrams_n_size}grams_limit{limit}.lookup" + task_set_lookup_path = ( + f"data/{task_name}/{task_set}_{ngrams_n_size}grams_limit{limit}.lookup" + ) if os.path.exists(task_set_lookup_path): - print(f"{task_set_lookup_path} available, loading...") - lookups[(task_name, task_set)] = pickle.load(open(task_set_lookup_path, "rb")) + print(f"{task_set_lookup_path} available, loading...") + lookups[(task_name, task_set)] = pickle.load( + open(task_set_lookup_path, "rb") + ) else: print(f"{task_set_lookup_path} not available, building...") lookup = collections.defaultdict(set) @@ -79,7 +87,7 @@ def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit): for ngram in ngrams: lookup[ngram].add(doc_id) - pickle.dump(lookup, open(task_set_lookup_path,"wb")) + pickle.dump(lookup, open(task_set_lookup_path, "wb")) lookups[(task_name, task_set)] = lookup elapsed = time.perf_counter() - start @@ -89,7 +97,7 @@ def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit): if sets_to_decontaminate > 0: print("Merging lookups...") - start = time.perf_counter() + start = time.perf_counter() merged_lookup = collections.defaultdict(list) for (task_name, task_set), lookup in lookups.items(): for ngram, doc_ids in lookup.items(): @@ -112,20 +120,26 @@ def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit): non_matching_unique = 0 current_ngram = "" - for line in reader.read_tqdm(): # Scan training set ngrams file + for line in reader.read_tqdm(): # Scan training set ngrams file total_ngrams += 1 [ngram, document_id] = line.rsplit(" ", 1) - if ngram != current_ngram: # Only need to match the ngram once in training set + if ( + ngram != current_ngram + ): # Only need to match the ngram once in training set unique_ngrams += 1 current_ngram = ngram if ngram in merged_lookup: - matched_ngrams.append(ngram) # For logging + matched_ngrams.append(ngram) # For logging matching_unique += 1 for task_name, task_set, doc_ids in merged_lookup[ngram]: task_doc_set = duplicates[(task_name, task_set)] - for doc_id in doc_ids: # Record contamination across all relevant task/set combos + for ( + doc_id + ) in ( + doc_ids + ): # Record contamination across all relevant task/set combos task_doc_set.add(doc_id) - del merged_lookup[ngram] # No point matching again + del merged_lookup[ngram] # No point matching again else: non_matching_unique += 1 @@ -143,11 +157,12 @@ def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit): print(duplicates) - # Dump overlaps separately + # Dump overlaps separately for (task_name, task_set), doc_ids in duplicates.items(): - overlaps_dump_path = get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit) - pickle.dump(doc_ids, open(overlaps_dump_path,"wb")) + overlaps_dump_path = get_overlaps_dump_path( + task_name, task_set, ngrams_n_size, limit + ) + pickle.dump(doc_ids, open(overlaps_dump_path, "wb")) # Strip task set and return return {task_name: doc_ids for (task_name, task_set), doc_ids in duplicates.items()} - diff --git a/lm_eval/decontamination/janitor.py b/lm_eval/decontamination/janitor.py index f8bb08b054..5d43ee305b 100644 --- a/lm_eval/decontamination/janitor.py +++ b/lm_eval/decontamination/janitor.py @@ -9,6 +9,7 @@ # c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) -undefined dynamic_lookup try: import janitor_util + JANITOR_CPP = True except Exception as e: print("WARNING: C++ module 
could not be loaded. Janitor running in python mode") @@ -41,6 +42,7 @@ def word_ngrams(s, n): ngram_seqs = form_ngrams(iter(tokens), n) return (" ".join(ngram) for ngram in ngram_seqs) + # Does character sequences only - combined faster function to play around with later # def word_ngrams_indices_combined(sequence, n): # current_word = "" @@ -70,7 +72,7 @@ def split_indices(s): """Splits a string on whitespaces and records the indices of each in the original string. @:return generator((word, (start_idx, end_idx)), ...) """ - return ((m.group(0), (m.start(), m.end() - 1)) for m in re.finditer(r'\S+', s)) + return ((m.group(0), (m.start(), m.end() - 1)) for m in re.finditer(r"\S+", s)) def word_ngrams_indices(s, n): @@ -90,22 +92,27 @@ def word_ngrams_indices(s, n): # ([word, word, ...], [(start,end), (start,end), ...]), # ... # ) - ngram_indices_pairs = (zip(*ngram_with_indices) for ngram_with_indices in ngram_seqs_with_indices) + ngram_indices_pairs = ( + zip(*ngram_with_indices) for ngram_with_indices in ngram_seqs_with_indices + ) # Generator of ( (word_ngram, (start, end)), (word_ngram, start, end)), ...) - return ((" ".join(ngram_seq), (indices[0][0], indices[-1][1])) for ngram_seq, indices in ngram_indices_pairs) + return ( + (" ".join(ngram_seq), (indices[0][0], indices[-1][1])) + for ngram_seq, indices in ngram_indices_pairs + ) class Janitor: # FIXME delete_chars: Should anything else go here? Special chars? def __init__( - self, - ngram_n=13, - window_to_remove=200, - too_dirty_cutoff=10, - minimum_slice_length=200, - delete_chars=string.punctuation + self, + ngram_n=13, + window_to_remove=200, + too_dirty_cutoff=10, + minimum_slice_length=200, + delete_chars=string.punctuation, ): self.ngram_n = ngram_n self.window_to_remove = window_to_remove @@ -121,7 +128,7 @@ def __init__( self.translation_table = str.maketrans( string.ascii_lowercase + string.ascii_uppercase, # These characters string.ascii_lowercase * 2, # Become these characters - self.delete_chars # These are deleted + self.delete_chars, # These are deleted ) ############## @@ -129,14 +136,13 @@ def __init__( ############## def save_contamination_ngrams(self, filename): - with open(filename, 'wb') as fp: + with open(filename, "wb") as fp: pickle.dump(filename, fp) def load_contamination_ngrams(self, filename): - with open(filename, 'rb') as fp: + with open(filename, "rb") as fp: self.dirt_ngrams = pickle.load(fp) - ############## # Call these :) ############## @@ -171,11 +177,11 @@ def _split_chunks(self, dirty_string, dirty_parts): end = min(len(dirty_string), end + self.window_to_remove) if start - splice_idx > self.minimum_slice_length: - clean_chunks.append(dirty_string[splice_idx: start]) + clean_chunks.append(dirty_string[splice_idx:start]) splice_idx = end if end < len(dirty_string) - self.minimum_slice_length: - clean_chunks.append(dirty_string[end+1:]) + clean_chunks.append(dirty_string[end + 1 :]) return clean_chunks @@ -184,10 +190,14 @@ def _split_chunks(self, dirty_string, dirty_parts): ############## def register_contaminant_cpp(self, dirt_string): - self.dirt_ngrams.update(janitor_util.clean_ngram(dirt_string, self.delete_chars, self.ngram_n)) + self.dirt_ngrams.update( + janitor_util.clean_ngram(dirt_string, self.delete_chars, self.ngram_n) + ) def clean_cpp(self, dirty_string): - contamination_indices = janitor_util.clean_ngram_with_indices(dirty_string, self.delete_chars, self.ngram_n) + contamination_indices = janitor_util.clean_ngram_with_indices( + dirty_string, self.delete_chars, self.ngram_n + ) 
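# Aside (illustrative, not part of this patch): the translation table built in
# Janitor.__init__ above lowercases letters and deletes punctuation in a single
# pass, which is what normalize_string relies on:
import string

table = str.maketrans(
    string.ascii_lowercase + string.ascii_uppercase,  # these characters...
    string.ascii_lowercase * 2,  # ...all map to lowercase
    string.punctuation,  # and these are deleted (the delete_chars default)
)
assert "Hello, World!".translate(table) == "hello world"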
return self._split_chunks(dirty_string, contamination_indices) ############## @@ -198,7 +208,9 @@ def normalize_string(self, s): return s.translate(self.translation_table) def register_contaminant_python(self, dirt_string): - self.dirt_ngrams.update(word_ngrams(self.normalize_string(dirt_string), self.ngram_n)) + self.dirt_ngrams.update( + word_ngrams(self.normalize_string(dirt_string), self.ngram_n) + ) def clean_python(self, dirty_string): contamination_indices = ( @@ -249,29 +261,29 @@ def clean_python(self, dirty_string): # data = f.read() # jan = Janitor(too_dirty_cutoff=1000) # jan.register_contaminant(''' -# theories is that there is a connection between "geekdom" and autism. +# theories is that there is a connection between "geekdom" and autism. # This is hinted, for instance, by a ''Wired Magazine'' article in 2001 entitled " -# The [[Geek]] Syndrome", which is a point argued by many in the autism rights -# movement{{ref|Wired}}. This article, many professionals assert, is just one example of +# The [[Geek]] Syndrome", which is a point argued by many in the autism rights +# movement{{ref|Wired}}. This article, many professionals assert, is just one example of # the media's application of mental disease labels to what is actually variant normal behavior # &mdash;they argue that shyness, lack of athletic ability or social skills, and intellectual -# interests, even when they seem unusual to others, are not in themselves signs of autism or +# interests, even when they seem unusual to others, are not in themselves signs of autism or # Asperger's syndrome. Others assert that it is actually the medical profession which is applying # mental disease labels to children who in the past would have simply been accepted as a little # different or even labeled 'gifted'. See [[clinomorphism]] for further discussion of this issue. # Due to the recent publicity surrounding autism and autis # ultan Al Nahyan]] granted [[Petroleum]] concessions, and oil was first found in 1958. At first, -# oil money had a marginal impact. A few lowrise concete buildings were erected, and the first -# paved road was completed in 1961, but Sheikh Shakbut, uncertain whether the new oil royalties -# would last, took a cautious approach, prefering to save the revenue rather than investing it in -# development. His brother, [[Zayed bin Sultan Al Nahayan]], saw that oil wealth had the potential -# to transform Abu Dhabi. The ruling Al Nahayan family decided that Sheikh Zayed should replace his -# brother as Ruler and carry out his vision of developing the country. On [[August 6]], [[1966]], -# with the assistance of the British, Sheikh Zayed became the new ruler. See generally, Al-Fahim, M, -# ''From Rags to Riches: A Story of Abu Dhabi'', Chapter Six (London Centre of Arab Studies, 1995), -# ISBN 1 900404 00 1. With the announcement by Britain in 1968 that it would withdraw from the -# Gulf area by 1971, Sheikh Zayed became the main driving force behind the formation of the -# [[United Arab Emirates]]. After the Emirates gained independence in 1971, +# oil money had a marginal impact. A few lowrise concete buildings were erected, and the first +# paved road was completed in 1961, but Sheikh Shakbut, uncertain whether the new oil royalties +# would last, took a cautious approach, prefering to save the revenue rather than investing it in +# development. His brother, [[Zayed bin Sultan Al Nahayan]], saw that oil wealth had the potential +# to transform Abu Dhabi. 
The ruling Al Nahayan family decided that Sheikh Zayed should replace his +# brother as Ruler and carry out his vision of developing the country. On [[August 6]], [[1966]], +# with the assistance of the British, Sheikh Zayed became the new ruler. See generally, Al-Fahim, M, +# ''From Rags to Riches: A Story of Abu Dhabi'', Chapter Six (London Centre of Arab Studies, 1995), +# ISBN 1 900404 00 1. With the announcement by Britain in 1968 that it would withdraw from the +# Gulf area by 1971, Sheikh Zayed became the main driving force behind the formation of the +# [[United Arab Emirates]]. After the Emirates gained independence in 1971, # ''') # """ diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index 1c754a42f8..abf5c28fc5 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -11,19 +11,29 @@ from lm_eval.utils import positional_deprecated, run_task_tests from lm_eval.decontamination.decontaminate import get_train_overlap + @positional_deprecated -def simple_evaluate(model, model_args=None, tasks=[], - num_fewshot=0, batch_size=None, device=None, - no_cache=False, limit=None, bootstrap_iters=100000, - description_dict=None, check_integrity=False, - decontamination_ngrams_path=None): +def simple_evaluate( + model, + model_args=None, + tasks=[], + num_fewshot=0, + batch_size=None, + device=None, + no_cache=False, + limit=None, + bootstrap_iters=100000, + description_dict=None, + check_integrity=False, + decontamination_ngrams_path=None, +): """Instantiate and evaluate a model on a list of tasks. :param model: Union[str, LM] Name of model or LM object, see lm_eval.models.get_model :param model_args: Optional[str] - String arguments for each model class, see LM.create_from_arg_string. + String arguments for each model class, see LM.create_from_arg_string. Ignored if `model` argument is a LM object. :param tasks: list[Union[str, Task]] List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise. 
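For orientation, a typical call into the reformatted entry point looks like the
sketch below; the model and task names are examples only, not something this
patch prescribes.

from lm_eval import evaluator

results = evaluator.simple_evaluate(
    model="gpt2",
    tasks=["lambada", "piqa"],
    num_fewshot=0,
    batch_size=8,
    device="cpu",
)
print(evaluator.make_table(results))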
@@ -40,7 +50,7 @@ def simple_evaluate(model, model_args=None, tasks=[], :param bootstrap_iters: Number of iterations for bootstrap statistics :param description_dict: dict[str, str] - Dictionary of custom task descriptions of the form: `task_name: description` + Dictionary of custom task descriptions of the form: `task_name: description` :param check_integrity: bool Whether to run the relevant part of the test suite for the tasks :return @@ -52,19 +62,25 @@ def simple_evaluate(model, model_args=None, tasks=[], assert tasks != [], "No tasks specified" if isinstance(model, str): - if model_args is None: model_args = "" - lm = lm_eval.models.get_model(model).create_from_arg_string(model_args, { - 'batch_size': batch_size, 'device': device - }) + if model_args is None: + model_args = "" + lm = lm_eval.models.get_model(model).create_from_arg_string( + model_args, {"batch_size": batch_size, "device": device} + ) else: assert isinstance(model, lm_eval.base.LM) lm = model if not no_cache: lm = lm_eval.base.CachingLM( - lm, 'lm_cache/' + model + '_' + model_args.replace('=', '-').replace(',', '_').replace('/', '-') + '.db' + lm, + "lm_cache/" + + model + + "_" + + model_args.replace("=", "-").replace(",", "_").replace("/", "-") + + ".db", ) - + task_dict = lm_eval.tasks.get_task_dict(tasks) if check_integrity: @@ -76,7 +92,7 @@ def simple_evaluate(model, model_args=None, tasks=[], num_fewshot=num_fewshot, limit=limit, description_dict=description_dict, - decontamination_ngrams_path=decontamination_ngrams_path, + decontamination_ngrams_path=decontamination_ngrams_path, ) # add info about the model and few shot config @@ -89,16 +105,26 @@ def simple_evaluate(model, model_args=None, tasks=[], "no_cache": no_cache, "limit": limit, "bootstrap_iters": bootstrap_iters, - "description_dict": description_dict + "description_dict": description_dict, } return results + decontaminate_suffix = "_decontaminate" + @positional_deprecated -def evaluate(lm, task_dict, provide_description=None, num_fewshot=0, limit=None, bootstrap_iters=100000, description_dict=None, - decontamination_ngrams_path=None): +def evaluate( + lm, + task_dict, + provide_description=None, + num_fewshot=0, + limit=None, + bootstrap_iters=100000, + description_dict=None, + decontamination_ngrams_path=None, +): """Instantiate and evaluate a model on a list of tasks. :param lm: obj @@ -114,7 +140,7 @@ def evaluate(lm, task_dict, provide_description=None, num_fewshot=0, limit=None, :param bootstrap_iters: Number of iterations for bootstrap statistics :param description_dict: dict[str, str] - Dictionary of custom task descriptions of the form: `task_name: description` + Dictionary of custom task descriptions of the form: `task_name: description` :return Dictionary of results """ @@ -124,14 +150,16 @@ def evaluate(lm, task_dict, provide_description=None, num_fewshot=0, limit=None, assert not provide_description # not implemented. 
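# Aside (worked example of the CachingLM path logic above; values hypothetical):
model, model_args = "gpt2", "device=cpu"
cache_path = (
    "lm_cache/"
    + model
    + "_"
    + model_args.replace("=", "-").replace(",", "_").replace("/", "-")
    + ".db"
)
assert cache_path == "lm_cache/gpt2_device-cpu.db"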
if provide_description is not None: # nudge people to not specify it at all - print("WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict") + print( + "WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict" + ) decontaminate = decontamination_ngrams_path is not None task_dict_items = [ (name, task) for name, task in task_dict.items() - if(task.has_validation_docs() or task.has_test_docs()) + if (task.has_validation_docs() or task.has_test_docs()) ] results = collections.defaultdict(dict) @@ -140,7 +168,7 @@ def evaluate(lm, task_dict, provide_description=None, num_fewshot=0, limit=None, requests = collections.defaultdict(list) requests_origin = collections.defaultdict(list) - overlaps = collections.defaultdict(list) # {task_name: contaminated_docs} + overlaps = collections.defaultdict(list) # {task_name: contaminated_docs} # If we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger # memory, we can always modify this plumbing to support that, but I didn't want to include it just yet because @@ -159,9 +187,9 @@ def evaluate(lm, task_dict, provide_description=None, num_fewshot=0, limit=None, # TODO: the test-fallback-to-val system isn't final, we should revisit it at some point if task.has_test_docs(): task_doc_func = task.test_docs - task_set = "test" # Required for caching in the decontamination + task_set = "test" # Required for caching in the decontamination elif task.has_validation_docs(): - task_set = "val" # Required for caching in the decontamination + task_set = "val" # Required for caching in the decontamination task_doc_func = task.validation_docs else: raise RuntimeError("Task has neither test_docs nor validation_docs") @@ -172,19 +200,22 @@ def evaluate(lm, task_dict, provide_description=None, num_fewshot=0, limit=None, rnd.seed(42) rnd.shuffle(task_docs) - description = description_dict[task_name] if description_dict and task_name in description_dict else "" + description = ( + description_dict[task_name] + if description_dict and task_name in description_dict + else "" + ) for doc_id, doc in enumerate(itertools.islice(task_docs, 0, limit)): if decontaminate and task.should_decontaminate(): - docs_for_decontamination[(task_name, task_set)].append(task.doc_to_decontamination_query(doc)) + docs_for_decontamination[(task_name, task_set)].append( + task.doc_to_decontamination_query(doc) + ) docs[(task_name, doc_id)] = doc ctx = task.fewshot_context( - doc=doc, - num_fewshot=num_fewshot, - rnd=rnd, - description=description + doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description ) reqs = task.construct_requests(doc, ctx) if not isinstance(reqs, (list, tuple)): @@ -198,7 +229,9 @@ def evaluate(lm, task_dict, provide_description=None, num_fewshot=0, limit=None, # Compare all tasks/sets at once to ensure a single training set scan if decontaminate: print("Finding train/test overlap, please wait...") - overlaps = get_train_overlap(docs_for_decontamination, decontamination_ngrams_path, limit) + overlaps = get_train_overlap( + docs_for_decontamination, decontamination_ngrams_path, limit + ) # all responses for each (task, doc) process_res_queue = collections.defaultdict(list) @@ -212,11 +245,13 @@ def evaluate(lm, task_dict, provide_description=None, num_fewshot=0, limit=None, print("Running", reqtype, "requests") resps = getattr(lm, reqtype)([req.args for req in reqs]) - resps = [x if req.index is None else 
x[req.index] for x, req in zip(resps, reqs)] + resps = [ + x if req.index is None else x[req.index] for x, req in zip(resps, reqs) + ] for resp, (i, task_name, doc, doc_id) in zip(resps, requests_origin[reqtype]): process_res_queue[(task_name, doc_id)].append((i, resp)) - + vals = collections.defaultdict(list) # unpack results and sort back in order and return control to Task @@ -235,13 +270,15 @@ def evaluate(lm, task_dict, provide_description=None, num_fewshot=0, limit=None, if decontaminate and task_name in overlaps: if doc_id not in overlaps[task_name]: vals[(task_name, metric + decontaminate_suffix)].append(value) - + # aggregate results for (task_name, metric), items in vals.items(): task = task_dict[task_name] - real_metric = metric # key when looking up the metric with task.aggregation + real_metric = metric # key when looking up the metric with task.aggregation if metric.endswith(decontaminate_suffix): - real_metric = metric.replace(decontaminate_suffix, "") # decontaminated still uses the same metric + real_metric = metric.replace( + decontaminate_suffix, "" + ) # decontaminated still uses the same metric results[task_name][metric] = task.aggregation()[real_metric](items) # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap @@ -249,16 +286,15 @@ def evaluate(lm, task_dict, provide_description=None, num_fewshot=0, limit=None, stderr = lm_eval.metrics.stderr_for_metric( metric=task.aggregation()[real_metric], - bootstrap_iters=min(bootstrap_iters, 1000) if metric in ["bleu", "chrf", "ter"] else bootstrap_iters, + bootstrap_iters=min(bootstrap_iters, 1000) + if metric in ["bleu", "chrf", "ter"] + else bootstrap_iters, ) - + if stderr is not None: results[task_name][metric + "_stderr"] = stderr(items) - - return { - "results": dict(results), - "versions": dict(versions) - } + + return {"results": dict(results), "versions": dict(versions)} def make_table(result_dict): @@ -280,9 +316,9 @@ def make_table(result_dict): if m + "_stderr" in dic: se = dic[m + "_stderr"] - values.append([k, version, m, '%.4f' % v, '±', '%.4f' % se]) + values.append([k, version, m, "%.4f" % v, "±", "%.4f" % se]) else: - values.append([k, version, m, '%.4f' % v, '', '']) + values.append([k, version, m, "%.4f" % v, "", ""]) k = "" version = "" md_writer.value_matrix = values diff --git a/lm_eval/metrics.py b/lm_eval/metrics.py index 05fad59ff3..8f30a42695 100644 --- a/lm_eval/metrics.py +++ b/lm_eval/metrics.py @@ -103,6 +103,7 @@ def weighted_mean(items): def weighted_perplexity(items): return math.exp(-weighted_mean(items)) + def bits_per_byte(items): return -weighted_mean(items) / math.log(2) @@ -184,8 +185,10 @@ def _sacreformat(refs, preds): return refs, preds + # stderr stuff + class _bootstrap_internal: def __init__(self, f, n): self.f = f @@ -203,9 +206,10 @@ def __call__(self, v): def bootstrap_stderr(f, xs, iters): import multiprocessing as mp + pool = mp.Pool(mp.cpu_count()) # this gives a biased estimate of the stderr (i.e w/ the mean, it gives something - # equivalent to stderr calculated without Bessel's correction in the stddev. + # equivalent to stderr calculated without Bessel's correction in the stddev. 
# Unfortunately, I haven't been able to figure out what the right correction is # to make the bootstrap unbiased - i considered multiplying by sqrt(n/(n-1)) but # that would be ad-hoc and I can't prove that that would actually be an unbiased estimator) @@ -213,10 +217,15 @@ def bootstrap_stderr(f, xs, iters): res = [] chunk_size = min(1000, iters) from tqdm import tqdm + print("bootstrapping for stddev:", f.__name__) - for bootstrap in tqdm(pool.imap( + for bootstrap in tqdm( + pool.imap( _bootstrap_internal(f, chunk_size), - [(i, xs) for i in range(iters // chunk_size)]), total=iters // chunk_size): + [(i, xs) for i in range(iters // chunk_size)], + ), + total=iters // chunk_size, + ): # sample w replacement res.extend(bootstrap) @@ -238,17 +247,13 @@ def stderr_for_metric(metric, bootstrap_iters): if metric in bootstrappable: return lambda x: bootstrap_stderr(metric, x, iters=bootstrap_iters) - stderr = { - mean: mean_stderr, - acc_all: acc_all_stderr - - } + stderr = {mean: mean_stderr, acc_all: acc_all_stderr} return stderr.get(metric, None) def yesno(x): if x: - return 'yes' + return "yes" else: - return 'no' + return "no" diff --git a/lm_eval/models/gpt2.py b/lm_eval/models/gpt2.py index 338d5d76bb..0a387a1d2c 100644 --- a/lm_eval/models/gpt2.py +++ b/lm_eval/models/gpt2.py @@ -4,15 +4,22 @@ class HFLM(BaseLM): - - def __init__(self, device='cuda', pretrained='gpt2', revision='main', subfolder=None, tokenizer=None, batch_size=1): + def __init__( + self, + device="cuda", + pretrained="gpt2", + revision="main", + subfolder=None, + tokenizer=None, + batch_size=1, + ): super().__init__() assert isinstance(device, str) assert isinstance(pretrained, str) assert isinstance(batch_size, int) - if device: + if device: if device not in ["cuda", "cpu"]: device = int(device) self._device = torch.device(device) @@ -20,28 +27,47 @@ def __init__(self, device='cuda', pretrained='gpt2', revision='main', subfolder= else: print("Device not specificed") print(f"Cuda Available? {torch.cuda.is_available()}") - self._device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + self._device = ( + torch.device("cuda") + if torch.cuda.is_available() + else torch.device("cpu") + ) # TODO: update this to be less of a hack once subfolder is fixed in HF self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained( - pretrained, revision=revision + ("/" + subfolder if subfolder is not None else "") + pretrained, + revision=revision + ("/" + subfolder if subfolder is not None else ""), ).to(self.device) self.gpt2.eval() # pretrained tokenizer for neo is broken for now so just hard-coding this to gpt2 self.tokenizer = transformers.AutoTokenizer.from_pretrained( - pretrained if tokenizer is None else tokenizer, revision=revision, subfolder=subfolder) + pretrained if tokenizer is None else tokenizer, + revision=revision, + subfolder=subfolder, + ) - assert isinstance(self.tokenizer, ( - transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast, - transformers.T5Tokenizer, transformers.T5TokenizerFast, - )), "this tokenizer has not been checked for compatibility yet!" + assert isinstance( + self.tokenizer, + ( + transformers.GPT2Tokenizer, + transformers.GPT2TokenizerFast, + transformers.T5Tokenizer, + transformers.T5TokenizerFast, + ), + ), "this tokenizer has not been checked for compatibility yet!" 
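# Aside (illustrative): the compatibility assert above can be reproduced
# standalone with the stock GPT-2 tokenizer; the expected ids are the same
# ones hard-coded in the check:
import transformers

tok = transformers.GPT2TokenizerFast.from_pretrained("gpt2")
assert tok.encode("hello\n\nhello") == [31373, 198, 198, 31373]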
self.vocab_size = self.tokenizer.vocab_size - if isinstance(self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast)): - assert self.tokenizer.encode('hello\n\nhello') == [31373, 198, 198, 31373], \ - self.tokenizer.encode('hello\n\nhello') + if isinstance( + self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast) + ): + assert self.tokenizer.encode("hello\n\nhello") == [ + 31373, + 198, + 198, + 31373, + ], self.tokenizer.encode("hello\n\nhello") # multithreading and batching self.batch_size_per_gpu = batch_size # todo: adaptive batch size @@ -80,7 +106,7 @@ def device(self): def tok_encode(self, string: str): return self.tokenizer.encode(string, add_special_tokens=False) - + def tok_decode(self, tokens): return self.tokenizer.decode(tokens) @@ -94,13 +120,10 @@ def _model_call(self, inps): """ with torch.no_grad(): return self.gpt2(inps)[0][:, :, :50257] - + def _model_generate(self, context, max_length, eos_token_id): return self.gpt2.generate( - context, - max_length=max_length, - eos_token_id=eos_token_id, - do_sample=False + context, max_length=max_length, eos_token_id=eos_token_id, do_sample=False ) diff --git a/lm_eval/models/gpt3.py b/lm_eval/models/gpt3.py index 3ca6dca5d6..80e9be4d8c 100644 --- a/lm_eval/models/gpt3.py +++ b/lm_eval/models/gpt3.py @@ -31,22 +31,24 @@ def get_result(response, ctxlen): if top_token != token: is_greedy = False break - + return continuation_logprobs, is_greedy def oa_completion(**kwargs): - """ Query OpenAI API for completion. + """Query OpenAI API for completion. Retry with back-off until they respond """ import openai + backoff_time = 3 while True: try: return openai.Completion.create(**kwargs) except openai.error.OpenAIError: import traceback + traceback.print_exc() time.sleep(backoff_time) backoff_time *= 1.5 @@ -66,16 +68,19 @@ def __init__(self, engine, truncate=False): super().__init__() import openai + self.engine = engine - self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2') + self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2") self.vocab_size = self.tokenizer.vocab_size # to make the annoying "Using pad_token, but it is not set yet." 
error go away self.tokenizer.pad_token = "<|endoftext|>" - assert self.tokenizer.encode('hello\n\nhello') == [31373, 198, 198, 31373] + assert self.tokenizer.encode("hello\n\nhello") == [31373, 198, 198, 31373] self.truncate = truncate - self.end_of_text_token_id = self.tokenizer.convert_tokens_to_ids(["<|endoftext|>"])[0] + self.end_of_text_token_id = self.tokenizer.convert_tokens_to_ids( + ["<|endoftext|>"] + )[0] # Read from environment variable OPENAI_API_SECRET_KEY openai.api_key = os.environ["OPENAI_API_SECRET_KEY"] @@ -105,7 +110,7 @@ def device(self): def tok_encode(self, string: str): return self.tokenizer.encode(string, add_special_tokens=False) - + def tok_decode(self, tokens): return self.tokenizer.decode(tokens) @@ -118,17 +123,22 @@ def _collate(x): # we care about and so we need some kind of backup for when it isn't toks = x[1] + x[2] return -len(toks), tuple(toks) - + reord = utils.Reorderer(requests, _collate) - for chunk in tqdm(list(utils.chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE)), disable=disable_tqdm): + for chunk in tqdm( + list(utils.chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE)), + disable=disable_tqdm, + ): inps = [] ctxlens = [] for cache_key, context_enc, continuation_enc in chunk: # max_length+1 because the API takes up to 2049 tokens, including the first context token - inp = (context_enc + continuation_enc)[-(self.max_length+1):] + inp = (context_enc + continuation_enc)[-(self.max_length + 1) :] # TODO: the logic is much simpler if we just look at the length of continuation tokens - ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - (self.max_length+1)) + ctxlen = len(context_enc) - max( + 0, len(context_enc) + len(continuation_enc) - (self.max_length + 1) + ) inps.append(inp) ctxlens.append(ctxlen) @@ -137,11 +147,14 @@ def _collate(x): engine=self.engine, prompt=inps, echo=True, - max_tokens=0, temperature=0., + max_tokens=0, + temperature=0.0, logprobs=10, ) - for resp, ctxlen, (cache_key, context_enc, continuation_enc) in zip(response.choices, ctxlens, chunk): + for resp, ctxlen, (cache_key, context_enc, continuation_enc) in zip( + response.choices, ctxlens, chunk + ): answer = get_result(resp, ctxlen) res.append(answer) @@ -160,7 +173,7 @@ def greedy_until(self, requests): def _collate(x): toks = self.tok_encode(x[0]) return len(toks), x[0] - + reord = utils.Reorderer(requests, _collate) def sameuntil_chunks(xs, size): @@ -172,38 +185,40 @@ def sameuntil_chunks(xs, size): ret = [] lastuntil = x[1] ret.append(x) - + if ret: yield ret, lastuntil # todo: more intelligent batching for heterogeneous `until` - for chunk, until in tqdm(list(sameuntil_chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE))): + for chunk, until in tqdm( + list(sameuntil_chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE)) + ): inps = [] for context, _ in chunk: context_enc = self.tok_encode(context) - inp = context_enc[-(self.max_length - self.max_gen_toks):] + inp = context_enc[-(self.max_length - self.max_gen_toks) :] inps.append(inp) response = oa_completion( engine=self.engine, prompt=inps, - max_tokens=self.max_gen_toks, - temperature=0., + max_tokens=self.max_gen_toks, + temperature=0.0, logprobs=10, stop=until, ) for resp, (context, until_) in zip(response.choices, chunk): - s = resp['text'] + s = resp["text"] for term in until_: s = s.split(term)[0] # partial caching self.cache_hook.add_partial("greedy_until", (context, until_), s) - + res.append(s) - + return reord.get_original(res) def _model_call(self, inps): diff --git 
a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index 3231278100..358d4c7002 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -59,8 +59,8 @@ # 6 total gpt3_translation_benchmarks = { - "wmt14": ['en-fr', 'fr-en'], # French - "wmt16": ['en-ro', 'ro-en', 'de-en', 'en-de'], # German, Romanian + "wmt14": ["en-fr", "fr-en"], # French + "wmt16": ["en-ro", "ro-en", "de-en", "en-de"], # German, Romanian } @@ -68,7 +68,7 @@ selected_translation_benchmarks = { **gpt3_translation_benchmarks, "wmt20": sacrebleu.get_langpairs_for_testset("wmt20"), - "iwslt17": ['en-ar', 'ar-en'] # Arabic + "iwslt17": ["en-ar", "ar-en"], # Arabic } # 319 total @@ -92,7 +92,7 @@ "rte": glue.RTE, "qnli": glue.QNLI, "qqp": glue.QQP, - #"stsb": glue.STSB, # not implemented yet + # "stsb": glue.STSB, # not implemented yet "sst": glue.SST, "wnli": glue.WNLI, # SuperGLUE @@ -103,34 +103,26 @@ "record": superglue.ReCoRD, "wic": superglue.WordsInContext, "wsc": superglue.SGWinogradSchemaChallenge, - # Order by benchmark/genre? "coqa": coqa.CoQA, "drop": drop.DROP, "lambada": lambada.LAMBADA, "lambada_cloze": lambada_cloze.LAMBADA_cloze, - # multilingual lambada **lambada_multilingual.construct_tasks(), - "wikitext": wikitext.WikiText, # "cbt-cn": cbt.CBTCN, # disabled pending context length fix # "cbt-ne": cbt.CBTNE, # disabled pending context length fix - "piqa": piqa.PiQA, "prost": prost.PROST, "mc_taco": mc_taco.MCTACO, - # Science related - "pubmedqa" : pubmedqa.Pubmed_QA, - "sciq" : sciq.SciQ, - + "pubmedqa": pubmedqa.Pubmed_QA, + "sciq": sciq.SciQ, "qasper": qasper.QASPER, - - "qa4mre_2011" : qa4mre.QA4MRE_2011, - "qa4mre_2012" : qa4mre.QA4MRE_2012, - "qa4mre_2013" : qa4mre.QA4MRE_2013, - + "qa4mre_2011": qa4mre.QA4MRE_2011, + "qa4mre_2012": qa4mre.QA4MRE_2012, + "qa4mre_2013": qa4mre.QA4MRE_2013, "triviaqa": triviaqa.TriviaQA, "arc_easy": arc.ARCEasy, "arc_challenge": arc.ARCChallenge, @@ -142,7 +134,7 @@ "squad2": squad.SQuAD2, "race": race.RACE, # "naturalqs": naturalqs.NaturalQs, # not implemented yet - "headqa": headqa.HeadQAEsDeprecated, # for backwards compat - headqa used to default to es + "headqa": headqa.HeadQAEsDeprecated, # for backwards compat - headqa used to default to es "headqa_es": headqa.HeadQAEs, "headqa_en": headqa.HeadQAEn, "mathqa": mathqa.MathQA, @@ -152,21 +144,17 @@ "anli_r1": anli.ANLIRound1, "anli_r2": anli.ANLIRound2, "anli_r3": anli.ANLIRound3, - "ethics_cm": hendrycks_ethics.EthicsCM, "ethics_deontology": hendrycks_ethics.EthicsDeontology, "ethics_justice": hendrycks_ethics.EthicsJustice, "ethics_utilitarianism_original": hendrycks_ethics.EthicsUtilitarianismOriginal, "ethics_utilitarianism": hendrycks_ethics.EthicsUtilitarianism, "ethics_virtue": hendrycks_ethics.EthicsVirtue, - - "truthfulqa_mc": truthfulqa.TruthfulQAMultipleChoice, - "truthfulqa_gen": truthfulqa.TruthfulQAGeneration, - + "truthfulqa_mc": truthfulqa.TruthfulQAMultipleChoice, + "truthfulqa_gen": truthfulqa.TruthfulQAGeneration, # dialogue "mutual": mutual.MuTual, "mutual_plus": mutual.MuTualPlus, - # math "math_algebra": hendrycks_math.MathAlgebra, "math_counting_and_prob": hendrycks_math.MathCountingAndProbability, @@ -177,7 +165,6 @@ "math_precalc": hendrycks_math.MathPrecalculus, "math_asdiv": asdiv.Asdiv, "gsm8k": gsm8k.GradeSchoolMath8K, - # arithmetic "arithmetic_2da": arithmetic.Arithmetic2DPlus, "arithmetic_2ds": arithmetic.Arithmetic2DMinus, @@ -191,22 +178,18 @@ "arithmetic_1dc": arithmetic.Arithmetic1DComposite, # TODO Perhaps make these groups of tasks # e.g. 
anli, arithmetic, openai_translations, harness_translations - # hendrycksTest (57 tasks) **hendrycks_test.create_all_tasks(), - # e.g. wmt14-fr-en **translation.create_tasks_from_benchmarks(gpt3_translation_benchmarks), # chef's selection, mostly wmt20 **translation.create_tasks_from_benchmarks(selected_translation_benchmarks), - # Word Scrambling and Manipulation Tasks "anagrams1": unscramble.Anagrams1, "anagrams2": unscramble.Anagrams2, "cycle_letters": unscramble.CycleLetters, "random_insertion": unscramble.RandomInsertion, "reversed_words": unscramble.ReversedWords, - # Pile "pile_arxiv": pile.PileArxiv, "pile_books3": pile.PileBooks3, @@ -230,7 +213,6 @@ "pile_ubuntu-irc": pile.PileUbuntuIrc, "pile_wikipedia": pile.PileWikipedia, "pile_youtubesubtitles": pile.PileYoutubeSubtitles, - # BLiMP "blimp_adjunct_island": blimp.BlimpAdjunctIsland, "blimp_anaphor_gender_agreement": blimp.BlimpAnaphorGenderAgreement, @@ -299,7 +281,6 @@ "blimp_wh_vs_that_no_gap_long_distance": blimp.BlimpWhVsThatNoGapLongDistance, "blimp_wh_vs_that_with_gap": blimp.BlimpWhVsThatWithGap, "blimp_wh_vs_that_with_gap_long_distance": blimp.BlimpWhVsThatWithGapLongDistance, - # Requires manual download of data. # "storycloze_2016": storycloze.StoryCloze2016, # "storycloze_2018": storycloze.StoryCloze2018, @@ -323,19 +304,25 @@ def get_task_name_from_object(task_object): for name, class_ in TASK_REGISTRY.items(): if class_ is task_object: return name - + # this gives a mechanism for non-registered tasks to have a custom name anyways when reporting - return task_object.EVAL_HARNESS_NAME if hasattr(task_object, "EVAL_HARNESS_NAME") else type(task_object).__name__ + return ( + task_object.EVAL_HARNESS_NAME + if hasattr(task_object, "EVAL_HARNESS_NAME") + else type(task_object).__name__ + ) def get_task_dict(task_name_list: List[Union[str, lm_eval.base.Task]]): task_name_dict = { task_name: get_task(task_name)() - for task_name in task_name_list if isinstance(task_name, str) + for task_name in task_name_list + if isinstance(task_name, str) } task_name_from_object_dict = { get_task_name_from_object(task_object): task_object - for task_object in task_name_list if not isinstance(task_object, str) + for task_object in task_name_list + if not isinstance(task_object, str) } assert set(task_name_dict.keys()).isdisjoint(set(task_name_from_object_dict.keys())) return {**task_name_dict, **task_name_from_object_dict} diff --git a/lm_eval/tasks/anli.py b/lm_eval/tasks/anli.py index cf5e1f7706..f475d61c96 100644 --- a/lm_eval/tasks/anli.py +++ b/lm_eval/tasks/anli.py @@ -61,10 +61,15 @@ def test_docs(self): def doc_to_text(self, doc): # OA does this a bit weirdly: they prepend "anli 1: anli 1: " to the beginning - # of the prompt (yes, repeating it!). also, " True, False, or Neither?" is directly - # appended onto the question, with no "Answer:" or even a newline. Do we *really* + # of the prompt (yes, repeating it!). also, " True, False, or Neither?" is directly + # appended onto the question, with no "Answer:" or even a newline. Do we *really* # want to do it exactly as OA did? 
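# Aside (toy example, not part of this patch): with a made-up doc, the prompt
# concatenation below renders as:
#   A cat sat on the mat.
#   Question: An animal is indoors. True, False, or Neither?
#   Answer:
doc = {"premise": "A cat sat on the mat.", "hypothesis": "An animal is indoors."}
prompt = (
    doc["premise"]
    + "\nQuestion: "
    + doc["hypothesis"]
    + " True, False, or Neither?\nAnswer:"
)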
- return doc['premise'] + '\nQuestion: ' + doc['hypothesis'] + ' True, False, or Neither?\nAnswer:' + return ( + doc["premise"] + + "\nQuestion: " + + doc["hypothesis"] + + " True, False, or Neither?\nAnswer:" + ) def should_decontaminate(self): return True @@ -76,27 +81,27 @@ def doc_to_target(self, doc): # True = entailment # False = contradiction # Neither = neutral - return " " + ["True", "Neither", "False"][doc['label']] + return " " + ["True", "Neither", "False"][doc["label"]] def construct_requests(self, doc, ctx): - """ Uses RequestFactory to construct Requests and returns an iterable of + """Uses RequestFactory to construct Requests and returns an iterable of Requests which will be sent to the LM. :param doc: The document as returned from training_docs, validation_docs, or test_docs. :param ctx: str - The context string, generated by fewshot_context. This includes the natural + The context string, generated by fewshot_context. This includes the natural language description, as well as the few shot examples, and the question - part of the document for `doc`. + part of the document for `doc`. """ - ll_true, _ = rf.loglikelihood(ctx, " True") - ll_neither, _ = rf.loglikelihood(ctx, " Neither") - ll_false, _ = rf.loglikelihood(ctx, " False") + ll_true, _ = rf.loglikelihood(ctx, " True") + ll_neither, _ = rf.loglikelihood(ctx, " Neither") + ll_false, _ = rf.loglikelihood(ctx, " False") return ll_true, ll_neither, ll_false - + def process_results(self, doc, results): - """Take a single document and the LM results and evaluates, returning a - dict where keys are the names of submetrics and values are the values of + """Take a single document and the LM results and evaluates, returning a + dict where keys are the names of submetrics and values are the values of the metric for that one document :param doc: @@ -106,29 +111,23 @@ def process_results(self, doc, results): """ gold = doc["label"] pred = np.argmax(results) - return { - "acc": pred == gold - } + return {"acc": pred == gold} def aggregation(self): """ :returns: {str: [float] -> float} - A dictionary where keys are the names of submetrics and values are + A dictionary where keys are the names of submetrics and values are functions that aggregate a list of metrics """ - return { - "acc": mean - } + return {"acc": mean} def higher_is_better(self): """ :returns: {str: bool} - A dictionary where keys are the names of submetrics and values are + A dictionary where keys are the names of submetrics and values are whether a higher value of the submetric is better """ - return { - "acc": True - } + return {"acc": True} class ANLIRound1(ANLIBase): diff --git a/lm_eval/tasks/arithmetic.py b/lm_eval/tasks/arithmetic.py index 301f9ef069..36631821f6 100644 --- a/lm_eval/tasks/arithmetic.py +++ b/lm_eval/tasks/arithmetic.py @@ -49,7 +49,7 @@ def validation_docs(self): def test_docs(self): return NotImplemented - + def doc_to_text(self, doc): return doc["context"] @@ -67,10 +67,8 @@ def construct_requests(self, doc, ctx): return is_prediction def process_results(self, doc, results): - is_prediction, = results - return { - "acc": is_prediction - } + (is_prediction,) = results + return {"acc": is_prediction} def aggregation(self): return { @@ -78,9 +76,7 @@ def aggregation(self): } def higher_is_better(self): - return { - "acc": True - } + return {"acc": True} class Arithmetic2DPlus(Arithmetic): diff --git a/lm_eval/tasks/asdiv.py b/lm_eval/tasks/asdiv.py index b2d0a6de79..46d31c0c52 100644 --- a/lm_eval/tasks/asdiv.py +++ b/lm_eval/tasks/asdiv.py 
@@ -54,48 +54,41 @@ def validation_docs(self): def test_docs(self): raise NotImplementedError("This dataset has no test docs") - def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None): + def fewshot_context( + self, doc, num_fewshot, provide_description=None, rnd=None, description=None + ): assert num_fewshot == 0, "ASDiv is intended only for the zero-shot setting." return super().fewshot_context( - doc=doc, - num_fewshot=num_fewshot, - rnd=rnd, - description=description + doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description ) def doc_to_text(self, doc): # TODO: add solution-type - return doc['body'] + '\n' + 'Question:' + doc['question'] + '\n' + 'Answer:' + return doc["body"] + "\n" + "Question:" + doc["question"] + "\n" + "Answer:" def should_decontaminate(self): return True def doc_to_decontamination_query(self, doc): - return doc['body'] + " " + doc['question'] + return doc["body"] + " " + doc["question"] def doc_to_target(self, doc): # TODO: add formula - answer = doc['answer'].split(' (')[0] + answer = doc["answer"].split(" (")[0] return " " + answer def construct_requests(self, doc, ctx): ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc)) return ll, is_greedy - + def process_results(self, doc, results): ll, is_greedy = results - return { - 'acc': int(is_greedy) - } - + return {"acc": int(is_greedy)} + def aggregation(self): - return { - 'acc': mean - } + return {"acc": mean} def higher_is_better(self): - return { - 'acc': True - } + return {"acc": True} diff --git a/lm_eval/tasks/blimp.py b/lm_eval/tasks/blimp.py index 797bfc8c1c..2460b10bfa 100644 --- a/lm_eval/tasks/blimp.py +++ b/lm_eval/tasks/blimp.py @@ -37,7 +37,7 @@ class BlimpTask(Task): def has_training_docs(self): return False - + def has_validation_docs(self): return True @@ -50,9 +50,13 @@ def validation_docs(self): # trained on this data. return self.dataset["train"] - def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None): + def fewshot_context( + self, doc, num_fewshot, provide_description=None, rnd=None, description=None + ): assert num_fewshot == 0 - assert rnd is not None, "A `random.Random` generator argument must be provided to `rnd`" + assert ( + rnd is not None + ), "A `random.Random` generator argument must be provided to `rnd`" assert not provide_description, ( "The `provide_description` arg will be removed in future versions. To prepend " "a custom description to the context, supply the corresponding string via the " @@ -60,7 +64,9 @@ def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, ) if provide_description is not None: # nudge people to not specify it at all - print("WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict") + print( + "WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict" + ) return "" diff --git a/lm_eval/tasks/cbt.py b/lm_eval/tasks/cbt.py index 6824cebaed..5dce4963ed 100644 --- a/lm_eval/tasks/cbt.py +++ b/lm_eval/tasks/cbt.py @@ -2,7 +2,7 @@ The Children’s Book Test (CBT) from the paper: https://research.fb.com/wp-content/uploads/2016/11/the_goldilocks_principle_reading_children_s_books_with_explicit_memory_representations.pdf -The Children's Book Test (CBT) is test of how well language models capture +The Children's Book Test (CBT) is test of how well language models capture meaning in children's books. 
Unlike standard language modelling benchmarks, it distinguishes the task of predicting syntactic function words from that of predicting lower-frequency words, which carry greater semantic content. @@ -19,7 +19,7 @@ _CITATION = """ @misc{hill2016goldilocks, - title={The Goldilocks Principle: Reading Children's Books with Explicit Memory Representations}, + title={The Goldilocks Principle: Reading Children's Books with Explicit Memory Representations}, author={Felix Hill and Antoine Bordes and Sumit Chopra and Jason Weston}, year={2016}, eprint={1511.02301}, @@ -86,7 +86,9 @@ def doc_to_target(self, doc): return "" def fewshot_examples(self, k, rnd): - assert k == 0, f"CBT is only implemented for the zero-shot setting. Given k={k}." + assert ( + k == 0 + ), f"CBT is only implemented for the zero-shot setting. Given k={k}." return super().fewshot_examples(k, rnd) def construct_requests(self, doc, ctx): @@ -120,9 +122,7 @@ def process_results(self, doc, results): """ gold = doc["options"].index(doc["answer"]) pred = np.argmax(results) - return { - "acc": pred == gold - } + return {"acc": pred == gold} def aggregation(self): """ @@ -130,9 +130,7 @@ def aggregation(self): A dictionary where keys are the names of submetrics and values are functions that aggregate a list of metrics """ - return { - "acc": mean - } + return {"acc": mean} def higher_is_better(self): """ @@ -140,9 +138,7 @@ def higher_is_better(self): A dictionary where keys are the names of submetrics and values are whether a higher value of the submetric is better """ - return { - "acc": True - } + return {"acc": True} class CBTCN(CBTBase): diff --git a/lm_eval/tasks/coqa.py b/lm_eval/tasks/coqa.py index 4f41f99e7c..dda81db460 100644 --- a/lm_eval/tasks/coqa.py +++ b/lm_eval/tasks/coqa.py @@ -2,9 +2,9 @@ CoQA: A Conversational Question Answering Challenge https://arxiv.org/pdf/1808.07042.pdf -CoQA is a large-scale dataset for building Conversational Question Answering -systems. The goal of the CoQA challenge is to measure the ability of machines to -understand a text passage and answer a series of interconnected questions that +CoQA is a large-scale dataset for building Conversational Question Answering +systems. The goal of the CoQA challenge is to measure the ability of machines to +understand a text passage and answer a series of interconnected questions that appear in a conversation. Homepage: https://stanfordnlp.github.io/coqa/ @@ -52,15 +52,17 @@ def test_docs(self): pass def doc_to_text(self, doc): - # Given a passage p, the conversation history {q1, a1, . . . qi−1, ai−1} + # Given a passage p, the conversation history {q1, a1, . . . 
qi−1, ai−1} # and a question qi, the task is to predict the answer ai - doc_text = doc["story"] + '\n\n' - for (q, a) in zip_longest(doc["questions"]["input_text"], doc["answers"]["input_text"][:-1]): # omit target answer ai + doc_text = doc["story"] + "\n\n" + for (q, a) in zip_longest( + doc["questions"]["input_text"], doc["answers"]["input_text"][:-1] + ): # omit target answer ai question = f"Q: {q}\n\n" answer = f"A: {a}\n\n" if a is not None else "A:" doc_text += question + answer return doc_text - + def should_decontaminate(self): return True @@ -73,28 +75,30 @@ def get_answers(cls, doc, turn_id): answers = [] answer_forturn = doc["answers"]["input_text"][turn_id - 1] answers.append(answer_forturn) - + additional_answers = doc.get("additional_answers") if additional_answers: for key in additional_answers: - additional_answer_for_turn = additional_answers[key]["input_text"][turn_id - 1] + additional_answer_for_turn = additional_answers[key]["input_text"][ + turn_id - 1 + ] if additional_answer_for_turn.lower() not in map(str.lower, answers): answers.append(additional_answer_for_turn) return answers - + @classmethod def get_answer_choice(self, raw_text): # Function maps answers to CoQA answer categories - # ~ 1/5 of the CoQA answers are Yes/No + # ~ 1/5 of the CoQA answers are Yes/No # ~ 2/3 of the CoQA answers are span-based # (answers overlap with the passage ignoring punctuation and case mismatch) if raw_text == "unknown": - return '0' + return "0" if squad_metrics.normalize_answer(raw_text) == "yes": - return '1' + return "1" if squad_metrics.normalize_answer(raw_text) == "no": - return '2' - return '3' # Not a yes/no question + return "2" + return "3" # Not a yes/no question @staticmethod def compute_scores(gold_list, pred): @@ -104,40 +108,45 @@ def compute_scores(gold_list, pred): em_sum = 0.0 if len(gold_list) > 1: for i in range(len(gold_list)): - gold_answers = gold_list[0:i] + gold_list[i + 1:] + gold_answers = gold_list[0:i] + gold_list[i + 1 :] # predictions compared against (n) golds and take maximum - em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_answers) + em_sum += max( + squad_metrics.compute_exact(a, pred) for a in gold_answers + ) f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_answers) else: em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_list) f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_list) - return {'em': em_sum / max(1, len(gold_list)), 'f1': f1_sum / max(1, len(gold_list))} + return { + "em": em_sum / max(1, len(gold_list)), + "f1": f1_sum / max(1, len(gold_list)), + } def doc_to_target(self, doc, turnid=None): # Default to prediction of last turn. if turnid is None: turnid = len(doc["questions"]["input_text"]) - raw_text = doc['answers']["input_text"][turnid - 1] + raw_text = doc["answers"]["input_text"][turnid - 1] return " " + raw_text def construct_requests(self, doc, ctx): - """ Uses RequestFactory to construct Requests and returns an iterable of + """Uses RequestFactory to construct Requests and returns an iterable of Requests which will be sent to the LM. :param doc: The document as returned from training_docs, validation_docs, or test_docs. :param ctx: str - The context string, generated by fewshot_context. This includes the natural + The context string, generated by fewshot_context. This includes the natural language description, as well as the few shot examples, and the question - part of the document for `doc`. + part of the document for `doc`. 
""" - cont_request = rf.greedy_until(ctx, ['\nQ:']) + cont_request = rf.greedy_until(ctx, ["\nQ:"]) return cont_request def process_results(self, doc, results): - """Take a single document and the LM results and evaluates, returning a - dict where keys are the names of submetrics and values are the values of + """Take a single document and the LM results and evaluates, returning a + dict where keys are the names of submetrics and values are the values of the metric for that one document :param doc: @@ -147,13 +156,13 @@ def process_results(self, doc, results): """ turn_id = len(doc["questions"]["input_text"]) gold_list = self.get_answers(doc, turn_id) - pred = results[0].strip().split('\n')[0] + pred = results[0].strip().split("\n")[0] scores = self.compute_scores(gold_list, pred) return { - "f1": scores['f1'], - "em": scores['em'], + "f1": scores["f1"], + "em": scores["em"], } def higher_is_better(self): diff --git a/lm_eval/tasks/drop.py b/lm_eval/tasks/drop.py index 689eb2244c..b9ce5f86eb 100644 --- a/lm_eval/tasks/drop.py +++ b/lm_eval/tasks/drop.py @@ -2,8 +2,8 @@ DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs https://aclanthology.org/attachments/N19-1246.Supplementary.pdf -DROP is a QA dataset which tests comprehensive understanding of paragraphs. In -this crowdsourced, adversarially-created, 96k question-answering benchmark, a +DROP is a QA dataset which tests comprehensive understanding of paragraphs. In +this crowdsourced, adversarially-created, 96k question-answering benchmark, a system must resolve multiple references in a question, map them onto a paragraph, and perform discrete operations over them (such as addition, counting, or sorting). @@ -24,7 +24,7 @@ _CITATION = """ @misc{dua2019drop, - title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs}, + title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs}, author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner}, year={2019}, eprint={1903.00161}, @@ -70,21 +70,26 @@ def _process_doc(self, doc): @classmethod def get_answers(cls, qa): def _flatten_validated_answers(validated_answers): - """ Flattens a dict of lists of validated answers. + """Flattens a dict of lists of validated answers. 
{"number": ['1', '8'], ...} -> [{"number": ['1'], ...}, {"number": ['8'], ...}] """ vas = [] for i in range(len(validated_answers["number"])): - vas.append({ - "number": validated_answers["number"][i], - "date": validated_answers["date"][i], - "spans": validated_answers["spans"][i], - }) + vas.append( + { + "number": validated_answers["number"][i], + "date": validated_answers["date"][i], + "spans": validated_answers["spans"][i], + } + ) return vas + answers = [] answers_set = set() - candidates = [qa["answer"]] + _flatten_validated_answers(qa["validated_answers"]) + candidates = [qa["answer"]] + _flatten_validated_answers( + qa["validated_answers"] + ) for candidate in candidates: answer = cls.parse_answer(candidate) if answer in answers_set: @@ -100,9 +105,11 @@ def parse_answer(cls, answer): return (str(answer["number"]),) if answer["spans"] != []: return tuple(answer["spans"]) - return (" ".join([answer["date"]["day"], - answer["date"]["month"], - answer["date"]["year"]]).strip(),) + return ( + " ".join( + [answer["date"]["day"], answer["date"]["month"], answer["date"]["year"]] + ).strip(), + ) def doc_to_text(self, doc): return f"Passage: {doc['passage']}\nQuestion: {doc['question']}\nAnswer:" @@ -111,7 +118,7 @@ def should_decontaminate(self): return True def doc_to_decontamination_query(self, doc): - return doc['passage'] + " " + doc['question'] + return doc["passage"] + " " + doc["question"] def doc_to_target(self, doc): return " " + ", ".join(doc["answers"][0]) @@ -148,10 +155,7 @@ def process_results(self, doc, results): if gold_answer[0].strip(): max_em = max(max_em, exact_match) max_f1 = max(max_f1, f1_score) - return { - "em": max_em, - "f1": max_f1 - } + return {"em": max_em, "f1": max_f1} def get_metrics(self, predicted, gold): """ @@ -164,7 +168,9 @@ def get_metrics(self, predicted, gold): predicted_bags = self._answer_to_bags(predicted) gold_bags = self._answer_to_bags(gold) - if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len(gold_bags[0]): + if set(predicted_bags[0]) == set(gold_bags[0]) and len( + predicted_bags[0] + ) == len(gold_bags[0]): exact_match = 1.0 else: exact_match = 0.0 @@ -196,7 +202,9 @@ def _align_bags(self, predicted, gold): for gold_index, gold_item in enumerate(gold): for pred_index, pred_item in enumerate(predicted): if self._match_numbers_if_present(gold_item, pred_item): - scores[gold_index, pred_index] = self._compute_f1(pred_item, gold_item) + scores[gold_index, pred_index] = self._compute_f1( + pred_item, gold_item + ) row_ind, col_ind = linear_sum_assignment(-scores) max_scores = np.zeros([max(len(gold), len(predicted))]) @@ -262,7 +270,11 @@ def _tokenize(self, text): def _normalize(self, answer): tokens = [ - self._white_space_fix(self._remove_articles(self._fix_number(self._remove_punc(token.lower())))) + self._white_space_fix( + self._remove_articles( + self._fix_number(self._remove_punc(token.lower())) + ) + ) for token in self._tokenize(answer) ] tokens = [token for token in tokens if token.strip()] @@ -275,10 +287,7 @@ def aggregation(self): A dictionary where keys are the names of submetrics and values are functions that aggregate a list of metrics """ - return { - "em": mean, - "f1": mean - } + return {"em": mean, "f1": mean} def higher_is_better(self): """ @@ -286,7 +295,4 @@ def higher_is_better(self): A dictionary where keys are the names of submetrics and values are whether a higher value of the submetric is better """ - return { - "em": True, - "f1": True - } + return {"em": True, "f1": True} diff --git 
a/lm_eval/tasks/glue.py b/lm_eval/tasks/glue.py index dfede78e88..a50a7cc3ff 100644 --- a/lm_eval/tasks/glue.py +++ b/lm_eval/tasks/glue.py @@ -68,7 +68,9 @@ def validation_docs(self): return self.dataset["validation"] def doc_to_text(self, doc): - return "{}\nQuestion: Does this sentence make sense?\nAnswer:".format(doc["sentence"]) + return "{}\nQuestion: Does this sentence make sense?\nAnswer:".format( + doc["sentence"] + ) def should_decontaminate(self): return True @@ -88,19 +90,13 @@ def process_results(self, doc, results): ll_true, ll_false = results pred = ll_true > ll_false gold = doc["label"] - return { - "mcc": (gold, pred) - } + return {"mcc": (gold, pred)} def higher_is_better(self): - return { - "mcc": True - } + return {"mcc": True} def aggregation(self): - return { - "mcc": matthews_corrcoef - } + return {"mcc": matthews_corrcoef} class SST(Task): @@ -142,19 +138,13 @@ def process_results(self, doc, results): ll_positive, ll_negative = results pred = ll_positive > ll_negative gold = doc["label"] - return { - "acc": pred == gold - } + return {"acc": pred == gold} def higher_is_better(self): - return { - "acc": True - } + return {"acc": True} def aggregation(self): - return { - "acc": mean - } + return {"acc": mean} # Inference Tasks @@ -190,7 +180,8 @@ def test_docs(self): def doc_to_text(self, doc): return "{}\nQuestion: {} True, False or Neither?\nAnswer:".format( doc["premise"], - doc["hypothesis"].strip() + ('' if doc["hypothesis"].strip().endswith('.') else '.'), + doc["hypothesis"].strip() + + ("" if doc["hypothesis"].strip().endswith(".") else "."), ) def doc_to_target(self, doc): @@ -208,19 +199,13 @@ def construct_requests(self, doc, ctx): def process_results(self, doc, results): gold = doc["label"] pred = np.argmax(results) - return { - "acc": pred == gold - } + return {"acc": pred == gold} def higher_is_better(self): - return { - "acc": True - } + return {"acc": True} def aggregation(self): - return { - "acc": mean - } + return {"acc": mean} class MNLIMismatched(MNLI): @@ -258,9 +243,11 @@ def validation_docs(self): return self.dataset["validation"] def doc_to_text(self, doc): - return "{}\n{}\nQuestion: Does this response answer the question?\nAnswer:".format( - doc["question"], - doc["sentence"], + return ( + "{}\n{}\nQuestion: Does this response answer the question?\nAnswer:".format( + doc["question"], + doc["sentence"], + ) ) def doc_to_target(self, doc): @@ -277,19 +264,13 @@ def process_results(self, doc, results): ll_yes, ll_no = results pred = ll_no > ll_yes gold = doc["label"] - return { - "acc": pred == gold - } + return {"acc": pred == gold} def higher_is_better(self): - return { - "acc": True - } + return {"acc": True} def aggregation(self): - return { - "acc": mean - } + return {"acc": mean} class WNLI(Task): @@ -334,19 +315,13 @@ def process_results(self, doc, results): ll_true, ll_false = results pred = ll_true > ll_false gold = doc["label"] - return { - "acc": pred == gold - } + return {"acc": pred == gold} def higher_is_better(self): - return { - "acc": True - } + return {"acc": True} def aggregation(self): - return { - "acc": mean - } + return {"acc": mean} class RTE(Task): @@ -391,19 +366,13 @@ def process_results(self, doc, results): ll_true, ll_false = results pred = ll_false > ll_true gold = doc["label"] - return { - "acc": pred == gold - } + return {"acc": pred == gold} def higher_is_better(self): - return { - "acc": True - } + return {"acc": True} def aggregation(self): - return { - "acc": mean - } + return {"acc": mean} # Similarity and 
Paraphrase Tasks @@ -455,16 +424,10 @@ def process_results(self, doc, results): } def higher_is_better(self): - return { - "acc": True, - "f1": True - } + return {"acc": True, "f1": True} def aggregation(self): - return { - "acc": mean, - "f1": f1_score - } + return {"acc": mean, "f1": f1_score} class QQP(Task): @@ -513,16 +476,10 @@ def process_results(self, doc, results): } def higher_is_better(self): - return { - "acc": True, - "f1": True - } + return {"acc": True, "f1": True} def aggregation(self): - return { - "acc": mean, - "f1": f1_score - } + return {"acc": mean, "f1": f1_score} class STSB(Task): @@ -560,22 +517,22 @@ def doc_to_target(self, doc): return " {}".format(doc["label"]) def construct_requests(self, doc, ctx): - """ Uses RequestFactory to construct Requests and returns an iterable of + """Uses RequestFactory to construct Requests and returns an iterable of Requests which will be sent to the LM. :param doc: The document as returned from training_docs, validation_docs, or test_docs. :param ctx: str - The context string, generated by fewshot_context. This includes the natural + The context string, generated by fewshot_context. This includes the natural language description, as well as the few shot examples, and the question - part of the document for `doc`. + part of the document for `doc`. """ # TODO: implement evaluation. - raise NotImplementedError('Evaluation not implemented') - + raise NotImplementedError("Evaluation not implemented") + def process_results(self, doc, results): - """Take a single document and the LM results and evaluates, returning a - dict where keys are the names of submetrics and values are the values of + """Take a single document and the LM results and evaluates, returning a + dict where keys are the names of submetrics and values are the values of the metric for that one document :param doc: @@ -584,22 +541,22 @@ def process_results(self, doc, results): The results of the requests created in construct_requests. """ # TODO: implement evaluation. - raise NotImplementedError('Evaluation not implemented') + raise NotImplementedError("Evaluation not implemented") def aggregation(self): """ :returns: {str: [float] -> float} - A dictionary where keys are the names of submetrics and values are + A dictionary where keys are the names of submetrics and values are functions that aggregate a list of metrics """ # TODO: implement evaluation. - raise NotImplementedError('Evaluation not implemented') + raise NotImplementedError("Evaluation not implemented") def higher_is_better(self): """ :returns: {str: bool} - A dictionary where keys are the names of submetrics and values are + A dictionary where keys are the names of submetrics and values are whether a higher value of the submetric is better """ # TODO: implement evaluation. - raise NotImplementedError('Evaluation not implemented') + raise NotImplementedError("Evaluation not implemented") diff --git a/lm_eval/tasks/gsm8k.py b/lm_eval/tasks/gsm8k.py index c2c487955c..ae9518871e 100644 --- a/lm_eval/tasks/gsm8k.py +++ b/lm_eval/tasks/gsm8k.py @@ -2,14 +2,14 @@ "Training Verifiers to Solve Math Word Problems" https://arxiv.org/abs/2110.14168 -State-of-the-art language models can match human performance on many tasks, but -they still struggle to robustly perform multi-step mathematical reasoning. To +State-of-the-art language models can match human performance on many tasks, but +they still struggle to robustly perform multi-step mathematical reasoning. 
To diagnose the failures of current models and support research, we introduce GSM8K, a dataset of 8.5K high quality linguistically diverse grade school math word problems. -We find that even the largest transformer models fail to achieve high test performance, +We find that even the largest transformer models fail to achieve high test performance, despite the conceptual simplicity of this problem distribution. -NOTE: See the official implementation of the task: +NOTE: See the official implementation of the task: https://github.com/openai/grade-school-math/blob/master/grade_school_math/calculator.py for how to make use of the dataset's calculator annotations in your language model's sample/generation function. @@ -64,13 +64,13 @@ def test_docs(self): return self.dataset["test"] def doc_to_text(self, doc): - return "Question: " + doc['question'] + '\nAnswer:' + return "Question: " + doc["question"] + "\nAnswer:" def doc_to_target(self, doc): - return " " + doc['answer'] + return " " + doc["answer"] def construct_requests(self, doc, ctx): - """ Uses RequestFactory to construct Requests and returns an iterable of + """Uses RequestFactory to construct Requests and returns an iterable of Requests which will be sent to the LM. :param doc: @@ -80,10 +80,10 @@ def construct_requests(self, doc, ctx): language description, as well as the few shot examples, and the question part of the document for `doc`. """ - # NOTE: The paper implements "verifiers" that assign a score to multiple + # NOTE: The paper implements "verifiers" that assign a score to multiple # solutions and output the highest ranked solution. - completion = rf.greedy_until(ctx, ['\n']) - return completion + completion = rf.greedy_until(ctx, ["\n"]) + return completion def _extract_answer(self, completion): match = ANS_RE.search(completion) @@ -97,7 +97,7 @@ def _extract_answer(self, completion): def _is_correct(self, completion, answer): gold = self._extract_answer(answer) assert gold != INVALID_ANS, "No ground truth answer found in the document." - return self._extract_answer(completion) == gold + return self._extract_answer(completion) == gold def process_results(self, doc, results): """Take a single document and the LM results and evaluates, returning a @@ -111,9 +111,7 @@ def process_results(self, doc, results): """ completion = results[0] answer = doc["answer"] - return { - "acc": self._is_correct(completion, answer) - } + return {"acc": self._is_correct(completion, answer)} def aggregation(self): """ @@ -121,9 +119,7 @@ def aggregation(self): A dictionary where keys are the names of submetrics and values are functions that aggregate a list of metrics """ - return { - "acc": mean - } + return {"acc": mean} def higher_is_better(self): """ @@ -131,6 +127,4 @@ def higher_is_better(self): A dictionary where keys are the names of submetrics and values are whether a higher value of the submetric is better """ - return { - "acc": True - } + return {"acc": True} diff --git a/lm_eval/tasks/headqa.py b/lm_eval/tasks/headqa.py index eac3367a42..fbb500ab73 100644 --- a/lm_eval/tasks/headqa.py +++ b/lm_eval/tasks/headqa.py @@ -2,7 +2,7 @@ Interpretable Multi-Step Reasoning with Knowledge Extraction on Complex Healthcare Question Answering https://aclanthology.org/P19-1092.pdf -HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to +HEAD-QA is a multi-choice HEAlthcare Dataset. 
The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. @@ -15,7 +15,7 @@ _CITATION = """ @misc{liu2020interpretable, - title={Interpretable Multi-Step Reasoning with Knowledge Extraction on Complex Healthcare Question Answering}, + title={Interpretable Multi-Step Reasoning with Knowledge Extraction on Complex Healthcare Question Answering}, author={Ye Liu and Shaika Chowdhury and Chenwei Zhang and Cornelia Caragea and Philip S. Yu}, year={2020}, eprint={2008.02434}, @@ -82,4 +82,6 @@ class HeadQAEsDeprecated(HeadQABase): def __init__(self): super().__init__() - print("WARNING: headqa is deprecated. Please use headqa_es or headqa_en instead. See https://github.com/EleutherAI/lm-evaluation-harness/pull/240 for more info.") + print( + "WARNING: headqa is deprecated. Please use headqa_es or headqa_en instead. See https://github.com/EleutherAI/lm-evaluation-harness/pull/240 for more info." + ) diff --git a/lm_eval/tasks/hellaswag.py b/lm_eval/tasks/hellaswag.py index 00c61ce7c5..0169b5bde3 100644 --- a/lm_eval/tasks/hellaswag.py +++ b/lm_eval/tasks/hellaswag.py @@ -1,77 +1,77 @@ -""" -HellaSwag: Can a Machine Really Finish Your Sentence? -https://arxiv.org/pdf/1905.07830.pdf - -Hellaswag is a commonsense inference challenge dataset. Though its questions are -trivial for humans (>95% accuracy), state-of-the-art models struggle (<48%). This is -achieved via Adversarial Filtering (AF), a data collection paradigm wherein a -series of discriminators iteratively select an adversarial set of machine-generated -wrong answers. AF proves to be surprisingly robust. The key insight is to scale up -the length and complexity of the dataset examples towards a critical 'Goldilocks' -zone wherein generated text is ridiculous to humans, yet often misclassified by -state-of-the-art models. - -Homepage: https://rowanzellers.com/hellaswag/ -""" -import re -from lm_eval.base import MultipleChoiceTask - - -_CITATION = """ -@inproceedings{zellers2019hellaswag, - title={HellaSwag: Can a Machine Really Finish Your Sentence?}, - author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin}, - booktitle ={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, - year={2019} -} -""" - - -class HellaSwag(MultipleChoiceTask): - VERSION = 0 - DATASET_PATH = "hellaswag" - DATASET_NAME = None - - def has_training_docs(self): - return True - - def has_validation_docs(self): - return True - - def has_test_docs(self): - return False - - def training_docs(self): - if self._training_docs is None: - self._training_docs = list(map(self._process_doc, self.dataset["train"])) - return self._training_docs - - def validation_docs(self): - return map(self._process_doc, self.dataset["validation"]) - - def _process_doc(self, doc): - ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize() - out_doc = { - "query": self.preprocess(doc['activity_label'] + ': ' + ctx), - "choices": [self.preprocess(ending) for ending in doc['endings']], - "gold": int(doc['label']), - } - return out_doc - - @classmethod - def preprocess(cls, text): - text = text.strip() - # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag. - text = text.replace(" [title]", ". 
") - text = re.sub('\\[.*?\\]', '', text) - text = text.replace(" ", " ") - return text - - def doc_to_text(self, doc): - return doc["query"] - - def should_decontaminate(self): - return True - - def doc_to_decontamination_query(self, doc): - return doc["query"] +""" +HellaSwag: Can a Machine Really Finish Your Sentence? +https://arxiv.org/pdf/1905.07830.pdf + +Hellaswag is a commonsense inference challenge dataset. Though its questions are +trivial for humans (>95% accuracy), state-of-the-art models struggle (<48%). This is +achieved via Adversarial Filtering (AF), a data collection paradigm wherein a +series of discriminators iteratively select an adversarial set of machine-generated +wrong answers. AF proves to be surprisingly robust. The key insight is to scale up +the length and complexity of the dataset examples towards a critical 'Goldilocks' +zone wherein generated text is ridiculous to humans, yet often misclassified by +state-of-the-art models. + +Homepage: https://rowanzellers.com/hellaswag/ +""" +import re +from lm_eval.base import MultipleChoiceTask + + +_CITATION = """ +@inproceedings{zellers2019hellaswag, + title={HellaSwag: Can a Machine Really Finish Your Sentence?}, + author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin}, + booktitle ={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, + year={2019} +} +""" + + +class HellaSwag(MultipleChoiceTask): + VERSION = 0 + DATASET_PATH = "hellaswag" + DATASET_NAME = None + + def has_training_docs(self): + return True + + def has_validation_docs(self): + return True + + def has_test_docs(self): + return False + + def training_docs(self): + if self._training_docs is None: + self._training_docs = list(map(self._process_doc, self.dataset["train"])) + return self._training_docs + + def validation_docs(self): + return map(self._process_doc, self.dataset["validation"]) + + def _process_doc(self, doc): + ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize() + out_doc = { + "query": self.preprocess(doc["activity_label"] + ": " + ctx), + "choices": [self.preprocess(ending) for ending in doc["endings"]], + "gold": int(doc["label"]), + } + return out_doc + + @classmethod + def preprocess(cls, text): + text = text.strip() + # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag. + text = text.replace(" [title]", ". ") + text = re.sub("\\[.*?\\]", "", text) + text = text.replace(" ", " ") + return text + + def doc_to_text(self, doc): + return doc["query"] + + def should_decontaminate(self): + return True + + def doc_to_decontamination_query(self, doc): + return doc["query"] diff --git a/lm_eval/tasks/hendrycks_ethics.py b/lm_eval/tasks/hendrycks_ethics.py index fc3d43bfb7..0916710832 100644 --- a/lm_eval/tasks/hendrycks_ethics.py +++ b/lm_eval/tasks/hendrycks_ethics.py @@ -108,19 +108,13 @@ def process_results(self, doc, results): ll_yes, ll_no = results pred = ll_yes > ll_no gold = bool(int(doc["label"])) - return { - "acc": pred == gold - } + return {"acc": pred == gold} def aggregation(self): - return { - 'acc': mean - } + return {"acc": mean} def higher_is_better(self): - return { - 'acc': True - } + return {"acc": True} class EthicsDeontology(Ethics): @@ -129,7 +123,9 @@ class EthicsDeontology(Ethics): def doc_to_text(self, doc): prompt = " ".join([doc["scenario"], doc["excuse"]]) - return "Question: Would most people believe this reasonable or unreasonable to say? 
\"{}\"\nAnswer:".format(prompt) + return 'Question: Would most people believe this reasonable or unreasonable to say? "{}"\nAnswer:'.format( + prompt + ) def should_decontaminate(self): return True @@ -149,30 +145,27 @@ def construct_requests(self, doc, ctx): def process_results(self, doc, results): pred = np.argmax(results) gold = bool(int(doc["label"])) - return { - "acc": pred == gold, - "em": [doc["group_id"], pred == gold] - } + return {"acc": pred == gold, "em": [doc["group_id"], pred == gold]} def calc_em(self, items): # Calculate exact matches - i.e. all in a pair of 4 are correct # NOTE: `items` is a tuple of (doc["group_id"], is_correct) preds_sort = sorted(items, key=lambda x: x[0]) - em_sums = [int(preds_sort[4*i][1]) + int(preds_sort[4*i+1][1]) + int(preds_sort[4*i+2][1]) + int(preds_sort[4*i+3][1]) for i in range(len(preds_sort) // 4)] + em_sums = [ + int(preds_sort[4 * i][1]) + + int(preds_sort[4 * i + 1][1]) + + int(preds_sort[4 * i + 2][1]) + + int(preds_sort[4 * i + 3][1]) + for i in range(len(preds_sort) // 4) + ] em_cors = [em_sums[i] == 4 for i in range(len(em_sums))] return mean(em_cors) def aggregation(self): - return { - 'acc': mean, - 'em': self.calc_em - } + return {"acc": mean, "em": self.calc_em} def higher_is_better(self): - return { - 'acc': True, - 'em': True - } + return {"acc": True, "em": True} class EthicsJustice(Ethics): @@ -180,7 +173,9 @@ class EthicsJustice(Ethics): DATASET_NAME = "justice" def doc_to_text(self, doc): - return "Question: Would most people believe this reasonable or unreasonable to say? \"{}\"\nAnswer:".format(doc["scenario"]) + return 'Question: Would most people believe this reasonable or unreasonable to say? "{}"\nAnswer:'.format( + doc["scenario"] + ) def should_decontaminate(self): return True @@ -200,30 +195,27 @@ def construct_requests(self, doc, ctx): def process_results(self, doc, results): pred = np.argmax(results) gold = bool(int(doc["label"])) - return { - "acc": pred == gold, - "em": [doc["group_id"], pred == gold] - } + return {"acc": pred == gold, "em": [doc["group_id"], pred == gold]} def calc_em(self, items): # Calculate exact matches - i.e. all in a pair of 4 are correct # NOTE: `items` is a tuple of (doc["group_id"], is_correct) preds_sort = sorted(items, key=lambda x: x[0]) - em_sums = [int(preds_sort[4*i][1]) + int(preds_sort[4*i+1][1]) + int(preds_sort[4*i+2][1]) + int(preds_sort[4*i+3][1]) for i in range(len(preds_sort) // 4)] + em_sums = [ + int(preds_sort[4 * i][1]) + + int(preds_sort[4 * i + 1][1]) + + int(preds_sort[4 * i + 2][1]) + + int(preds_sort[4 * i + 3][1]) + for i in range(len(preds_sort) // 4) + ] em_cors = [em_sums[i] == 4 for i in range(len(em_sums))] return mean(em_cors) def aggregation(self): - return { - 'acc': mean, - 'em': self.calc_em - } + return {"acc": mean, "em": self.calc_em} def higher_is_better(self): - return { - 'acc': True, - 'em': True - } + return {"acc": True, "em": True} class EthicsUtilitarianismOriginal(Ethics): @@ -236,14 +228,25 @@ def has_training_docs(self): def fewshot_examples(self, k, rnd): # Overwriting fewshot examples as k can be max 5 - assert k <= 5, "There are only 5 possible shots for this task. Refer to the V2 for more." + assert ( + k <= 5 + ), "There are only 5 possible shots for this task. Refer to the V2 for more." 
# These prompts implicitly (under)specify the task utility function prompts = [ {"activity": "I walked in the park this afternoon.", "rating": "7"}, {"activity": "I walked in the parking lot this afternoon.", "rating": "5"}, - {"activity": "I stopped to eat at a fast food restaurant. The food was cold.", "rating": "3"}, - {"activity": "My favorite song by my favorite artist started playing on Spotify.", "rating": "8"}, - {"activity": "I tripped and fell down the stairs at school. Several classmates made fun of me for falling.", "rating": "2"}, + { + "activity": "I stopped to eat at a fast food restaurant. The food was cold.", + "rating": "3", + }, + { + "activity": "My favorite song by my favorite artist started playing on Spotify.", + "rating": "8", + }, + { + "activity": "I tripped and fell down the stairs at school. Several classmates made fun of me for falling.", + "rating": "2", + }, ] return rnd.sample(prompts, k) @@ -282,14 +285,10 @@ def process_results(self, doc, results): } def aggregation(self): - return { - 'acc': mean - } + return {"acc": mean} def higher_is_better(self): - return { - 'acc': True - } + return {"acc": True} class EthicsUtilitarianism(Ethics): @@ -297,6 +296,7 @@ class EthicsUtilitarianism(Ethics): This is a variation of the original Utilitarianism task used in the paper, where the situations are directly compared. This allows scaling to >5 shots. """ + VERSION = 0 DATASET_NAME = "utilitarianism" @@ -339,19 +339,13 @@ def process_results(self, doc, results): ll_yes, ll_no = results pred = ll_yes > ll_no gold = doc["label"] - return { - "acc": pred == gold - } + return {"acc": pred == gold} def aggregation(self): - return { - 'acc': mean - } + return {"acc": mean} def higher_is_better(self): - return { - 'acc': True - } + return {"acc": True} class EthicsVirtue(Ethics): @@ -362,9 +356,8 @@ def _process_doc(self, doc): return doc def doc_to_text(self, doc): - return "Sentence: {}\nQuestion: Does the character in this sentence exhibit the trait \"{}\"?\nAnswer:".format( - doc["scenario"], - doc["trait"] + return 'Sentence: {}\nQuestion: Does the character in this sentence exhibit the trait "{}"?\nAnswer:'.format( + doc["scenario"], doc["trait"] ) def doc_to_target(self, doc): @@ -379,27 +372,25 @@ def process_results(self, doc, results): ll_yes, ll_no = results pred = ll_yes > ll_no gold = bool(int(doc["label"])) - return { - "acc": pred == gold, - "em": [doc["group_id"], pred == gold] - } + return {"acc": pred == gold, "em": [doc["group_id"], pred == gold]} def calc_em(self, items): # Calculate exact matches - i.e. 
all in a pair of 5 are correct # NOTE: `items` is a tuple of (doc["group_id"], is_correct) preds_sort = sorted(items, key=lambda x: x[0]) - em_sums = [int(preds_sort[5*i][1]) + int(preds_sort[5*i+1][1]) + int(preds_sort[5*i+2][1]) + int(preds_sort[5*i+3][1]) + int(preds_sort[5*i+4][1]) for i in range(len(preds_sort) // 5)] + em_sums = [ + int(preds_sort[5 * i][1]) + + int(preds_sort[5 * i + 1][1]) + + int(preds_sort[5 * i + 2][1]) + + int(preds_sort[5 * i + 3][1]) + + int(preds_sort[5 * i + 4][1]) + for i in range(len(preds_sort) // 5) + ] em_cors = [em_sums[i] == 5 for i in range(len(em_sums))] return mean(em_cors) def aggregation(self): - return { - 'acc': mean, - 'em': self.calc_em - } + return {"acc": mean, "em": self.calc_em} def higher_is_better(self): - return { - 'acc': True, - 'em': True - } + return {"acc": True, "em": True} diff --git a/lm_eval/tasks/hendrycks_math.py b/lm_eval/tasks/hendrycks_math.py index 8cef7df38f..c805af0f85 100644 --- a/lm_eval/tasks/hendrycks_math.py +++ b/lm_eval/tasks/hendrycks_math.py @@ -47,8 +47,7 @@ def test_docs(self): return map(self._process_doc, self.dataset["test"]) def _process_doc(self, doc): - doc["answer"] = self.remove_boxed( - self.last_boxed_only_string(doc["solution"])) + doc["answer"] = self.remove_boxed(self.last_boxed_only_string(doc["solution"])) return doc def doc_to_text(self, doc): @@ -72,23 +71,19 @@ def process_results(self, doc, results): if len(indices) <= 1: answer = results[0] else: - answer = results[0][indices[0]+1:indices[-1]] + answer = results[0][indices[0] + 1 : indices[-1]] - if self.is_equiv(answer, self.remove_boxed(self.last_boxed_only_string(doc["solution"]))): + if self.is_equiv( + answer, self.remove_boxed(self.last_boxed_only_string(doc["solution"])) + ): retval = 1 - return { - "acc": retval - } + return {"acc": retval} def aggregation(self): - return { - 'acc': mean - } + return {"acc": mean} def higher_is_better(self): - return { - 'acc': True - } + return {"acc": True} def is_equiv(self, str1, str2, verbose=False): if str1 is None and str2 is None: @@ -109,18 +104,18 @@ def is_equiv(self, str1, str2, verbose=False): def remove_boxed(self, s): if "\\boxed " in s: left = "\\boxed " - assert s[:len(left)] == left - return s[len(left):] + assert s[: len(left)] == left + return s[len(left) :] left = "\\boxed{" - assert s[:len(left)] == left + assert s[: len(left)] == left assert s[-1] == "}" - return s[len(left):-1] + return s[len(left) : -1] def last_boxed_only_string(self, string): - + idx = string.rfind("\\boxed") if "\\boxed " in string: return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0] @@ -145,7 +140,7 @@ def last_boxed_only_string(self, string): if right_brace_idx is None: retval = None else: - retval = string[idx:right_brace_idx + 1] + retval = string[idx : right_brace_idx + 1] return retval @@ -288,34 +283,34 @@ def strip_string(self, string): class MathAlgebra(Math): VERSION = 1 - DATASET_NAME = 'algebra' + DATASET_NAME = "algebra" class MathCountingAndProbability(Math): VERSION = 1 - DATASET_NAME = 'counting_and_probability' + DATASET_NAME = "counting_and_probability" class MathGeometry(Math): VERSION = 1 - DATASET_NAME = 'geometry' + DATASET_NAME = "geometry" class MathIntermediateAlgebra(Math): VERSION = 1 - DATASET_NAME = 'intermediate_algebra' + DATASET_NAME = "intermediate_algebra" class MathNumberTheory(Math): VERSION = 1 - DATASET_NAME = 'number_theory' + DATASET_NAME = "number_theory" class MathPrealgebra(Math): VERSION = 1 - DATASET_NAME = 'prealgebra' + DATASET_NAME = 
"prealgebra" class MathPrecalculus(Math): VERSION = 1 - DATASET_NAME = 'precalculus' + DATASET_NAME = "precalculus" diff --git a/lm_eval/tasks/hendrycks_test.py b/lm_eval/tasks/hendrycks_test.py index 13995a7610..b2df32614a 100644 --- a/lm_eval/tasks/hendrycks_test.py +++ b/lm_eval/tasks/hendrycks_test.py @@ -3,11 +3,11 @@ https://arxiv.org/pdf/2009.03300.pdf The Hendryck's Test is a benchmark that measured a text model’s multitask accuracy. -The test covers 57 tasks including elementary mathematics, US history, computer +The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more. To attain high accuracy on this test, models must possess extensive world knowledge and problem solving ability. By comprehensively evaluating -the breadth and depth of a model’s academic and professional understanding, -Hendryck's Test can be used to analyze models across many tasks and to identify +the breadth and depth of a model’s academic and professional understanding, +Hendryck's Test can be used to analyze models across many tasks and to identify important shortcomings. Homepage: https://github.com/hendrycks/test @@ -25,16 +25,65 @@ """ -SUBJECTS = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', - 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', - 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', - 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', - 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', - 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', - 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', - 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', - 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', - 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions'] +SUBJECTS = [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + 
"professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions", +] def create_all_tasks(): @@ -42,15 +91,14 @@ def create_all_tasks(): :return: {task_name: task} e.g. {hendrycksTest-abstract_algebra: Task, hendrycksTest-anatomy: Task} """ - return { - f"hendrycksTest-{sub}": create_task(sub) for sub in SUBJECTS - } + return {f"hendrycksTest-{sub}": create_task(sub) for sub in SUBJECTS} def create_task(subject): class HendrycksTest(GeneralHendrycksTest): def __init__(self): super().__init__(subject) + return HendrycksTest @@ -81,27 +129,32 @@ def test_docs(self): def _process_doc(self, doc): def format_example(doc, keys): """ - Question: - Choices: - A. - B. - C. - D. - Answer: + Question: + Choices: + A. + B. + C. + D. + Answer: """ prompt = "Question: " + doc["question"] + "\nChoices:\n" - prompt += "".join([f"{key}. {choice}\n" for key, choice in zip(keys, doc["choices"])]) + prompt += "".join( + [f"{key}. {choice}\n" for key, choice in zip(keys, doc["choices"])] + ) prompt += "Answer:" return prompt - keys = ['A', 'B', 'C', 'D'] + + keys = ["A", "B", "C", "D"] return { "query": format_example(doc, keys), "choices": doc["choices"], - "gold": keys.index(doc["answer"]) if isinstance(doc["answer"], str) else doc["answer"] + "gold": keys.index(doc["answer"]) + if isinstance(doc["answer"], str) + else doc["answer"], } def fewshot_examples(self, k, rnd): - # fewshot_examples is not just sampling from train_docs because dev is + # fewshot_examples is not just sampling from train_docs because dev is # in the same distribution as val/test but auxiliary_train isn't if self._fewshot_docs is None: diff --git a/lm_eval/tasks/lambada.py b/lm_eval/tasks/lambada.py index 9753d0cb1d..758a0ba27a 100644 --- a/lm_eval/tasks/lambada.py +++ b/lm_eval/tasks/lambada.py @@ -20,7 +20,7 @@ _CITATION = """ @misc{ - author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel}, + author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel}, title={The LAMBADA dataset}, DOI={10.5281/zenodo.2630551}, publisher={Zenodo}, @@ -53,38 +53,29 @@ def test_docs(self): pass def doc_to_text(self, doc): - return doc['text'].rsplit(' ', 1)[0] + return doc["text"].rsplit(" ", 1)[0] def should_decontaminate(self): return True def doc_to_decontamination_query(self, doc): - return doc['text'] + return doc["text"] def doc_to_target(self, doc): - return " " + doc['text'].rsplit(' ', 1)[1] + return " " + doc["text"].rsplit(" ", 1)[1] def construct_requests(self, doc, ctx): ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc)) return ll, is_greedy - + def process_results(self, doc, results): ll, is_greedy = results - return { - 'ppl': ll, - 'acc': int(is_greedy) - } - + return {"ppl": ll, "acc": int(is_greedy)} + def aggregation(self): - return { - 'ppl': perplexity, - 'acc': mean - } + return {"ppl": perplexity, "acc": mean} def higher_is_better(self): - return { - 'ppl': False, - 'acc': True - } + return {"ppl": False, "acc": True} diff --git a/lm_eval/tasks/lambada_cloze.py b/lm_eval/tasks/lambada_cloze.py index b01c0f4ad0..23f0238997 100644 --- a/lm_eval/tasks/lambada_cloze.py +++ 
b/lm_eval/tasks/lambada_cloze.py @@ -18,7 +18,7 @@ _CITATION = """ @misc{ - author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel}, + author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel}, title={The LAMBADA dataset}, DOI={10.5281/zenodo.2630551}, publisher={Zenodo}, @@ -32,13 +32,13 @@ class LAMBADA_cloze(LAMBADA): VERSION = 0 def doc_to_text(self, doc): - return doc['text'].rsplit(' ', 1)[0] + " ____. ->" + return doc["text"].rsplit(" ", 1)[0] + " ____. ->" def should_decontaminate(self): return True def doc_to_decontamination_query(self, doc): - return doc['text'] + return doc["text"] def doc_to_target(self, doc): - return " " + doc['text'].rsplit(' ', 1)[1] + return " " + doc["text"].rsplit(" ", 1)[1] diff --git a/lm_eval/tasks/lambada_multilingual.py b/lm_eval/tasks/lambada_multilingual.py index e1d830c7eb..63ca1dd405 100644 --- a/lm_eval/tasks/lambada_multilingual.py +++ b/lm_eval/tasks/lambada_multilingual.py @@ -18,7 +18,7 @@ _CITATION = """ @misc{ - author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel}, + author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel}, title={The LAMBADA dataset}, DOI={10.5281/zenodo.2630551}, publisher={Zenodo}, @@ -33,28 +33,32 @@ class MultilingualLAMBADA(lambada.LAMBADA): class MultilingualLAMBADAEN(MultilingualLAMBADA): - DATASET_NAME = 'en' + DATASET_NAME = "en" class MultilingualLAMBADAFR(MultilingualLAMBADA): - DATASET_NAME = 'fr' + DATASET_NAME = "fr" class MultilingualLAMBADADE(MultilingualLAMBADA): - DATASET_NAME = 'de' + DATASET_NAME = "de" class MultilingualLAMBADAIT(MultilingualLAMBADA): - DATASET_NAME = 'it' + DATASET_NAME = "it" class MultilingualLAMBADAES(MultilingualLAMBADA): - DATASET_NAME = 'es' + DATASET_NAME = "es" -LANG_CLASSES = [MultilingualLAMBADAEN, MultilingualLAMBADAFR, - MultilingualLAMBADADE, MultilingualLAMBADAIT, - MultilingualLAMBADAES] +LANG_CLASSES = [ + MultilingualLAMBADAEN, + MultilingualLAMBADAFR, + MultilingualLAMBADADE, + MultilingualLAMBADAIT, + MultilingualLAMBADAES, +] def construct_tasks(): diff --git a/lm_eval/tasks/logiqa.py b/lm_eval/tasks/logiqa.py index 2c923918f5..0f487e7abe 100644 --- a/lm_eval/tasks/logiqa.py +++ b/lm_eval/tasks/logiqa.py @@ -17,7 +17,7 @@ _CITATION = """ @misc{liu2020logiqa, - title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, + title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang}, year={2020}, eprint={2007.08124}, @@ -55,14 +55,14 @@ def test_docs(self): def _process_doc(self, doc): def format_example(doc, choices): """ - Passage: - Question: - Choices: - A. - B. - C. - D. - Answer: + Passage: + Question: + Choices: + A. + B. + C. + D. + Answer: """ prompt = "Passage: " + doc["context"] + "\n" prompt += "Question: " + doc["question"] + "\nChoices:\n" @@ -70,12 +70,13 @@ def format_example(doc, choices): prompt += f"{choice.upper()}. 
{option}\n" prompt += "Answer:" return prompt - choices = ['a', 'b', 'c', 'd'] + + choices = ["a", "b", "c", "d"] return { - "passage": doc["context"], # Used for decontamination + "passage": doc["context"], # Used for decontamination "query": format_example(doc, choices), "choices": doc["options"], - "gold": choices.index(doc["label"]) + "gold": choices.index(doc["label"]), } def doc_to_text(self, doc): diff --git a/lm_eval/tasks/mathqa.py b/lm_eval/tasks/mathqa.py index 1cd78c1f69..903f3ca16a 100644 --- a/lm_eval/tasks/mathqa.py +++ b/lm_eval/tasks/mathqa.py @@ -14,7 +14,7 @@ _CITATION = """ @misc{amini2019mathqa, - title={MathQA: Towards Interpretable Math Word Problem Solving with Operation-Based Formalisms}, + title={MathQA: Towards Interpretable Math Word Problem Solving with Operation-Based Formalisms}, author={Aida Amini and Saadia Gabriel and Peter Lin and Rik Koncel-Kedziorski and Yejin Choi and Hannaneh Hajishirzi}, year={2019}, eprint={1905.13319}, @@ -50,11 +50,14 @@ def test_docs(self): return map(self._process_doc, self.dataset["test"]) def _process_doc(self, doc): - answer_idx = ['a', 'b', 'c', 'd', 'e'].index(doc['correct']) - choices = [c[4:].rstrip(" ,") for c in re.findall(r"[abcd] \) .*?, |e \) .*?$", doc['options'])] + answer_idx = ["a", "b", "c", "d", "e"].index(doc["correct"]) + choices = [ + c[4:].rstrip(" ,") + for c in re.findall(r"[abcd] \) .*?, |e \) .*?$", doc["options"]) + ] out_doc = { - "query": "Question: " + doc['Problem'] + "\nAnswer:", + "query": "Question: " + doc["Problem"] + "\nAnswer:", "choices": choices, "gold": answer_idx, } diff --git a/lm_eval/tasks/mc_taco.py b/lm_eval/tasks/mc_taco.py index e2b4f7e771..6d41fdfd84 100644 --- a/lm_eval/tasks/mc_taco.py +++ b/lm_eval/tasks/mc_taco.py @@ -3,18 +3,18 @@ A Study of Temporal Commonsense Understanding https://arxiv.org/pdf/1909.03065.pdf -MC-TACO is a dataset of 13k question-answer pairs that require temporal commonsense +MC-TACO is a dataset of 13k question-answer pairs that require temporal commonsense comprehension. The dataset contains five temporal properties, (1) duration (how long -an event takes), (2) temporal ordering (typical order of events), (3) typical time +an event takes), (2) temporal ordering (typical order of events), (3) typical time (when an event occurs), (4) frequency (how often an event occurs), and (5) stationarity (whether a state is maintained for a very long time or indefinitely). -WARNING: Running this task with a `--limit` arg will give misleading results! The +WARNING: Running this task with a `--limit` arg will give misleading results! The corresponding dataset is structured such that each multiple-choice-question gathered -by the authors is split into question-option pairs, where each such pair gets +by the authors is split into question-option pairs, where each such pair gets siloed into an individual document for plausibility testing. Because the harness shuffles these documents, setting `--limit` will likely "cut off" certain candidate -answers. This is a problem because the task's metrics require an exhaustive evaluation +answers. This is a problem because the task's metrics require an exhaustive evaluation of a question's options. See section 4 of the paper for details. 
Homepage: https://leaderboard.allenai.org/mctaco/submissions/public @@ -55,20 +55,22 @@ def test_docs(self): return self.dataset["test"] def doc_to_text(self, doc): - return f"{doc['sentence']}\nQuestion: {doc['question']}\n"\ + return ( + f"{doc['sentence']}\nQuestion: {doc['question']}\n" f"Answer: {doc['answer']}\nPlausible:" + ) def should_decontaminate(self): return True def doc_to_decontamination_query(self, doc): - return doc['question'] + " " + doc['sentence'] + return doc["question"] + " " + doc["sentence"] def doc_to_target(self, doc): - return " " + ["no", "yes"][doc['label']] + return " " + ["no", "yes"][doc["label"]] def construct_requests(self, doc, ctx): - """ Uses RequestFactory to construct Requests and returns an iterable of + """Uses RequestFactory to construct Requests and returns an iterable of Requests which will be sent to the LM. :param doc: @@ -93,18 +95,15 @@ def process_results(self, doc, results): The results of the requests created in construct_requests. """ ll_no, ll_yes = results - gold = doc['label'] + gold = doc["label"] pred = int(ll_yes > ll_no) question_id = self._question2id(doc) items = (gold, pred, question_id) - return { - "em": items, - "f1": items - } + return {"em": items, "f1": items} def _question2id(self, doc): - """ Returns an identifier for the question in the given document. """ - return " ".join([doc['sentence'], doc['question']]) + """Returns an identifier for the question in the given document.""" + return " ".join([doc["sentence"], doc["question"]]) def aggregation(self): return { @@ -132,7 +131,7 @@ def exact_match(items): def f1(items): - """ See section 4 "Evaluation Metrics" in the paper about the F1 metric used. """ + """See section 4 "Evaluation Metrics" in the paper about the F1 metric used.""" results = list(zip(*items)) # Group the positive ("yes" = 1) golds and predictions by question. gold_positives, pred_positives = defaultdict(list), defaultdict(list) @@ -146,5 +145,5 @@ def f1(items): p = tp / pp if pp > 0.0 else 1.0 r = tp / gp if gp > 0.0 else 1.0 if p + r > 0.0: - f1.append(2. * (p * r) / (p + r)) + f1.append(2.0 * (p * r) / (p + r)) return np.mean(f1) diff --git a/lm_eval/tasks/mutual.py b/lm_eval/tasks/mutual.py index 13dcb508b5..d2d17a81df 100644 --- a/lm_eval/tasks/mutual.py +++ b/lm_eval/tasks/mutual.py @@ -29,7 +29,7 @@ class MuTualBase(Task): VERSION = 1 DATASET_PATH = inspect.getfile(lm_eval.datasets.mutual.mutual) DATASET_NAME = None - CHOICES = ['A', 'B', 'C', 'D'] + CHOICES = ["A", "B", "C", "D"] def has_training_docs(self): return True @@ -88,26 +88,14 @@ def process_results(self, doc, results): r4_1 = np.argmax(results) == gold # r4_1 = accuracy ranks = sorted(results, reverse=True) r4_2 = (ranks.index(results[gold]) == 1) + r4_1 - mrr = 1. 
/ (ranks.index(results[gold]) + 1) # `+ 1` for index offset - return { - "r@1": r4_1, - "r@2": r4_2, - "mrr": mrr - } + mrr = 1.0 / (ranks.index(results[gold]) + 1) # `+ 1` for index offset + return {"r@1": r4_1, "r@2": r4_2, "mrr": mrr} def aggregation(self): - return { - "r@1": mean, - "r@2": mean, - "mrr": mean - } + return {"r@1": mean, "r@2": mean, "mrr": mean} def higher_is_better(self): - return { - "r@1": True, - "r@2": True, - "mrr": True - } + return {"r@1": True, "r@2": True, "mrr": True} class MuTual(MuTualBase): diff --git a/lm_eval/tasks/naturalqs.py b/lm_eval/tasks/naturalqs.py index 1cfa279129..4a2d526f9e 100644 --- a/lm_eval/tasks/naturalqs.py +++ b/lm_eval/tasks/naturalqs.py @@ -1,126 +1,134 @@ -""" -Natural Questions: a Benchmark for Question Answering Research -https://storage.googleapis.com/pub-tools-public-publication-data/pdf/1f7b46b5378d757553d3e92ead36bda2e4254244.pdf - -The Natural Questions (NQ) corpus is a question-answering dataset that contains -questions from real users and requires QA systems to read and comprehend an entire -Wikipedia article that may or may not contain the answer to the question. The -inclusion of real user questions, and the requirement that solutions should read -an entire page to find the answer, cause NQ to be a more realistic and challenging -task than prior QA datasets. - -TODO: NaturalQS has a *really* large train set that huggingface just automatically -downloads even if you dont use it. we should try and only download the val set and -not even bother with the train set. - -Homepage: https://ai.google.com/research/NaturalQuestions -""" -from lm_eval.base import Task -from itertools import islice - - -_CITATION = """ -@article{47761, - title={Natural Questions: a Benchmark for Question Answering Research}, - author={Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le and Slav Petrov}, - year={2019}, - journal={Transactions of the Association of Computational Linguistics} -} -""" - - -class NaturalQs(Task): - VERSION = 0 - DATASET_PATH = "natural_questions" - DATASET_NAME = None - - def has_training_docs(self): - return True - - def has_validation_docs(self): - return True - - def has_test_docs(self): - return False - - def training_docs(self): - # Cache training for faster few-shot. - # Data is too large to fit in memory. - if self._training_docs is None: - self._training_docs = list(self.dataset["train"]) - return self._training_docs - - def validation_docs(self): - return self.dataset["validation"] - - def fewshot_examples(self, k, rnd): - # Data is too large to fit in memory. We just sample from the first bit. - if self._training_docs is None: - self._training_docs = list(islice(self.training_docs(), 0, 100000)) - - return rnd.sample(self._training_docs, k) - - def doc_to_text(self, doc): - return 'Q: ' + doc['question']['text'] + '\n\n' + 'A:' - - def should_decontaminate(self): - return True - - def doc_to_decontamination_query(self, doc): - return doc['question']['text'] - - def doc_to_target(self, doc): - # There's a short answer and a long answer. Based on the paper, I'm using the long answer. 
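The comment above flags the long-answer extraction that this hunk reformats; a toy, self-contained rendering of the span-masking step (tokens and offsets invented):

```python
# Take the annotated long-answer token span and drop tokens flagged as HTML
# before joining, as in doc_to_target below.
tokens = ["<p>", "The", "quick", "brown", "fox", "</p>"]  # invented example
is_html = [True, False, False, False, False, True]
start_token, end_token = 0, 6  # hypothetical long_answer offsets
long_answer = " ".join(
    tok
    for tok, html in zip(tokens[start_token:end_token], is_html[start_token:end_token])
    if not html
)
assert long_answer == "The quick brown fox"
```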
- short_answer = doc['annotations']['short_answers'][0]['text'] - long_answer_start = doc['annotations']['long_answer'][0]['start_token'] - long_answer_end = doc['annotations']['long_answer'][0]['end_token'] - long_answer_span = doc['document']['tokens']['token'][long_answer_start:long_answer_end] - long_answer_is_html = doc['document']['tokens']['is_html'][long_answer_start:long_answer_end] - long_answer_chars = [tok for (tok, is_html) in zip(long_answer_span, long_answer_is_html) if not is_html] - long_answer = " ".join(long_answer_chars) - return long_answer # Replace with short_answer[0] for short answer - - def construct_requests(self, doc, ctx): - """ Uses RequestFactory to construct Requests and returns an iterable of - Requests which will be sent to the LM. - - :param doc: - The document as returned from training_docs, validation_docs, or test_docs. - :param ctx: str - The context string, generated by fewshot_context. This includes the natural - language description, as well as the few shot examples, and the question - part of the document for `doc`. - """ - # TODO: implement evaluation. - raise NotImplementedError('Evaluation not implemented') - - def process_results(self, doc, results): - """Take a single document and the LM results and evaluates, returning a - dict where keys are the names of submetrics and values are the values of - the metric for that one document - - :param doc: - The document as returned from training_docs, validation_docs, or test_docs. - :param results: - The results of the requests created in construct_requests. - """ - # TODO: implement evaluation. - raise NotImplementedError('Evaluation not implemented') - - def aggregation(self): - """ - :returns: {str: [float] -> float} - A dictionary where keys are the names of submetrics and values are - functions that aggregate a list of metrics - """ - # TODO: implement evaluation. - raise NotImplementedError('Evaluation not implemented') - - def higher_is_better(self): - """ - :returns: {str: bool} - A dictionary where keys are the names of submetrics and values are - whether a higher value of the submetric is better - """ - # TODO: implement evaluation. - raise NotImplementedError('Evaluation not implemented') +""" +Natural Questions: a Benchmark for Question Answering Research +https://storage.googleapis.com/pub-tools-public-publication-data/pdf/1f7b46b5378d757553d3e92ead36bda2e4254244.pdf + +The Natural Questions (NQ) corpus is a question-answering dataset that contains +questions from real users and requires QA systems to read and comprehend an entire +Wikipedia article that may or may not contain the answer to the question. The +inclusion of real user questions, and the requirement that solutions should read +an entire page to find the answer, cause NQ to be a more realistic and challenging +task than prior QA datasets. + +TODO: NaturalQS has a *really* large train set that huggingface just automatically +downloads even if you dont use it. we should try and only download the val set and +not even bother with the train set. + +Homepage: https://ai.google.com/research/NaturalQuestions +""" +from lm_eval.base import Task +from itertools import islice + + +_CITATION = """ +@article{47761, + title={Natural Questions: a Benchmark for Question Answering Research}, + author={Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee and Kristina N. 
Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le and Slav Petrov}, + year={2019}, + journal={Transactions of the Association of Computational Linguistics} +} +""" + + +class NaturalQs(Task): + VERSION = 0 + DATASET_PATH = "natural_questions" + DATASET_NAME = None + + def has_training_docs(self): + return True + + def has_validation_docs(self): + return True + + def has_test_docs(self): + return False + + def training_docs(self): + # Cache training for faster few-shot. + # Data is too large to fit in memory. + if self._training_docs is None: + self._training_docs = list(self.dataset["train"]) + return self._training_docs + + def validation_docs(self): + return self.dataset["validation"] + + def fewshot_examples(self, k, rnd): + # Data is too large to fit in memory. We just sample from the first bit. + if self._training_docs is None: + self._training_docs = list(islice(self.training_docs(), 0, 100000)) + + return rnd.sample(self._training_docs, k) + + def doc_to_text(self, doc): + return "Q: " + doc["question"]["text"] + "\n\n" + "A:" + + def should_decontaminate(self): + return True + + def doc_to_decontamination_query(self, doc): + return doc["question"]["text"] + + def doc_to_target(self, doc): + # There's a short answer and a long answer. Based on the paper, I'm using the long answer. + short_answer = doc["annotations"]["short_answers"][0]["text"] + long_answer_start = doc["annotations"]["long_answer"][0]["start_token"] + long_answer_end = doc["annotations"]["long_answer"][0]["end_token"] + long_answer_span = doc["document"]["tokens"]["token"][ + long_answer_start:long_answer_end + ] + long_answer_is_html = doc["document"]["tokens"]["is_html"][ + long_answer_start:long_answer_end + ] + long_answer_chars = [ + tok + for (tok, is_html) in zip(long_answer_span, long_answer_is_html) + if not is_html + ] + long_answer = " ".join(long_answer_chars) + return long_answer # Replace with short_answer[0] for short answer + + def construct_requests(self, doc, ctx): + """Uses RequestFactory to construct Requests and returns an iterable of + Requests which will be sent to the LM. + + :param doc: + The document as returned from training_docs, validation_docs, or test_docs. + :param ctx: str + The context string, generated by fewshot_context. This includes the natural + language description, as well as the few shot examples, and the question + part of the document for `doc`. + """ + # TODO: implement evaluation. + raise NotImplementedError("Evaluation not implemented") + + def process_results(self, doc, results): + """Take a single document and the LM results and evaluates, returning a + dict where keys are the names of submetrics and values are the values of + the metric for that one document + + :param doc: + The document as returned from training_docs, validation_docs, or test_docs. + :param results: + The results of the requests created in construct_requests. + """ + # TODO: implement evaluation. + raise NotImplementedError("Evaluation not implemented") + + def aggregation(self): + """ + :returns: {str: [float] -> float} + A dictionary where keys are the names of submetrics and values are + functions that aggregate a list of metrics + """ + # TODO: implement evaluation. + raise NotImplementedError("Evaluation not implemented") + + def higher_is_better(self): + """ + :returns: {str: bool} + A dictionary where keys are the names of submetrics and values are + whether a higher value of the submetric is better + """ + # TODO: implement evaluation. 
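One detail worth noting in the NaturalQs rewrite above is the memory-bounded few-shot sampling: rather than materializing the full train split, `fewshot_examples` caps the candidate pool with `islice`. A sketch of the same idea under invented data:

```python
import random
from itertools import islice


# Materialize only a bounded prefix of a large streaming split, then sample
# few-shot examples from that pool (mirrors fewshot_examples above).
def fewshot_from_stream(stream, k, rnd, cap=100_000):
    pool = list(islice(stream, cap))
    return rnd.sample(pool, k)


docs = ({"question": {"text": f"q{i}"}} for i in range(1_000_000))  # fake split
print(fewshot_from_stream(docs, k=3, rnd=random.Random(0)))
```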
+ raise NotImplementedError("Evaluation not implemented") diff --git a/lm_eval/tasks/openbookqa.py b/lm_eval/tasks/openbookqa.py index 8c795e1284..9738ac9209 100644 --- a/lm_eval/tasks/openbookqa.py +++ b/lm_eval/tasks/openbookqa.py @@ -1,71 +1,71 @@ -""" -Can a Suit of Armor Conduct Electricity? A New Dataset for Open Book Question Answering -https://arxiv.org/pdf/1809.02789.pdf - -OpenBookQA is a question-answering dataset modeled after open book exams for -assessing human understanding of a subject. It consists of 5,957 multiple-choice -elementary-level science questions (4,957 train, 500 dev, 500 test), which probe -the understanding of a small “book” of 1,326 core science facts and the application -of these facts to novel situations. For training, the dataset includes a mapping -from each question to the core science fact it was designed to probe. Answering -OpenBookQA questions requires additional broad common knowledge, not contained -in the book. The questions, by design, are answered incorrectly by both a retrieval- -based algorithm and a word co-occurrence algorithm. - -Homepage: https://allenai.org/data/open-book-qa -""" -from lm_eval.base import MultipleChoiceTask - - -_CITATION = """ -@inproceedings{OpenBookQA2018, - title={Can a Suit of Armor Conduct Electricity? A New Dataset for Open Book Question Answering}, - author={Todor Mihaylov and Peter Clark and Tushar Khot and Ashish Sabharwal}, - booktitle={EMNLP}, - year={2018} -} -""" - - -class OpenBookQA(MultipleChoiceTask): - VERSION = 0 - DATASET_PATH = "openbookqa" - DATASET_NAME = "main" - - def has_training_docs(self): - return True - - def has_validation_docs(self): - return True - - def has_test_docs(self): - return True - - def training_docs(self): - if self._training_docs is None: - self._training_docs = list(map(self._process_doc, self.dataset["train"])) - return self._training_docs - - def validation_docs(self): - return map(self._process_doc, self.dataset["validation"]) - - def test_docs(self): - return map(self._process_doc, self.dataset["test"]) - - def _process_doc(self, doc): - out_doc = { - "id": doc["id"], - "query": doc["question_stem"], - "choices": doc["choices"]["text"], - "gold": ["A", "B", "C", "D"].index(doc["answerKey"].strip()), - } - return out_doc - - def doc_to_text(self, doc): - return doc["query"] - - def should_decontaminate(self): - return True - - def doc_to_decontamination_query(self, doc): - return doc["query"] +""" +Can a Suit of Armor Conduct Electricity? A New Dataset for Open Book Question Answering +https://arxiv.org/pdf/1809.02789.pdf + +OpenBookQA is a question-answering dataset modeled after open book exams for +assessing human understanding of a subject. It consists of 5,957 multiple-choice +elementary-level science questions (4,957 train, 500 dev, 500 test), which probe +the understanding of a small “book” of 1,326 core science facts and the application +of these facts to novel situations. For training, the dataset includes a mapping +from each question to the core science fact it was designed to probe. Answering +OpenBookQA questions requires additional broad common knowledge, not contained +in the book. The questions, by design, are answered incorrectly by both a retrieval- +based algorithm and a word co-occurrence algorithm. + +Homepage: https://allenai.org/data/open-book-qa +""" +from lm_eval.base import MultipleChoiceTask + + +_CITATION = """ +@inproceedings{OpenBookQA2018, + title={Can a Suit of Armor Conduct Electricity? 
A New Dataset for Open Book Question Answering}, + author={Todor Mihaylov and Peter Clark and Tushar Khot and Ashish Sabharwal}, + booktitle={EMNLP}, + year={2018} +} +""" + + +class OpenBookQA(MultipleChoiceTask): + VERSION = 0 + DATASET_PATH = "openbookqa" + DATASET_NAME = "main" + + def has_training_docs(self): + return True + + def has_validation_docs(self): + return True + + def has_test_docs(self): + return True + + def training_docs(self): + if self._training_docs is None: + self._training_docs = list(map(self._process_doc, self.dataset["train"])) + return self._training_docs + + def validation_docs(self): + return map(self._process_doc, self.dataset["validation"]) + + def test_docs(self): + return map(self._process_doc, self.dataset["test"]) + + def _process_doc(self, doc): + out_doc = { + "id": doc["id"], + "query": doc["question_stem"], + "choices": doc["choices"]["text"], + "gold": ["A", "B", "C", "D"].index(doc["answerKey"].strip()), + } + return out_doc + + def doc_to_text(self, doc): + return doc["query"] + + def should_decontaminate(self): + return True + + def doc_to_decontamination_query(self, doc): + return doc["query"] diff --git a/lm_eval/tasks/piqa.py b/lm_eval/tasks/piqa.py index bd2726a167..0b48571504 100644 --- a/lm_eval/tasks/piqa.py +++ b/lm_eval/tasks/piqa.py @@ -5,7 +5,7 @@ Physical Interaction: Question Answering (PIQA) is a physical commonsense reasoning and a corresponding benchmark dataset. PIQA was designed to investigate the physical knowledge of existing models. To what extent are current approaches -actually learning about the world? +actually learning about the world? Homepage: https://yonatanbisk.com/piqa/ """ diff --git a/lm_eval/tasks/prost.py b/lm_eval/tasks/prost.py index 4cfdeca0f5..f5ddc0685e 100644 --- a/lm_eval/tasks/prost.py +++ b/lm_eval/tasks/prost.py @@ -52,20 +52,21 @@ def has_test_docs(self): def test_docs(self): return map(self._process_doc, self.dataset["test"]) - def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None): - assert num_fewshot == 0, 'PROST is designed to probe models in a zero-shot fashion only.' + def fewshot_context( + self, doc, num_fewshot, provide_description=None, rnd=None, description=None + ): + assert ( + num_fewshot == 0 + ), "PROST is designed to probe models in a zero-shot fashion only." return super().fewshot_context( - doc=doc, - num_fewshot=num_fewshot, - rnd=rnd, - description=description + doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description ) def _process_doc(self, doc): out_doc = { "query": f"{doc['context']}\nQuestion: {doc['ex_question']}\nAnswer:", - "choices": [doc['A'], doc['B'], doc['C'], doc['D']], - "gold": doc['label'], + "choices": [doc["A"], doc["B"], doc["C"], doc["D"]], + "gold": doc["label"], } return out_doc diff --git a/lm_eval/tasks/pubmedqa.py b/lm_eval/tasks/pubmedqa.py index 45aaa1cd7f..b4e3d24e57 100644 --- a/lm_eval/tasks/pubmedqa.py +++ b/lm_eval/tasks/pubmedqa.py @@ -3,14 +3,14 @@ https://arxiv.org/pdf/1909.06146.pdf PubMedQA is a novel biomedical question answering (QA) dataset collected from -PubMed abstracts. The task of PubMedQA is to answer research questions with -yes/no/maybe (e.g.: Do preoperative statins reduce atrial fibrillation after -coronary artery bypass grafting?) using the corresponding abstracts. PubMedQA -has 1k expert-annotated, 61.2k unlabeled and 211.3k artificially generated QA +PubMed abstracts. 
The task of PubMedQA is to answer research questions with +yes/no/maybe (e.g.: Do preoperative statins reduce atrial fibrillation after +coronary artery bypass grafting?) using the corresponding abstracts. PubMedQA +has 1k expert-annotated, 61.2k unlabeled and 211.3k artificially generated QA instances. Each PubMedQA instance is composed of (1) a question which is either an existing research article title or derived from one, (2) a context which is the corresponding abstract without its conclusion, (3) a long answer, which is -the conclusion of the abstract and, presumably, answers the research question, +the conclusion of the abstract and, presumably, answers the research question, and (4) a yes/no/maybe answer which summarizes the conclusion. Homepage: https://pubmedqa.github.io/ @@ -53,9 +53,7 @@ def test_docs(self): def doc_to_text(self, doc): ctxs = "\n".join(doc["context"]["contexts"]) return "Abstract: {}\nQuestion: {}\nAnswer:".format( - ctxs, - doc["question"], - doc["final_decision"] + ctxs, doc["question"] ) def should_decontaminate(self): @@ -68,7 +66,7 @@ def doc_to_target(self, doc): return " {}".format(doc["final_decision"]) def construct_requests(self, doc, ctx): - """ Uses RequestFactory to construct Requests and returns + """Uses RequestFactory to construct Requests and returns an iterable of Requests which will be sent to the LM. """ ll_yes, _ = rf.loglikelihood(ctx, " yes") @@ -81,15 +79,11 @@ def process_results(self, doc, results): ll_yes, ll_no, ll_maybe = results pred = np.argmax(results) return { - "acc": ["yes", "no", "maybe"][pred] == gold, + "acc": ["yes", "no", "maybe"][pred] == gold, } def aggregation(self): - return { - "acc" : mean - } + return {"acc": mean} def higher_is_better(self): - return { - "acc" : True - } + return {"acc": True} diff --git a/lm_eval/tasks/qa4mre.py b/lm_eval/tasks/qa4mre.py index cea5d86cc1..26dbed6f96 100644 --- a/lm_eval/tasks/qa4mre.py +++ b/lm_eval/tasks/qa4mre.py @@ -3,9 +3,9 @@ https://www.cs.cmu.edu/~./hovy/papers/13CLEF-QA4MRE.pdf The (English only) QA4MRE challenge which was run as a Lab at CLEF 2011-2013. -The main objective of this exercise is to develop a methodology for evaluating -Machine Reading systems through Question Answering and Reading Comprehension -Tests. Systems should be able to extract knowledge from large volumes of text +The main objective of this exercise is to develop a methodology for evaluating +Machine Reading systems through Question Answering and Reading Comprehension +Tests. Systems should be able to extract knowledge from large volumes of text and use this knowledge to answer questions.
Four different tasks have been organized during these years: Main Task, Processing Modality and Negation for Machine Reading, Machine Reading of Biomedical Texts about Alzheimer's disease, @@ -47,7 +47,7 @@ def test_docs(self): def _process_doc(self, doc): choices = doc["answer_options"]["answer_str"] out_doc = { - "source": doc["document_str"].strip().replace("\'", "'"), + "source": doc["document_str"].strip().replace("'", "'"), "query": doc["question_str"], "choices": choices, "gold": int(doc["correct_answer_id"]) - 1, diff --git a/lm_eval/tasks/qasper.py b/lm_eval/tasks/qasper.py index a8a7e0f48e..5b38c78065 100644 --- a/lm_eval/tasks/qasper.py +++ b/lm_eval/tasks/qasper.py @@ -1,4 +1,4 @@ -""" +""" A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers https://arxiv.org/abs/2105.03011 diff --git a/lm_eval/tasks/quac.py b/lm_eval/tasks/quac.py index a47c2c2e15..d37384f994 100644 --- a/lm_eval/tasks/quac.py +++ b/lm_eval/tasks/quac.py @@ -1,112 +1,123 @@ -""" -QuAC: Question Answering in Context -https://arxiv.org/abs/1808.07036 - -Question Answering in Context (QuAC) is a dataset for modeling, understanding, and -participating in information seeking dialog. Data instances consist of an interactive -dialog between two crowd workers: (1) a student who poses a sequence of freeform -questions to learn as much as possible about a hidden Wikipedia text, and (2) -a teacher who answers the questions by providing short excerpts (spans) from the text. - -Homepage: https://quac.ai/ -""" -import inspect -import lm_eval.datasets.quac.quac -from lm_eval.base import Task - - -_CITATION = """ -@article{choi2018quac, - title={Quac: Question answering in context}, - author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke}, - journal={arXiv preprint arXiv:1808.07036}, - year={2018} -} -""" - - -class QuAC(Task): - VERSION = 0 - DATASET_PATH = inspect.getfile(lm_eval.datasets.quac.quac) - DATASET_NAME = None - - def has_training_docs(self): - return True - - def has_validation_docs(self): - return True - - def has_test_docs(self): - return False - - def training_docs(self): - if self._training_docs is None: - self._training_docs = list(map(self._process_doc, self.dataset["train"])) - return self._training_docs - - def validation_docs(self): - return map(self._process_doc, self.dataset["validation"]) - - def test_docs(self): - raise NotImplementedError("QuAC has no test docs.") - - def _process_doc(self, doc): - doc["title"] = doc['title'] + ' - ' + doc['section_title'] - return doc - - def doc_to_text(self, doc): - return 'TITLE: ' + doc['title'] + '\n' + 'PARAGRAPH: ' + doc['paragraph'] + '\n\n' + 'Q: ' + doc['question'] + '\n\n' + 'A: ' - - def should_decontaminate(self): - return True - - def doc_to_decontamination_query(self, doc): - return doc['paragraph'] - - def doc_to_target(self, doc): - return doc['answer'] - - def construct_requests(self, doc, ctx): - """ Uses RequestFactory to construct Requests and returns an iterable of - Requests which will be sent to the LM. - - :param doc: - The document as returned from training_docs, validation_docs, or test_docs. - :param ctx: str - The context string, generated by fewshot_context. This includes the natural - language description, as well as the few shot examples, and the question - part of the document for `doc`. - """ - # TODO: implement evaluation. 
- raise NotImplementedError('Evaluation not implemented') - - def process_results(self, doc, results): - """Take a single document and the LM results and evaluates, returning a - dict where keys are the names of submetrics and values are the values of - the metric for that one document - - :param doc: - The document as returned from training_docs, validation_docs, or test_docs. - :param results: - The results of the requests created in construct_requests. - """ - # TODO: implement evaluation. - raise NotImplementedError('Evaluation not implemented') - - def aggregation(self): - """ - :returns: {str: [float] -> float} - A dictionary where keys are the names of submetrics and values are - functions that aggregate a list of metrics - """ - # TODO: implement evaluation. - raise NotImplementedError('Evaluation not implemented') - - def higher_is_better(self): - """ - :returns: {str: bool} - A dictionary where keys are the names of submetrics and values are - whether a higher value of the submetric is better - """ - # TODO: implement evaluation. - raise NotImplementedError('Evaluation not implemented') +""" +QuAC: Question Answering in Context +https://arxiv.org/abs/1808.07036 + +Question Answering in Context (QuAC) is a dataset for modeling, understanding, and +participating in information seeking dialog. Data instances consist of an interactive +dialog between two crowd workers: (1) a student who poses a sequence of freeform +questions to learn as much as possible about a hidden Wikipedia text, and (2) +a teacher who answers the questions by providing short excerpts (spans) from the text. + +Homepage: https://quac.ai/ +""" +import inspect +import lm_eval.datasets.quac.quac +from lm_eval.base import Task + + +_CITATION = """ +@article{choi2018quac, + title={Quac: Question answering in context}, + author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke}, + journal={arXiv preprint arXiv:1808.07036}, + year={2018} +} +""" + + +class QuAC(Task): + VERSION = 0 + DATASET_PATH = inspect.getfile(lm_eval.datasets.quac.quac) + DATASET_NAME = None + + def has_training_docs(self): + return True + + def has_validation_docs(self): + return True + + def has_test_docs(self): + return False + + def training_docs(self): + if self._training_docs is None: + self._training_docs = list(map(self._process_doc, self.dataset["train"])) + return self._training_docs + + def validation_docs(self): + return map(self._process_doc, self.dataset["validation"]) + + def test_docs(self): + raise NotImplementedError("QuAC has no test docs.") + + def _process_doc(self, doc): + doc["title"] = doc["title"] + " - " + doc["section_title"] + return doc + + def doc_to_text(self, doc): + return ( + "TITLE: " + + doc["title"] + + "\n" + + "PARAGRAPH: " + + doc["paragraph"] + + "\n\n" + + "Q: " + + doc["question"] + + "\n\n" + + "A: " + ) + + def should_decontaminate(self): + return True + + def doc_to_decontamination_query(self, doc): + return doc["paragraph"] + + def doc_to_target(self, doc): + return doc["answer"] + + def construct_requests(self, doc, ctx): + """Uses RequestFactory to construct Requests and returns an iterable of + Requests which will be sent to the LM. + + :param doc: + The document as returned from training_docs, validation_docs, or test_docs. + :param ctx: str + The context string, generated by fewshot_context. 
This includes the natural + language description, as well as the few shot examples, and the question + part of the document for `doc`. + """ + # TODO: implement evaluation. + raise NotImplementedError("Evaluation not implemented") + + def process_results(self, doc, results): + """Take a single document and the LM results and evaluates, returning a + dict where keys are the names of submetrics and values are the values of + the metric for that one document + + :param doc: + The document as returned from training_docs, validation_docs, or test_docs. + :param results: + The results of the requests created in construct_requests. + """ + # TODO: implement evaluation. + raise NotImplementedError("Evaluation not implemented") + + def aggregation(self): + """ + :returns: {str: [float] -> float} + A dictionary where keys are the names of submetrics and values are + functions that aggregate a list of metrics + """ + # TODO: implement evaluation. + raise NotImplementedError("Evaluation not implemented") + + def higher_is_better(self): + """ + :returns: {str: bool} + A dictionary where keys are the names of submetrics and values are + whether a higher value of the submetric is better + """ + # TODO: implement evaluation. + raise NotImplementedError("Evaluation not implemented") diff --git a/lm_eval/tasks/race.py b/lm_eval/tasks/race.py index cac3f452b7..f4a5653e34 100644 --- a/lm_eval/tasks/race.py +++ b/lm_eval/tasks/race.py @@ -20,7 +20,7 @@ @article{lai2017large, title={RACE: Large-scale ReAding Comprehension Dataset From Examinations}, author={Lai, Guokun and Xie, Qizhe and Liu, Hanxiao and Yang, Yiming and Hovy, Eduard}, - journal={arXiv preprint arXiv:1704.04683}, + journal={arXiv preprint arXiv:1704.04683}, year={2017} } """ @@ -40,7 +40,7 @@ class RACE(Task): DATASET_NAME = "high" cache = {} - letter_to_num = {'A': 0, 'B': 1, 'C': 2, 'D': 3} + letter_to_num = {"A": 0, "B": 1, "C": 2, "D": 3} def has_training_docs(self): return True @@ -59,17 +59,27 @@ def _collate_data(self, set): # is shown that one document is made per passage. 
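# Illustrative sketch (toy rows, not real dataset entries): two items that
# share an article, {"article": "A", "question": "q1", ...} and
# {"article": "A", "question": "q2", ...}, are grouped into one doc of the form
# {"article": "A", "problems": [{"question": "q1", ...}, {"question": "q2", ...}]}.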
r = collections.defaultdict(list) - for item in datasets.load_dataset(path=self.DATASET_PATH, name=self.DATASET_NAME)[set]: - r[item['article']].append(item) - - res = list(r.values() >> each(lambda x: { - 'article': x[0]['article'], - 'problems': x >> each(lambda y: { - 'question': y['question'], - 'answer': y['answer'], - 'options': y['options'], - }) - })) + for item in datasets.load_dataset( + path=self.DATASET_PATH, name=self.DATASET_NAME + )[set]: + r[item["article"]].append(item) + + res = list( + r.values() + >> each( + lambda x: { + "article": x[0]["article"], + "problems": x + >> each( + lambda y: { + "question": y["question"], + "answer": y["answer"], + "options": y["options"], + } + ), + } + ) + ) self.cache[set] = res return res @@ -85,55 +95,56 @@ def test_docs(self): @classmethod def get_answer_option(cls, problem): - answer = cls.letter_to_num[problem['answer']] - return problem['options'][answer] + answer = cls.letter_to_num[problem["answer"]] + return problem["options"][answer] @classmethod def last_problem(cls, doc): - return doc['problems'][-1] + return doc["problems"][-1] def doc_to_text(self, doc): - text = 'Article: ' + doc['article'] + '\n\n' - for problem in doc['problems'][:-1]: - if problem['question'][-6:] == ' _ .': - text += problem['question'][-5:] + self.get_answer_option(problem) + '\n' + text = "Article: " + doc["article"] + "\n\n" + for problem in doc["problems"][:-1]: + if problem["question"][-6:] == " _ .": + text += ( + problem["question"][-5:] + self.get_answer_option(problem) + "\n" + ) else: - question = 'Question: ' + problem['question'] + '\n' - answer = 'Answer: ' + self.get_answer_option(problem) + '\n' + question = "Question: " + problem["question"] + "\n" + answer = "Answer: " + self.get_answer_option(problem) + "\n" text += question + answer - text += self.last_problem(doc)['question'] + text += self.last_problem(doc)["question"] return text def should_decontaminate(self): return True def doc_to_decontamination_query(self, doc): - return doc['article'] + return doc["article"] def doc_to_target(self, doc): return " " + self.get_answer_option(self.last_problem(doc)) def construct_requests(self, doc, ctx): - """ Uses RequestFactory to construct Requests and returns an iterable of + """Uses RequestFactory to construct Requests and returns an iterable of Requests which will be sent to the LM. :param doc: The document as returned from training_docs, validation_docs, or test_docs. :param ctx: str - The context string, generated by fewshot_context. This includes the natural + The context string, generated by fewshot_context. This includes the natural language description, as well as the few shot examples, and the question - part of the document for `doc`. + part of the document for `doc`. """ problem = self.last_problem(doc) ll_choices = [ - rf.loglikelihood(ctx, " " + problem['options'][i])[0] - for i in range(4) + rf.loglikelihood(ctx, " " + problem["options"][i])[0] for i in range(4) ] return ll_choices def process_results(self, doc, results): - """Take a single document and the LM results and evaluates, returning a - dict where keys are the names of submetrics and values are the values of + """Take a single document and the LM results and evaluates, returning a + dict where keys are the names of submetrics and values are the values of the metric for that one document :param doc: @@ -141,28 +152,22 @@ def process_results(self, doc, results): :param results: The results of the requests created in construct_requests. 
""" - gold = self.letter_to_num[self.last_problem(doc)['answer']] + gold = self.letter_to_num[self.last_problem(doc)["answer"]] pred = np.argmax(results) - return { - "acc": int(pred == gold) - } + return {"acc": int(pred == gold)} def aggregation(self): """ :returns: {str: [float] -> float} - A dictionary where keys are the names of submetrics and values are + A dictionary where keys are the names of submetrics and values are functions that aggregate a list of metrics """ - return { - "acc": mean - } + return {"acc": mean} def higher_is_better(self): """ :returns: {str: bool} - A dictionary where keys are the names of submetrics and values are + A dictionary where keys are the names of submetrics and values are whether a higher value of the submetric is better """ - return { - "acc": True - } + return {"acc": True} diff --git a/lm_eval/tasks/sat.py b/lm_eval/tasks/sat.py index db2df08a60..736bc204d0 100644 --- a/lm_eval/tasks/sat.py +++ b/lm_eval/tasks/sat.py @@ -59,14 +59,16 @@ def test_docs(self): def _process_doc(self, doc): return { - 'source': doc['source'], - 'query': doc['stem'].split(' ')[:2], - 'choices': ["{} is to {}".format(*c.split(' ')[:2]) for c in doc["choices"]], - 'gold': ['a', 'b', 'c', 'd', 'e'].index(doc['solution'].strip()), + "source": doc["source"], + "query": doc["stem"].split(" ")[:2], + "choices": [ + "{} is to {}".format(*c.split(" ")[:2]) for c in doc["choices"] + ], + "gold": ["a", "b", "c", "d", "e"].index(doc["solution"].strip()), } def doc_to_text(self, doc): - return "{} is to {} as".format(*doc['query']) + return "{} is to {} as".format(*doc["query"]) def should_decontaminate(self): return True diff --git a/lm_eval/tasks/sciq.py b/lm_eval/tasks/sciq.py index 650b117213..ca82d54003 100644 --- a/lm_eval/tasks/sciq.py +++ b/lm_eval/tasks/sciq.py @@ -54,10 +54,10 @@ def _process_doc(self, doc): doc["distractor3"], doc["correct_answer"], ] - src = doc['support'] + src = doc["support"] out_doc = { "source": src, - "query": doc['question'], + "query": doc["question"], "choices": choices, "gold": 3, } diff --git a/lm_eval/tasks/squad.py b/lm_eval/tasks/squad.py index 21e76a0224..2a987ea7eb 100644 --- a/lm_eval/tasks/squad.py +++ b/lm_eval/tasks/squad.py @@ -1,169 +1,219 @@ -""" -Know What You Don’t Know: Unanswerable Questions for SQuAD -https://arxiv.org/pdf/1806.03822.pdf - -Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, -consisting of questions posed by crowdworkers on a set of Wikipedia articles, -where the answer to every question is a segment of text, or span, from the -corresponding reading passage, or the question might be unanswerable. -SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable -questions written adversarially by crowdworkers to look similar to answerable ones. -To do well on SQuAD2.0, systems must not only answer questions when possible, but -also determine when no answer is supported by the paragraph and abstain from answering. 
- -Homepage: https://rajpurkar.github.io/SQuAD-explorer/ -""" -import datasets -from math import exp -from lm_eval.base import rf, Task -from functools import partial -from packaging import version - - -_CITATION = """ -@misc{rajpurkar2018know, - title={Know What You Don't Know: Unanswerable Questions for SQuAD}, - author={Pranav Rajpurkar and Robin Jia and Percy Liang}, - year={2018}, - eprint={1806.03822}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -} -""" - - -def _squad_metric(predictions, references): - squad_metric = datasets.load_metric("squad_v2") - return squad_metric.compute(predictions=predictions, references=references) - - -def _squad_agg(key, items): - predictions, references = zip(*items) - - return _squad_metric(predictions=predictions, references=references)[key] - - -class SQuAD2(Task): - VERSION = 1 - DATASET_PATH = "squad_v2" - DATASET_NAME = None - - # HF changed squad on us so we have to make sure we aren't running the old one - assert version.parse(datasets.__version__) >= version.parse("1.11.0"), "datasets v1.11.0 or later required for SQuAD" - - def has_training_docs(self): - return True - - def has_validation_docs(self): - return True - - def has_test_docs(self): - return False - - def training_docs(self): - return self.dataset["train"] - - def validation_docs(self): - return self.dataset["validation"] - - def doc_to_text(self, doc): - return 'Title: ' + doc['title'] + '\n\n' + 'Background: ' + doc['context'] + '\n\n' + 'Question: ' + doc['question'] + '\n\n' + 'Answer:' - - def should_decontaminate(self): - return True - - def doc_to_decontamination_query(self, doc): - return doc['context'] - - def doc_to_target(self, doc): - answer_list = doc['answers']['text'] - if len(answer_list) > 0: - answer = answer_list[0] - else: - answer = 'unanswerable' - return " " + answer - - def construct_requests(self, doc, ctx): - """ Uses RequestFactory to construct Requests and returns an iterable of - Requests which will be sent to the LM. - - :param doc: - The document as returned from training_docs, validation_docs, or test_docs. - :param ctx: str - The context string, generated by fewshot_context. This includes the natural - language description, as well as the few shot examples, and the question - part of the document for `doc`. - """ - continuation = rf.greedy_until(ctx, ['\n']) - is_unanswerable = rf.loglikelihood(ctx, " " + "unanswerable") - return continuation, is_unanswerable - - def process_results(self, doc, results): - """Take a single document and the LM results and evaluates, returning a - dict where keys are the names of submetrics and values are the values of - the metric for that one document - - :param doc: - The document as returned from training_docs, validation_docs, or test_docs. - :param results: - The results of the requests created in construct_requests. 
- """ - continuation, (logprob_unanswerable, _) = results - - no_answer_probability = exp(logprob_unanswerable) - - predictions = { - 'id': doc['id'], - 'prediction_text': continuation, - 'no_answer_probability': no_answer_probability, - } - - references = { - 'id': doc['id'], - 'answers': doc['answers'], - } - - return { - 'exact': (predictions, references), # Exact match (the normalized answer exactly match the gold answer) - 'f1': (predictions, references), # The F-score of predicted tokens versus the gold answer - 'HasAns_exact': (predictions, references), # Exact match (the normalized answer exactly match the gold answer) - 'HasAns_f1': (predictions, references), # The F-score of predicted tokens versus the gold answer - 'NoAns_exact': (predictions, references), # Exact match (the normalized answer exactly match the gold answer) - 'NoAns_f1': (predictions, references), # The F-score of predicted tokens versus the gold answer - 'best_exact': (predictions, references), # Best exact match (with varying threshold) - 'best_f1': (predictions, references), # Best F1 (with varying threshold) - } - - def aggregation(self): - """ - :returns: {str: [float] -> float} - A dictionary where keys are the names of submetrics and values are - functions that aggregate a list of metrics - """ - return { - 'exact': partial(_squad_agg, 'exact'), # Exact match (the normalized answer exactly match the gold answer) - 'f1': partial(_squad_agg, 'f1'), # The F-score of predicted tokens versus the gold answer - 'HasAns_exact': partial(_squad_agg, 'HasAns_exact'), # Exact match (the normalized answer exactly match the gold answer) - 'HasAns_f1': partial(_squad_agg, 'HasAns_f1'), # The F-score of predicted tokens versus the gold answer - 'NoAns_exact': partial(_squad_agg, 'NoAns_exact'), # Exact match (the normalized answer exactly match the gold answer) - 'NoAns_f1': partial(_squad_agg, 'NoAns_f1'), # The F-score of predicted tokens versus the gold answer - 'best_exact': partial(_squad_agg, 'best_exact'), # Best exact match (with varying threshold) - 'best_f1': partial(_squad_agg, 'best_f1'), # Best F1 (with varying threshold) - } - - def higher_is_better(self): - """ - :returns: {str: bool} - A dictionary where keys are the names of submetrics and values are - whether a higher value of the submetric is better - """ - return { - 'exact': True, # Exact match (the normalized answer exactly match the gold answer) - 'f1': True, # The F-score of predicted tokens versus the gold answer - 'HasAns_exact': True, # Exact match (the normalized answer exactly match the gold answer) - 'HasAns_f1': True, # The F-score of predicted tokens versus the gold answer - 'NoAns_exact': True, # Exact match (the normalized answer exactly match the gold answer) - 'NoAns_f1': True, # The F-score of predicted tokens versus the gold answer - 'best_exact': True, # Best exact match (with varying threshold) - 'best_f1': True, # Best F1 (with varying threshold) - } +""" +Know What You Don’t Know: Unanswerable Questions for SQuAD +https://arxiv.org/pdf/1806.03822.pdf + +Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, +consisting of questions posed by crowdworkers on a set of Wikipedia articles, +where the answer to every question is a segment of text, or span, from the +corresponding reading passage, or the question might be unanswerable. +SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable +questions written adversarially by crowdworkers to look similar to answerable ones. 
+To do well on SQuAD2.0, systems must not only answer questions when possible, but +also determine when no answer is supported by the paragraph and abstain from answering. + +Homepage: https://rajpurkar.github.io/SQuAD-explorer/ +""" +import datasets +from math import exp +from lm_eval.base import rf, Task +from functools import partial +from packaging import version + + +_CITATION = """ +@misc{rajpurkar2018know, + title={Know What You Don't Know: Unanswerable Questions for SQuAD}, + author={Pranav Rajpurkar and Robin Jia and Percy Liang}, + year={2018}, + eprint={1806.03822}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +""" + + +def _squad_metric(predictions, references): + squad_metric = datasets.load_metric("squad_v2") + return squad_metric.compute(predictions=predictions, references=references) + + +def _squad_agg(key, items): + predictions, references = zip(*items) + + return _squad_metric(predictions=predictions, references=references)[key] + + +class SQuAD2(Task): + VERSION = 1 + DATASET_PATH = "squad_v2" + DATASET_NAME = None + + # HF changed squad on us so we have to make sure we aren't running the old one + assert version.parse(datasets.__version__) >= version.parse( + "1.11.0" + ), "datasets v1.11.0 or later required for SQuAD" + + def has_training_docs(self): + return True + + def has_validation_docs(self): + return True + + def has_test_docs(self): + return False + + def training_docs(self): + return self.dataset["train"] + + def validation_docs(self): + return self.dataset["validation"] + + def doc_to_text(self, doc): + return ( + "Title: " + + doc["title"] + + "\n\n" + + "Background: " + + doc["context"] + + "\n\n" + + "Question: " + + doc["question"] + + "\n\n" + + "Answer:" + ) + + def should_decontaminate(self): + return True + + def doc_to_decontamination_query(self, doc): + return doc["context"] + + def doc_to_target(self, doc): + answer_list = doc["answers"]["text"] + if len(answer_list) > 0: + answer = answer_list[0] + else: + answer = "unanswerable" + return " " + answer + + def construct_requests(self, doc, ctx): + """Uses RequestFactory to construct Requests and returns an iterable of + Requests which will be sent to the LM. + + :param doc: + The document as returned from training_docs, validation_docs, or test_docs. + :param ctx: str + The context string, generated by fewshot_context. This includes the natural + language description, as well as the few shot examples, and the question + part of the document for `doc`. + """ + continuation = rf.greedy_until(ctx, ["\n"]) + is_unanswerable = rf.loglikelihood(ctx, " " + "unanswerable") + return continuation, is_unanswerable + + def process_results(self, doc, results): + """Take a single document and the LM results and evaluates, returning a + dict where keys are the names of submetrics and values are the values of + the metric for that one document + + :param doc: + The document as returned from training_docs, validation_docs, or test_docs. + :param results: + The results of the requests created in construct_requests. 
+ """ + continuation, (logprob_unanswerable, _) = results + + no_answer_probability = exp(logprob_unanswerable) + + predictions = { + "id": doc["id"], + "prediction_text": continuation, + "no_answer_probability": no_answer_probability, + } + + references = { + "id": doc["id"], + "answers": doc["answers"], + } + + return { + "exact": ( + predictions, + references, + ), # Exact match (the normalized answer exactly match the gold answer) + "f1": ( + predictions, + references, + ), # The F-score of predicted tokens versus the gold answer + "HasAns_exact": ( + predictions, + references, + ), # Exact match (the normalized answer exactly match the gold answer) + "HasAns_f1": ( + predictions, + references, + ), # The F-score of predicted tokens versus the gold answer + "NoAns_exact": ( + predictions, + references, + ), # Exact match (the normalized answer exactly match the gold answer) + "NoAns_f1": ( + predictions, + references, + ), # The F-score of predicted tokens versus the gold answer + "best_exact": ( + predictions, + references, + ), # Best exact match (with varying threshold) + "best_f1": (predictions, references), # Best F1 (with varying threshold) + } + + def aggregation(self): + """ + :returns: {str: [float] -> float} + A dictionary where keys are the names of submetrics and values are + functions that aggregate a list of metrics + """ + return { + "exact": partial( + _squad_agg, "exact" + ), # Exact match (the normalized answer exactly match the gold answer) + "f1": partial( + _squad_agg, "f1" + ), # The F-score of predicted tokens versus the gold answer + "HasAns_exact": partial( + _squad_agg, "HasAns_exact" + ), # Exact match (the normalized answer exactly match the gold answer) + "HasAns_f1": partial( + _squad_agg, "HasAns_f1" + ), # The F-score of predicted tokens versus the gold answer + "NoAns_exact": partial( + _squad_agg, "NoAns_exact" + ), # Exact match (the normalized answer exactly match the gold answer) + "NoAns_f1": partial( + _squad_agg, "NoAns_f1" + ), # The F-score of predicted tokens versus the gold answer + "best_exact": partial( + _squad_agg, "best_exact" + ), # Best exact match (with varying threshold) + "best_f1": partial( + _squad_agg, "best_f1" + ), # Best F1 (with varying threshold) + } + + def higher_is_better(self): + """ + :returns: {str: bool} + A dictionary where keys are the names of submetrics and values are + whether a higher value of the submetric is better + """ + return { + "exact": True, # Exact match (the normalized answer exactly match the gold answer) + "f1": True, # The F-score of predicted tokens versus the gold answer + "HasAns_exact": True, # Exact match (the normalized answer exactly match the gold answer) + "HasAns_f1": True, # The F-score of predicted tokens versus the gold answer + "NoAns_exact": True, # Exact match (the normalized answer exactly match the gold answer) + "NoAns_f1": True, # The F-score of predicted tokens versus the gold answer + "best_exact": True, # Best exact match (with varying threshold) + "best_f1": True, # Best F1 (with varying threshold) + } diff --git a/lm_eval/tasks/storycloze.py b/lm_eval/tasks/storycloze.py index cbc8331afd..5d3c7a6d55 100644 --- a/lm_eval/tasks/storycloze.py +++ b/lm_eval/tasks/storycloze.py @@ -65,23 +65,27 @@ def test_docs(self): return self.dataset["test"] def doc_to_text(self, doc): - return ' '.join([ - doc["input_sentence_1"], - doc["input_sentence_2"], - doc["input_sentence_3"], - doc["input_sentence_4"], - ]) + return " ".join( + [ + doc["input_sentence_1"], + doc["input_sentence_2"], 
+ doc["input_sentence_3"], + doc["input_sentence_4"], + ] + ) def should_decontaminate(self): return True def doc_to_decontamination_query(self, doc): - return ' '.join([ - doc["input_sentence_1"], - doc["input_sentence_2"], - doc["input_sentence_3"], - doc["input_sentence_4"], - ]) + return " ".join( + [ + doc["input_sentence_1"], + doc["input_sentence_2"], + doc["input_sentence_3"], + doc["input_sentence_4"], + ] + ) def doc_to_target(self, doc): clozes = [doc["sentence_quiz1"], doc["sentence_quiz2"]] @@ -89,7 +93,7 @@ def doc_to_target(self, doc): return " " + clozes[doc["answer_right_ending"] - 1] def construct_requests(self, doc, ctx): - """ Uses RequestFactory to construct Requests and returns an iterable of + """Uses RequestFactory to construct Requests and returns an iterable of Requests which will be sent to the LM. :param doc: @@ -100,10 +104,7 @@ def construct_requests(self, doc, ctx): part of the document for `doc`. """ clozes = [doc["sentence_quiz1"], doc["sentence_quiz2"]] - lls = [ - rf.loglikelihood(ctx, " {}".format(choice))[0] - for choice in clozes - ] + lls = [rf.loglikelihood(ctx, " {}".format(choice))[0] for choice in clozes] return lls def process_results(self, doc, results): @@ -117,10 +118,8 @@ def process_results(self, doc, results): The results of the requests created in construct_requests. """ gold = doc["answer_right_ending"] - 1 - acc = 1. if np.argmax(results) == gold else 0. - return { - "acc": acc - } + acc = 1.0 if np.argmax(results) == gold else 0.0 + return {"acc": acc} def aggregation(self): """ @@ -128,9 +127,7 @@ def aggregation(self): A dictionary where keys are the names of submetrics and values are functions that aggregate a list of metrics """ - return { - "acc": mean - } + return {"acc": mean} def higher_is_better(self): """ @@ -138,9 +135,7 @@ def higher_is_better(self): A dictionary where keys are the names of submetrics and values are whether a higher value of the submetric is better """ - return { - "acc": True - } + return {"acc": True} class StoryCloze2016(StoryCloze): diff --git a/lm_eval/tasks/superglue.py b/lm_eval/tasks/superglue.py index 8fa4eaf1d4..f388726ce3 100644 --- a/lm_eval/tasks/superglue.py +++ b/lm_eval/tasks/superglue.py @@ -61,15 +61,15 @@ def should_decontaminate(self): return True def doc_to_decontamination_query(self, doc): - return doc['passage'] - + return doc["passage"] + def doc_to_target(self, doc): - return " " + yesno(doc['label']) + return " " + yesno(doc["label"]) def construct_requests(self, doc, ctx): - ll_yes, _ = rf.loglikelihood(ctx, ' yes') - ll_no, _ = rf.loglikelihood(ctx, ' no') + ll_yes, _ = rf.loglikelihood(ctx, " yes") + ll_no, _ = rf.loglikelihood(ctx, " no") return ll_yes, ll_no @@ -77,21 +77,15 @@ def process_results(self, doc, results): ll_yes, ll_no = results gold = doc["label"] - acc = 1. if (ll_yes > ll_no) == gold else 0. 
+ acc = 1.0 if (ll_yes > ll_no) == gold else 0.0 + + return {"acc": acc} - return { - "acc": acc - } - def higher_is_better(self): - return { - "acc": True - } - + return {"acc": True} + def aggregation(self): - return { - "acc": mean - } + return {"acc": mean} class CommitmentBank(Task): @@ -129,27 +123,21 @@ def doc_to_target(self, doc): return " {}".format({0: "True", 1: "False", 2: "Neither"}[doc["label"]]) def construct_requests(self, doc, ctx): - ll_true, _ = rf.loglikelihood(ctx, ' True') - ll_false, _ = rf.loglikelihood(ctx, ' False') - ll_neither, _ = rf.loglikelihood(ctx, ' Neither') + ll_true, _ = rf.loglikelihood(ctx, " True") + ll_false, _ = rf.loglikelihood(ctx, " False") + ll_neither, _ = rf.loglikelihood(ctx, " Neither") return ll_true, ll_false, ll_neither def process_results(self, doc, results): gold = doc["label"] pred = np.argmax(results) - acc = 1. if pred == gold else 0. + acc = 1.0 if pred == gold else 0.0 + + return {"acc": acc, "f1": (pred, gold)} - return { - "acc": acc, - "f1": (pred, gold) - } - def higher_is_better(self): - return { - "acc": True, - "f1": True - } + return {"acc": True, "f1": True} @classmethod def cb_multi_fi(cls, items): @@ -161,7 +149,7 @@ def cb_multi_fi(cls, items): f13 = sklearn.metrics.f1_score(y_true=golds == 2, y_pred=preds == 2) avg_f1 = mean([f11, f12, f13]) return avg_f1 - + def aggregation(self): return { "acc": mean, @@ -207,7 +195,7 @@ def doc_to_target(self, doc): def construct_requests(self, doc, ctx): choice1 = " " + self.convert_choice(doc["choice1"]) choice2 = " " + self.convert_choice(doc["choice2"]) - + ll_choice1, _ = rf.loglikelihood(ctx, choice1) ll_choice2, _ = rf.loglikelihood(ctx, choice2) @@ -216,21 +204,15 @@ def construct_requests(self, doc, ctx): def process_results(self, doc, results): gold = doc["label"] pred = np.argmax(results) - acc = 1. if pred == gold else 0. 
+ acc = 1.0 if pred == gold else 0.0 + + return {"acc": acc} - return { - "acc": acc - } - def higher_is_better(self): - return { - "acc": True - } - + return {"acc": True} + def aggregation(self): - return { - "acc": mean - } + return {"acc": mean} @staticmethod def convert_choice(choice): @@ -273,28 +255,22 @@ def format_answer(answer, label): def construct_requests(self, doc, ctx): true_choice = self.format_answer(answer=doc["answer"], label=True) false_choice = self.format_answer(answer=doc["answer"], label=False) - - ll_true_choice, _ = rf.loglikelihood(ctx, f' {true_choice}') - ll_false_choice, _ = rf.loglikelihood(ctx, f' {false_choice}') + + ll_true_choice, _ = rf.loglikelihood(ctx, f" {true_choice}") + ll_false_choice, _ = rf.loglikelihood(ctx, f" {false_choice}") return ll_true_choice, ll_false_choice def process_results(self, doc, results): ll_true_choice, ll_false_choice = results pred = ll_true_choice > ll_false_choice - return { - "acc": (pred, doc) - } - + return {"acc": (pred, doc)} + def higher_is_better(self): - return { - "acc": True - } - + return {"acc": True} + def aggregation(self): - return { - "acc": acc_all - } + return {"acc": acc_all} class ReCoRD(Task): @@ -343,7 +319,7 @@ def doc_to_text(self, doc): @classmethod def format_answer(cls, query, entity): - return f' - {query}'.replace("@placeholder", entity) + return f" - {query}".replace("@placeholder", entity) def doc_to_target(self, doc): # We only output the first correct entity in a doc @@ -365,8 +341,12 @@ def process_results(self, doc, results): prediction = doc["entities"][max_idx] gold_label_set = doc["answers"] - f1 = metric_max_over_ground_truths(squad_metrics.compute_f1, prediction, gold_label_set) - em = metric_max_over_ground_truths(squad_metrics.compute_exact, prediction, gold_label_set) + f1 = metric_max_over_ground_truths( + squad_metrics.compute_f1, prediction, gold_label_set + ) + em = metric_max_over_ground_truths( + squad_metrics.compute_exact, prediction, gold_label_set + ) return { "f1": f1, @@ -409,19 +389,21 @@ def validation_docs(self): return self.dataset["validation"] def doc_to_text(self, doc): - return "Sentence 1: {}\nSentence 2: {}\nQuestion: Is the word '{}' used in the same way in the" \ - " two sentences above?\nAnswer:".format( - doc["sentence1"], - doc["sentence2"], - doc["sentence1"][doc["start1"]:doc["end1"]], - ) + return ( + "Sentence 1: {}\nSentence 2: {}\nQuestion: Is the word '{}' used in the same way in the" + " two sentences above?\nAnswer:".format( + doc["sentence1"], + doc["sentence2"], + doc["sentence1"][doc["start1"] : doc["end1"]], + ) + ) def doc_to_target(self, doc): return " {}".format({0: "no", 1: "yes"}[doc["label"]]) def construct_requests(self, doc, ctx): - ll_yes, _ = rf.loglikelihood(ctx, ' yes') - ll_no, _ = rf.loglikelihood(ctx, ' no') + ll_yes, _ = rf.loglikelihood(ctx, " yes") + ll_no, _ = rf.loglikelihood(ctx, " no") return ll_yes, ll_no @@ -429,21 +411,15 @@ def process_results(self, doc, results): ll_yes, ll_no = results gold = doc["label"] - acc = 1. if (ll_yes > ll_no) == gold else 0. 
+ acc = 1.0 if (ll_yes > ll_no) == gold else 0.0 - return { - "acc": acc - } + return {"acc": acc} def higher_is_better(self): - return { - "acc": True - } + return {"acc": True} def aggregation(self): - return { - "acc": mean - } + return {"acc": mean} class SGWinogradSchemaChallenge(Task): @@ -467,9 +443,7 @@ def training_docs(self): if self._training_docs is None: # GPT-3 Paper's format only uses positive examples for fewshot "training" self._training_docs = [ - doc for doc in - self.dataset["train"] - if doc["label"] + doc for doc in self.dataset["train"] if doc["label"] ] return self._training_docs @@ -479,25 +453,25 @@ def validation_docs(self): def doc_to_text(self, doc): raw_passage = doc["text"] # NOTE: HuggingFace span indices are word-based not character-based. - pre = " ".join(raw_passage.split()[:doc["span2_index"]]) - post = raw_passage[len(pre) + len(doc["span2_text"]) + 1:] - passage = general_detokenize(pre + " *{}*".format(doc['span2_text']) + post) + pre = " ".join(raw_passage.split()[: doc["span2_index"]]) + post = raw_passage[len(pre) + len(doc["span2_text"]) + 1 :] + passage = general_detokenize(pre + " *{}*".format(doc["span2_text"]) + post) noun = doc["span1_text"] pronoun = doc["span2_text"] text = ( f"Passage: {passage}\n" - + f"Question: In the passage above, does the pronoun \"*{pronoun}*\" refer to \"*{noun}*\"?\n" + + f'Question: In the passage above, does the pronoun "*{pronoun}*" refer to "*{noun}*"?\n' + "Answer:" ) return text def doc_to_target(self, doc): - return " " + yesno(doc['label']) + return " " + yesno(doc["label"]) def construct_requests(self, doc, ctx): - ll_yes, _ = rf.loglikelihood(ctx, ' yes') - ll_no, _ = rf.loglikelihood(ctx, ' no') + ll_yes, _ = rf.loglikelihood(ctx, " yes") + ll_no, _ = rf.loglikelihood(ctx, " no") return ll_yes, ll_no @@ -505,18 +479,12 @@ def process_results(self, doc, results): ll_yes, ll_no = results gold = doc["label"] - acc = 1. if (ll_yes > ll_no) == gold else 0. + acc = 1.0 if (ll_yes > ll_no) == gold else 0.0 - return { - "acc": acc - } + return {"acc": acc} def higher_is_better(self): - return { - "acc": True - } + return {"acc": True} def aggregation(self): - return { - "acc": mean - } + return {"acc": mean} diff --git a/lm_eval/tasks/translation.py b/lm_eval/tasks/translation.py index 31000ca93f..30a9ee6b7d 100644 --- a/lm_eval/tasks/translation.py +++ b/lm_eval/tasks/translation.py @@ -41,44 +41,57 @@ def create_tasks_from_benchmarks(benchmark_dict): :return: {task_name: task} e.g. 
{wmt14-fr-en: Task, wmt16-de-en: Task} """ + def version_of(dataset, language_pair): if language_pair[-2:] in ["zh", "ja"]: - return 1 # changed to use jieba/nagisa + return 1 # changed to use jieba/nagisa return 0 return { - f"{dataset}-{language_pair}": create_translation_task(dataset, language_pair, version_of(dataset, language_pair)) + f"{dataset}-{language_pair}": create_translation_task( + dataset, language_pair, version_of(dataset, language_pair) + ) for dataset, language_pairs in benchmark_dict.items() for language_pair in language_pairs } + ######################################## # Language Specifics ######################################## + def zh_split(zh_text: List[str]) -> List[str]: """Chinese splitting""" import jieba + return [" ".join(jieba.cut(txt.strip())) for txt in zh_text] + def ja_split(ja_text: List[str]) -> List[str]: """Japanese splitting""" import nagisa + return [" ".join(nagisa.tagging(txt.strip()).words) for txt in ja_text] + NO_SPACE_LANG = {"zh": zh_split, "ja": ja_split} ######################################## # Tasks ######################################## + def create_translation_task(dataset, language_pair, version=0): class TranslationTask(GeneralTranslationTask): VERSION = version + def __init__(self): super().__init__(dataset, language_pair) + return TranslationTask + class GeneralTranslationTask(Task): VERSION = 0 @@ -92,8 +105,9 @@ def __init__(self, sacrebleu_dataset, sacrebleu_language_pair=None): def download(self, data_dir=None, cache_dir=None, download_mode=None): # This caches in the users home dir automatically - self.src_file, self.ref_file = \ - sacrebleu.download_test_set(self.sacrebleu_dataset, self.sacrebleu_language_pair) + self.src_file, self.ref_file = sacrebleu.download_test_set( + self.sacrebleu_dataset, self.sacrebleu_language_pair + ) self.src_data, self.ref_data = [ [line.rstrip() for line in sacrebleu.smart_open(file)] for file in (self.src_file, self.ref_file) @@ -117,10 +131,9 @@ def test_docs(self): :return: Iterable[obj] A iterable of any object, that doc_to_text can handle """ - return [{ - "src": src, - "ref": ref - } for src, ref in zip(self.src_data, self.ref_data)] + return [ + {"src": src, "ref": ref} for src, ref in zip(self.src_data, self.ref_data) + ] def doc_to_text(self, doc): language_codes = self.sacrebleu_language_pair.split("-") @@ -139,7 +152,7 @@ def doc_to_target(self, doc): return " " + doc["ref"] if isinstance(doc["ref"], str) else doc["ref"][0] def construct_requests(self, doc, ctx): - """ Uses RequestFactory to construct Requests and returns an iterable of + """Uses RequestFactory to construct Requests and returns an iterable of Requests which will be sent to the LM. 
:param doc: diff --git a/lm_eval/tasks/triviaqa.py b/lm_eval/tasks/triviaqa.py index 4bf9d77ff3..50def64a4f 100644 --- a/lm_eval/tasks/triviaqa.py +++ b/lm_eval/tasks/triviaqa.py @@ -43,10 +43,10 @@ def has_test_docs(self): return False def training_docs(self): - return self.dataset['train'] + return self.dataset["train"] def validation_docs(self): - return self.dataset['validation'] + return self.dataset["validation"] def test_docs(self): raise NotImplementedError() @@ -58,10 +58,10 @@ def should_decontaminate(self): return True def doc_to_decontamination_query(self, doc): - return doc['question'] + return doc["question"] def doc_to_target(self, doc): - return " " + doc['answer']['value'] + return " " + doc["answer"]["value"] def _remove_prefixes(self, aliases): # Optimization: Remove any alias that has a strict prefix elsewhere in the list @@ -75,15 +75,13 @@ def _remove_prefixes(self, aliases): def construct_requests(self, doc, ctx): ret = [] - for alias in self._remove_prefixes(doc['answer']['aliases']): + for alias in self._remove_prefixes(doc["answer"]["aliases"]): _, is_prediction = rf.loglikelihood(ctx, " " + alias) ret.append(is_prediction) return ret def process_results(self, doc, results): - return { - "acc": float(any(results)) - } + return {"acc": float(any(results))} def aggregation(self): return { @@ -91,6 +89,4 @@ def aggregation(self): } def higher_is_better(self): - return { - "acc": True - } + return {"acc": True} diff --git a/lm_eval/tasks/truthfulqa.py b/lm_eval/tasks/truthfulqa.py index 1293080873..4e61e80496 100644 --- a/lm_eval/tasks/truthfulqa.py +++ b/lm_eval/tasks/truthfulqa.py @@ -82,28 +82,29 @@ def test_docs(self): raise NotImplementedError() def doc_to_text(self, doc): - return QA_PROMPT + "\n\nQ: " + doc['question'] + "\nA:" + return QA_PROMPT + "\n\nQ: " + doc["question"] + "\nA:" def should_decontaminate(self): return True def doc_to_decontamination_query(self, doc): - return doc['question'] + return doc["question"] def doc_to_target(self, doc): return " " - def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None): - assert num_fewshot == 0, "TruthfulQA is intended only for the zero-shot setting." + def fewshot_context( + self, doc, num_fewshot, provide_description=None, rnd=None, description=None + ): + assert ( + num_fewshot == 0 + ), "TruthfulQA is intended only for the zero-shot setting." return super().fewshot_context( - doc=doc, - num_fewshot=num_fewshot, - rnd=rnd, - description=description + doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description ) def construct_requests(self, doc, ctx): - """ Uses RequestFactory to construct Requests and returns an iterable of + """Uses RequestFactory to construct Requests and returns an iterable of Requests which will be sent to the LM. :param doc: @@ -113,11 +114,15 @@ def construct_requests(self, doc, ctx): language description, as well as the few shot examples, and the question part of the document for `doc`. """ + def get_lls(targets): return [rf.loglikelihood(ctx, " " + t)[0] for t in targets] + # MC1 and MC2 targets are not always the same set of strings so we collect # likelihoods separately for simpler processing. 
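# Layout sketch (hypothetical sizes): with 4 mc1 choices and 5 mc2 choices,
# this returns 9 loglikelihoods ordered [mc1..., mc2...]; process_results
# later recovers the two groups by splitting at len(doc["mc1_targets"]["choices"]).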
- return get_lls(doc['mc1_targets']["choices"]) + get_lls(doc['mc2_targets']["choices"]) + return get_lls(doc["mc1_targets"]["choices"]) + get_lls( + doc["mc2_targets"]["choices"] + ) def process_results(self, doc, results): """Take a single document and the LM results and evaluates, returning a @@ -129,37 +134,29 @@ def process_results(self, doc, results): :param results: The results of the requests created in construct_requests. """ + def mc1(lls): # The gold answers in `mc1_targets` are always first (index = `0`). return np.argmax(lls) == 0 def mc2(lls): # Split on the first `0` as everything before it is true (`1`). - split_idx = list(doc['mc2_targets']["labels"]).index(0) + split_idx = list(doc["mc2_targets"]["labels"]).index(0) # Compute the normalized probability mass for the correct answer. ll_true, ll_false = lls[:split_idx], lls[split_idx:] p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false)) p_true = p_true / (sum(p_true) + sum(p_false)) return sum(p_true) - split_idx = len(doc['mc1_targets']["choices"]) + split_idx = len(doc["mc1_targets"]["choices"]) mc1_lls, mc2_lls = results[:split_idx], results[split_idx:] - return { - "mc1": mc1(mc1_lls), - "mc2": mc2(mc2_lls) - } + return {"mc1": mc1(mc1_lls), "mc2": mc2(mc2_lls)} def aggregation(self): - return { - "mc1": mean, - "mc2": mean - } + return {"mc1": mean, "mc2": mean} def higher_is_better(self): - return { - "mc1": True, - "mc2": True - } + return {"mc1": True, "mc2": True} class TruthfulQAGeneration(Task): @@ -189,44 +186,45 @@ def _format_answers(self, answers): answer = answer.strip() if len(answer): # Add a period after all answers. - if answer[-1] != '.': - formatted_answers.append(answer + '.') + if answer[-1] != ".": + formatted_answers.append(answer + ".") else: formatted_answers.append(answer) return formatted_answers def validation_docs(self): for doc in self.dataset["validation"]: - incorrect_answers = self._format_answers(doc['incorrect_answers']) - correct_answers = self._format_answers(doc['correct_answers']) + incorrect_answers = self._format_answers(doc["incorrect_answers"]) + correct_answers = self._format_answers(doc["correct_answers"]) if "I have no comment." not in correct_answers: correct_answers.append("I have no comment.") yield { - 'question': doc['question'].strip(), - 'correct_answers': correct_answers, - 'incorrect_answers': incorrect_answers + "question": doc["question"].strip(), + "correct_answers": correct_answers, + "incorrect_answers": incorrect_answers, } def test_docs(self): raise NotImplementedError() def doc_to_text(self, doc): - return QA_PROMPT + "\n\nQ: " + doc['question'] + return QA_PROMPT + "\n\nQ: " + doc["question"] def doc_to_target(self, doc): return " " - def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None): - assert num_fewshot == 0, "TruthfulQA is intended only for the zero-shot setting." + def fewshot_context( + self, doc, num_fewshot, provide_description=None, rnd=None, description=None + ): + assert ( + num_fewshot == 0 + ), "TruthfulQA is intended only for the zero-shot setting." return super().fewshot_context( - doc=doc, - num_fewshot=num_fewshot, - rnd=rnd, - description=description + doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description ) def construct_requests(self, doc, ctx): - """ Uses RequestFactory to construct Requests and returns an iterable of + """Uses RequestFactory to construct Requests and returns an iterable of Requests which will be sent to the LM. 
:param doc: @@ -237,7 +235,7 @@ def construct_requests(self, doc, ctx): part of the document for `doc`. """ # TODO: Find a way to cap the number of generated tokens to `50` as in the official implementation. - completion = rf.greedy_until(ctx, ['.']) + completion = rf.greedy_until(ctx, ["."]) return completion def process_results(self, doc, results): @@ -251,18 +249,18 @@ def process_results(self, doc, results): The results of the requests created in construct_requests. """ completion = results[0].strip() - true_refs, false_refs = doc['correct_answers'], doc['incorrect_answers'] + true_refs, false_refs = doc["correct_answers"], doc["incorrect_answers"] all_refs = true_refs + false_refs # Process the sentence-level BLEURT, BLEU, and ROUGE for similarity measures. # BLEURT bleurt_scores_true = self.bleurt.compute( - predictions=[completion] * len(true_refs), - references=true_refs)['scores'] + predictions=[completion] * len(true_refs), references=true_refs + )["scores"] bleurt_scores_false = self.bleurt.compute( - predictions=[completion] * len(false_refs), - references=false_refs)['scores'] + predictions=[completion] * len(false_refs), references=false_refs + )["scores"] bleurt_correct = max(bleurt_scores_true) bleurt_incorrect = max(bleurt_scores_false) bleurt_max = bleurt_correct @@ -271,8 +269,8 @@ def process_results(self, doc, results): # BLEU bleu_scores = [self.bleu([[ref]], [completion]) for ref in all_refs] - bleu_correct = np.nanmax(bleu_scores[:len(true_refs)]) - bleu_incorrect = np.nanmax(bleu_scores[len(true_refs):]) + bleu_correct = np.nanmax(bleu_scores[: len(true_refs)]) + bleu_incorrect = np.nanmax(bleu_scores[len(true_refs) :]) bleu_max = bleu_correct bleu_diff = bleu_correct - bleu_incorrect bleu_acc = int(bleu_correct > bleu_incorrect) @@ -280,23 +278,23 @@ def process_results(self, doc, results): # ROUGE-N rouge_scores = [self.rouge([ref], [completion]) for ref in all_refs] # ROUGE-1 - rouge1_scores = [score['rouge1'] for score in rouge_scores] - rouge1_correct = np.nanmax(rouge1_scores[:len(true_refs)]) - rouge1_incorrect = np.nanmax(rouge1_scores[len(true_refs):]) + rouge1_scores = [score["rouge1"] for score in rouge_scores] + rouge1_correct = np.nanmax(rouge1_scores[: len(true_refs)]) + rouge1_incorrect = np.nanmax(rouge1_scores[len(true_refs) :]) rouge1_max = rouge1_correct rouge1_diff = rouge1_correct - rouge1_incorrect rouge1_acc = int(rouge1_correct > rouge1_incorrect) # ROUGE-2 - rouge2_scores = [score['rouge2'] for score in rouge_scores] - rouge2_correct = np.nanmax(rouge2_scores[:len(true_refs)]) - rouge2_incorrect = np.nanmax(rouge2_scores[len(true_refs):]) + rouge2_scores = [score["rouge2"] for score in rouge_scores] + rouge2_correct = np.nanmax(rouge2_scores[: len(true_refs)]) + rouge2_incorrect = np.nanmax(rouge2_scores[len(true_refs) :]) rouge2_max = rouge2_correct rouge2_diff = rouge2_correct - rouge2_incorrect rouge2_acc = int(rouge2_correct > rouge2_incorrect) # ROUGE-L - rougeL_scores = [score['rougeLsum'] for score in rouge_scores] - rougeL_correct = np.nanmax(rougeL_scores[:len(true_refs)]) - rougeL_incorrect = np.nanmax(rougeL_scores[len(true_refs):]) + rougeL_scores = [score["rougeLsum"] for score in rouge_scores] + rougeL_correct = np.nanmax(rougeL_scores[: len(true_refs)]) + rougeL_incorrect = np.nanmax(rougeL_scores[len(true_refs) :]) rougeL_max = rougeL_correct rougeL_diff = rougeL_correct - rougeL_incorrect rougeL_acc = int(rougeL_correct > rougeL_incorrect) @@ -305,19 +303,15 @@ def process_results(self, doc, results): "bleurt_max": 
bleurt_max, "bleurt_acc": bleurt_acc, "bleurt_diff": bleurt_diff, - "bleu_max": bleu_max, "bleu_acc": bleu_acc, "bleu_diff": bleu_diff, - "rouge1_max": rouge1_max, "rouge1_acc": rouge1_acc, "rouge1_diff": rouge1_diff, - "rouge2_max": rouge2_max, "rouge2_acc": rouge2_acc, "rouge2_diff": rouge2_diff, - "rougeL_max": rougeL_max, "rougeL_acc": rougeL_acc, "rougeL_diff": rougeL_diff, @@ -328,19 +322,15 @@ def aggregation(self): "bleurt_max": mean, "bleurt_acc": mean, "bleurt_diff": mean, - "bleu_max": mean, "bleu_acc": mean, "bleu_diff": mean, - "rouge1_max": mean, "rouge1_acc": mean, "rouge1_diff": mean, - "rouge2_max": mean, "rouge2_acc": mean, "rouge2_diff": mean, - "rougeL_max": mean, "rougeL_acc": mean, "rougeL_diff": mean, @@ -351,19 +341,15 @@ def higher_is_better(self): "bleurt_max": True, "bleurt_acc": True, "bleurt_diff": True, - "bleu_max": True, "bleu_acc": True, "bleu_diff": True, - "rouge1_max": True, "rouge1_acc": True, "rouge1_diff": True, - "rouge2_max": True, "rouge2_acc": True, "rouge2_diff": True, - "rougeL_max": True, "rougeL_acc": True, "rougeL_diff": True, @@ -387,7 +373,7 @@ def bleu(self, refs, preds): force=False, lowercase=False, tokenize="intl", - use_effective_order=False + use_effective_order=False, ).score return score @@ -407,6 +393,7 @@ def rouge(self, refs, preds): def _prepare_summary(summary): summary = summary.replace(" . ", ".\n") return summary + # Accumulate confidence intervals. aggregator = scoring.BootstrapAggregator() for ref, pred in zip(refs, preds): @@ -414,4 +401,4 @@ def _prepare_summary(summary): pred = _prepare_summary(pred) aggregator.add_scores(scorer.score(ref, pred)) result = aggregator.aggregate() - return {type: result[type].mid.fmeasure*100 for type in rouge_types} + return {type: result[type].mid.fmeasure * 100 for type in rouge_types} diff --git a/lm_eval/tasks/unscramble.py b/lm_eval/tasks/unscramble.py index 69f73a4072..dd772c6de3 100644 --- a/lm_eval/tasks/unscramble.py +++ b/lm_eval/tasks/unscramble.py @@ -65,19 +65,13 @@ def construct_requests(self, doc, ctx): def process_results(self, doc, results): pred = results[0] gold = doc["completion"] - return { - "acc": int(pred == gold) - } + return {"acc": int(pred == gold)} def aggregation(self): - return { - "acc": mean - } + return {"acc": mean} def higher_is_better(self): - return { - "acc": True - } + return {"acc": True} class Anagrams1(WordUnscrambleTask): diff --git a/lm_eval/tasks/webqs.py b/lm_eval/tasks/webqs.py index 3848659441..42982620d8 100644 --- a/lm_eval/tasks/webqs.py +++ b/lm_eval/tasks/webqs.py @@ -54,20 +54,20 @@ def test_docs(self): return self.dataset["test"] def doc_to_text(self, doc): - return "Question: " + doc['question'] + '\nAnswer:' + return "Question: " + doc["question"] + "\nAnswer:" def should_decontaminate(self): return True def doc_to_decontamination_query(self, doc): - return doc['question'] + return doc["question"] def doc_to_target(self, doc): - # this picks one answer to be the "correct" one, despite sometimes + # this picks one answer to be the "correct" one, despite sometimes # multiple correct answers being possible. 
# TODO: make sure we're actually handling multi-answer correctly - return " " + doc['answers'][0] - + return " " + doc["answers"][0] + def _remove_prefixes(self, aliases): # Optimization: Remove any alias that has a strict prefix elsewhere in the list # we can do this because if the prefix is acceptable by isgreedy, we can stop looking @@ -81,15 +81,13 @@ def _remove_prefixes(self, aliases): def construct_requests(self, doc, ctx): ret = [] - for alias in self._remove_prefixes(doc['answers']): + for alias in self._remove_prefixes(doc["answers"]): _, is_prediction = rf.loglikelihood(ctx, " " + alias) ret.append(is_prediction) return ret def process_results(self, doc, results): - return { - "acc": float(any(results)) - } + return {"acc": float(any(results))} def aggregation(self): return { @@ -97,6 +95,4 @@ def aggregation(self): } def higher_is_better(self): - return { - "acc": True - } + return {"acc": True} diff --git a/lm_eval/tasks/wikitext.py b/lm_eval/tasks/wikitext.py index fc899b5d1f..fb9d3ee7cc 100644 --- a/lm_eval/tasks/wikitext.py +++ b/lm_eval/tasks/wikitext.py @@ -2,7 +2,7 @@ Pointer Sentinel Mixture Models https://arxiv.org/pdf/1609.07843.pdf -The WikiText language modeling dataset is a collection of over 100 million tokens +The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified Good and Featured articles on Wikipedia. NOTE: This `Task` is based on WikiText-2. @@ -17,7 +17,7 @@ _CITATION = """ @misc{merity2016pointer, - title={Pointer Sentinel Mixture Models}, + title={Pointer Sentinel Mixture Models}, author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher}, year={2016}, eprint={1609.07843}, diff --git a/lm_eval/tasks/winogrande.py b/lm_eval/tasks/winogrande.py index 7b16b22c62..08c940810d 100644 --- a/lm_eval/tasks/winogrande.py +++ b/lm_eval/tasks/winogrande.py @@ -1,138 +1,132 @@ -""" -WinoGrande: An Adversarial Winograd Schema Challenge at Scale -https://arxiv.org/pdf/1907.10641.pdf - -WinoGrande is a collection of 44k problems, inspired by Winograd Schema Challenge -(Levesque, Davis, and Morgenstern 2011), but adjusted to improve the scale and -robustness against the dataset-specific bias. Formulated as a fill-in-a-blank -task with binary options, the goal is to choose the right option for a given -sentence which requires commonsense reasoning. - -NOTE: This evaluation of Winogrande uses partial evaluation as described by -Trinh & Le in Simple Method for Commonsense Reasoning (2018). 
-See: https://arxiv.org/abs/1806.02847 - -Homepage: https://leaderboard.allenai.org/winogrande/submissions/public -""" -import numpy as np -from lm_eval.base import rf, Task -from lm_eval.metrics import mean - - -_CITATION = """ -@article{sakaguchi2019winogrande, - title={WinoGrande: An Adversarial Winograd Schema Challenge at Scale}, - author={Sakaguchi, Keisuke and Bras, Ronan Le and Bhagavatula, Chandra and Choi, Yejin}, - journal={arXiv preprint arXiv:1907.10641}, - year={2019} -} -""" - - -class Winogrande(Task): - VERSION = 0 - DATASET_PATH = "winogrande" - DATASET_NAME = "winogrande_xl" - - answer_to_num = {'1': 0, '2': 1} - - def has_training_docs(self): - return True - - def has_validation_docs(self): - return True - - def has_test_docs(self): - return False - - def training_docs(self): - if self._training_docs is None: - self._training_docs = list(self.dataset["train"]) - return self._training_docs - - def validation_docs(self): - return self.dataset["validation"] - - def doc_to_text(self, doc): - return self.partial_context(doc, doc["option" + doc["answer"]]) - - def should_decontaminate(self): - return True - - def doc_to_decontamination_query(self, doc): - return doc["sentence"] - - @classmethod - def partial_context(cls, doc, option): - # Substitute the pronoun in the sentence with the specified option - # and ignore everything after. - pronoun_loc = doc["sentence"].index("_") - return doc["sentence"][:pronoun_loc] + option - - def doc_to_target(self, doc): - return self.partial_target(doc) - - @classmethod - def partial_target(cls, doc): - # The target is everything after the document specified pronoun. - pronoun_loc = doc["sentence"].index("_") + 1 - return " " + doc["sentence"][pronoun_loc:].strip() - - def construct_requests(self, doc, ctx): - """Uses RequestFactory to construct Requests and returns an iterable of - Requests which will be sent to the LM. - - :param doc: - The document as returned from training_docs, validation_docs, or test_docs. - :param ctx: str - The context string, generated by fewshot_context. This includes the natural - language description, as well as the few shot examples, and the question - part of the document for `doc`. - """ - target = self.partial_target(doc) - lls = [] - for option in [doc["option1"], doc["option2"]]: - partial_ctx = self.partial_context(doc, option) - full_ctx = self.append_context(ctx, partial_ctx) - lls.append(rf.loglikelihood(full_ctx, target)[0]) - return lls - - @classmethod - def append_context(cls, ctx, partial_ctx): - ctx = ctx.split("\n\n") # Each fewshot context is on its own new line. - ctx.pop() # Remove the correct context put in by `doc_to_text`. - return "\n\n".join([*ctx, partial_ctx]) if ctx else partial_ctx - - def process_results(self, doc, results): - """Take a single document and the LM results and evaluates, returning a - dict where keys are the names of submetrics and values are the values of - the metric for that one document - - :param doc: - The document as returned from training_docs, validation_docs, or test_docs. - :param results: - The results of the requests created in construct_requests. 
- """ - return { - "acc": np.argmax(results) == self.answer_to_num[doc["answer"]] - } - - def aggregation(self): - """ - :returns: {str: [float] -> float} - A dictionary where keys are the names of submetrics and values are - functions that aggregate a list of metrics - """ - return { - "acc": mean - } - - def higher_is_better(self): - """ - :returns: {str: bool} - A dictionary where keys are the names of submetrics and values are - whether a higher value of the submetric is better - """ - return { - "acc": True - } +""" +WinoGrande: An Adversarial Winograd Schema Challenge at Scale +https://arxiv.org/pdf/1907.10641.pdf + +WinoGrande is a collection of 44k problems, inspired by Winograd Schema Challenge +(Levesque, Davis, and Morgenstern 2011), but adjusted to improve the scale and +robustness against the dataset-specific bias. Formulated as a fill-in-a-blank +task with binary options, the goal is to choose the right option for a given +sentence which requires commonsense reasoning. + +NOTE: This evaluation of Winogrande uses partial evaluation as described by +Trinh & Le in Simple Method for Commonsense Reasoning (2018). +See: https://arxiv.org/abs/1806.02847 + +Homepage: https://leaderboard.allenai.org/winogrande/submissions/public +""" +import numpy as np +from lm_eval.base import rf, Task +from lm_eval.metrics import mean + + +_CITATION = """ +@article{sakaguchi2019winogrande, + title={WinoGrande: An Adversarial Winograd Schema Challenge at Scale}, + author={Sakaguchi, Keisuke and Bras, Ronan Le and Bhagavatula, Chandra and Choi, Yejin}, + journal={arXiv preprint arXiv:1907.10641}, + year={2019} +} +""" + + +class Winogrande(Task): + VERSION = 0 + DATASET_PATH = "winogrande" + DATASET_NAME = "winogrande_xl" + + answer_to_num = {"1": 0, "2": 1} + + def has_training_docs(self): + return True + + def has_validation_docs(self): + return True + + def has_test_docs(self): + return False + + def training_docs(self): + if self._training_docs is None: + self._training_docs = list(self.dataset["train"]) + return self._training_docs + + def validation_docs(self): + return self.dataset["validation"] + + def doc_to_text(self, doc): + return self.partial_context(doc, doc["option" + doc["answer"]]) + + def should_decontaminate(self): + return True + + def doc_to_decontamination_query(self, doc): + return doc["sentence"] + + @classmethod + def partial_context(cls, doc, option): + # Substitute the pronoun in the sentence with the specified option + # and ignore everything after. + pronoun_loc = doc["sentence"].index("_") + return doc["sentence"][:pronoun_loc] + option + + def doc_to_target(self, doc): + return self.partial_target(doc) + + @classmethod + def partial_target(cls, doc): + # The target is everything after the document specified pronoun. + pronoun_loc = doc["sentence"].index("_") + 1 + return " " + doc["sentence"][pronoun_loc:].strip() + + def construct_requests(self, doc, ctx): + """Uses RequestFactory to construct Requests and returns an iterable of + Requests which will be sent to the LM. + + :param doc: + The document as returned from training_docs, validation_docs, or test_docs. + :param ctx: str + The context string, generated by fewshot_context. This includes the natural + language description, as well as the few shot examples, and the question + part of the document for `doc`. 
+ """ + target = self.partial_target(doc) + lls = [] + for option in [doc["option1"], doc["option2"]]: + partial_ctx = self.partial_context(doc, option) + full_ctx = self.append_context(ctx, partial_ctx) + lls.append(rf.loglikelihood(full_ctx, target)[0]) + return lls + + @classmethod + def append_context(cls, ctx, partial_ctx): + ctx = ctx.split("\n\n") # Each fewshot context is on its own new line. + ctx.pop() # Remove the correct context put in by `doc_to_text`. + return "\n\n".join([*ctx, partial_ctx]) if ctx else partial_ctx + + def process_results(self, doc, results): + """Take a single document and the LM results and evaluates, returning a + dict where keys are the names of submetrics and values are the values of + the metric for that one document + + :param doc: + The document as returned from training_docs, validation_docs, or test_docs. + :param results: + The results of the requests created in construct_requests. + """ + return {"acc": np.argmax(results) == self.answer_to_num[doc["answer"]]} + + def aggregation(self): + """ + :returns: {str: [float] -> float} + A dictionary where keys are the names of submetrics and values are + functions that aggregate a list of metrics + """ + return {"acc": mean} + + def higher_is_better(self): + """ + :returns: {str: bool} + A dictionary where keys are the names of submetrics and values are + whether a higher value of the submetric is better + """ + return {"acc": True} diff --git a/lm_eval/tasks/wsc273.py b/lm_eval/tasks/wsc273.py index 60f9e0becc..c88c7b7768 100644 --- a/lm_eval/tasks/wsc273.py +++ b/lm_eval/tasks/wsc273.py @@ -40,8 +40,19 @@ class WinogradSchemaChallenge273(Task): DATASET_PATH = "winograd_wsc" DATASET_NAME = "wsc273" - upper_pronouns = ["A", "An", "The", "She", "He", - "It", "They", "My", "His", "Her", "Their"] + upper_pronouns = [ + "A", + "An", + "The", + "She", + "He", + "It", + "They", + "My", + "His", + "Her", + "Their", + ] def has_training_docs(self): return False @@ -68,7 +79,7 @@ def __normalize_option(self, doc, option): option += "'s" # Appropriately lowercase the pronoun in the option. pronoun = option.split()[0] - start_of_sentence = doc["text"][doc['pronoun_loc'] - 2] == '.' + start_of_sentence = doc["text"][doc["pronoun_loc"] - 2] == "." if not start_of_sentence and pronoun in self.upper_pronouns: return option.replace(pronoun, pronoun.lower()) return option @@ -95,7 +106,7 @@ def doc_to_decontamination_query(self, doc): def partial_context(cls, doc, option): # Substitute the pronoun in the original text with the specified # option and ignore everything after. - return doc["text"][:doc["pronoun_loc"]] + option + return doc["text"][: doc["pronoun_loc"]] + option def doc_to_target(self, doc): return self.partial_target(doc) @@ -141,9 +152,7 @@ def process_results(self, doc, results): :param results: The results of the requests created in construct_requests. 
""" - return { - "acc": np.argmax(results) == doc["label"] - } + return {"acc": np.argmax(results) == doc["label"]} def aggregation(self): """ @@ -151,9 +160,7 @@ def aggregation(self): A dictionary where keys are the names of submetrics and values are functions that aggregate a list of metrics """ - return { - "acc": mean - } + return {"acc": mean} def higher_is_better(self): """ @@ -161,6 +168,4 @@ def higher_is_better(self): A dictionary where keys are the names of submetrics and values are whether a higher value of the submetric is better """ - return { - "acc": True - } + return {"acc": True} diff --git a/lm_eval/utils.py b/lm_eval/utils.py index e331283866..c2fbe7b298 100644 --- a/lm_eval/utils.py +++ b/lm_eval/utils.py @@ -34,6 +34,7 @@ def simple_parse_args_string(args_string): args_dict[k] = v return args_dict + def join_iters(iters): for iter in iters: yield from iter @@ -46,23 +47,26 @@ def chunks(iter, n): if len(arr) == n: yield arr arr = [] - - if arr: yield arr + + if arr: + yield arr + def group(arr, fn): res = collections.defaultdict(list) for ob in arr: res[fn(ob)].append(ob) - + return list(res.values()) + def general_detokenize(string): string = string.replace(" n't", "n't") string = string.replace(" )", ")") string = string.replace("( ", "(") - string = string.replace("\" ", "\"") - string = string.replace(" \"", "\"") + string = string.replace('" ', '"') + string = string.replace(' "', '"') string = re.sub(r" (['.,])", r"\1", string) return string @@ -94,10 +98,7 @@ def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len # Special handling for first window: predict all tokens first_seq_len = min(max_seq_len, len(token_list)) - yield ( - [prefix_token] + token_list[:first_seq_len - 1], - token_list[:first_seq_len] - ) + yield ([prefix_token] + token_list[: first_seq_len - 1], token_list[:first_seq_len]) predicted += first_seq_len while predicted < len(token_list): @@ -105,61 +106,66 @@ def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len window_end = predicted + window_pred_len yield ( - token_list[window_end - max_seq_len - 1:window_end - 1], - token_list[window_end - window_pred_len:window_end], + token_list[window_end - max_seq_len - 1 : window_end - 1], + token_list[window_end - window_pred_len : window_end], ) predicted += window_pred_len + def make_disjoint_window(pair): - """ Takes output from get_rolling_token_windows and makes the context not overlap with the continuation """ + """Takes output from get_rolling_token_windows and makes the context not overlap with the continuation""" a, b = pair - return a[:-(len(b) - 1)], b + return a[: -(len(b) - 1)], b + class Reorderer: def __init__(self, arr, fn): self.size = len(arr) arr = list(enumerate(arr)) arr = group(arr, lambda x: fn(x[1])) - arr = [ - ([y[0] for y in x], x[0][1]) for x in arr - ] + arr = [([y[0] for y in x], x[0][1]) for x in arr] arr.sort(key=lambda x: fn(x[1])) self.arr = arr - - + def get_reordered(self): return [x[1] for x in self.arr] - + def get_original(self, newarr): res = [None] * self.size cov = [False] * self.size for (inds, _), v in zip(self.arr, newarr): - for ind in inds: + for ind in inds: res[ind] = v cov[ind] = True - + assert all(cov) - + return res + def positional_deprecated(fn): """ - A decorator to nudge users into passing only keyword args (`kwargs`) to the + A decorator to nudge users into passing only keyword args (`kwargs`) to the wrapped function, `fn`. 
""" + @functools.wraps(fn) def _wrapper(*args, **kwargs): - if len(args) != 1 if inspect.ismethod(fn) else 0: - print(f"WARNING: using {fn.__name__} with positional arguments is " + if len(args) != 1 if inspect.ismethod(fn) else 0: + print( + f"WARNING: using {fn.__name__} with positional arguments is " "deprecated and will be disallowed in a future version of " - "lm-evaluation-harness!") + "lm-evaluation-harness!" + ) return fn(*args, **kwargs) + return _wrapper + @positional_deprecated def find_test_root(start_path: pathlib.Path) -> pathlib.Path: """ @@ -169,12 +175,14 @@ def find_test_root(start_path: pathlib.Path) -> pathlib.Path: cur_path = start_path.resolve() max_layers = 3 for _ in range(max_layers): - if (cur_path / 'tests' / 'test_version_stable.py').exists(): + if (cur_path / "tests" / "test_version_stable.py").exists(): return cur_path else: cur_path = cur_path.parent.resolve() - raise FileNotFoundError(f"Unable to find package root within {max_layers} upwards" +\ - f"of {start_path}") + raise FileNotFoundError( + f"Unable to find package root within {max_layers} upwards" + f"of {start_path}" + ) + @positional_deprecated def run_task_tests(task_list: List[str]): @@ -182,9 +190,16 @@ def run_task_tests(task_list: List[str]): Find the package root and run the tests for the given tasks """ package_root = find_test_root(start_path=pathlib.Path(__file__)) - task_string = ' or '.join(task_list) - args = [f'{package_root}/tests/test_version_stable.py', f'--rootdir={package_root}', '-k', f'{task_string}'] + task_string = " or ".join(task_list) + args = [ + f"{package_root}/tests/test_version_stable.py", + f"--rootdir={package_root}", + "-k", + f"{task_string}", + ] sys.path.append(str(package_root)) pytest_return_val = pytest.main(args) if pytest_return_val: - raise ValueError(f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {pytest_return_val}") \ No newline at end of file + raise ValueError( + f"Not all tests for the specified tasks ({task_list}) ran successfully! 
Error code: {pytest_return_val}" + ) diff --git a/main.py b/main.py index f07e6f6aa4..f84b8b85e8 100644 --- a/main.py +++ b/main.py @@ -7,6 +7,7 @@ logging.getLogger("openai").setLevel(logging.WARNING) + class MultiChoice: def __init__(self, choices): self.choices = choices @@ -23,24 +24,26 @@ def __iter__(self): for choice in self.choices: yield choice + def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument('--model', required=True) - parser.add_argument('--model_args', default="") - parser.add_argument('--tasks', default=None, choices=MultiChoice(tasks.ALL_TASKS)) - parser.add_argument('--provide_description', action="store_true") - parser.add_argument('--num_fewshot', type=int, default=0) - parser.add_argument('--batch_size', type=int, default=None) - parser.add_argument('--device', type=str, default=None) - parser.add_argument('--output_path', default=None) - parser.add_argument('--limit', type=int, default=None) - parser.add_argument('--no_cache', action="store_true") - parser.add_argument('--decontamination_ngrams_path', default=None) - parser.add_argument('--description_dict_path', default=None) - parser.add_argument('--check_integrity', action="store_true") + parser.add_argument("--model", required=True) + parser.add_argument("--model_args", default="") + parser.add_argument("--tasks", default=None, choices=MultiChoice(tasks.ALL_TASKS)) + parser.add_argument("--provide_description", action="store_true") + parser.add_argument("--num_fewshot", type=int, default=0) + parser.add_argument("--batch_size", type=int, default=None) + parser.add_argument("--device", type=str, default=None) + parser.add_argument("--output_path", default=None) + parser.add_argument("--limit", type=int, default=None) + parser.add_argument("--no_cache", action="store_true") + parser.add_argument("--decontamination_ngrams_path", default=None) + parser.add_argument("--description_dict_path", default=None) + parser.add_argument("--check_integrity", action="store_true") return parser.parse_args() + # Returns a list containing all values of the source_list that # match at least one of the patterns def pattern_match(patterns, source_list): @@ -50,13 +53,16 @@ def pattern_match(patterns, source_list): task_names.add(matching) return list(task_names) + def main(): args = parse_args() - + assert not args.provide_description # not implemented - + if args.limit: - print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.") + print( + "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT." 
+        )

    if args.tasks is None:
        task_names = tasks.ALL_TASKS
@@ -67,7 +73,7 @@ def main():

    description_dict = {}
    if args.description_dict_path:
-        with open(args.description_dict_path, 'r') as f:
+        with open(args.description_dict_path, "r") as f:
            description_dict = json.load(f)

    results = evaluator.simple_evaluate(
@@ -81,10 +87,10 @@ def main():
        limit=args.limit,
        description_dict=description_dict,
        decontamination_ngrams_path=args.decontamination_ngrams_path,
-        check_integrity=args.check_integrity
+        check_integrity=args.check_integrity,
    )

-    dumped = json.dumps(results, indent=2) 
+    dumped = json.dumps(results, indent=2)
    print(dumped)

    if args.output_path:
diff --git a/pile_statistics.json b/pile_statistics.json
index e7f5514494..116f0eb976 100644
--- a/pile_statistics.json
+++ b/pile_statistics.json
@@ -34,4 +34,4 @@
    196565318,
    203583306
  ]
-}
\ No newline at end of file
+}
diff --git a/scripts/clean_training_data/README.md b/scripts/clean_training_data/README.md
index 30fb775677..4d206519ea 100644
--- a/scripts/clean_training_data/README.md
+++ b/scripts/clean_training_data/README.md
@@ -1,17 +1,17 @@
-janitor.py contains a script to remove benchmark data contamination from training data sets. 
+janitor.py contains a script to remove benchmark data contamination from training data sets.
 It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.14165).

 ## Algorithm
 1) Collects all contamination text files that are to be removed from training data
-2) Filters training data by finding `N`gram matches between the training data 
+2) Filters training data by finding `N`gram matches between the training data
 and any contamination
-   1) `N`grams ignore case and punctation and are split on whitespace. 
+   1) `N`grams ignore case and punctuation and are split on whitespace.
+   2) Matching `N`gram substrings are removed, as is a `window_to_remove` character window around
 the match, splitting the training data into chunks
 3) Any chunks less than `minimum_slice_length` are removed
 4) Training data sets split into more than `too_dirty_cutoff` chunks are considered completely
 contaminated and removed
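A minimal sketch of the matching in step 2, for orientation (illustrative only: `make_ngrams` and `is_contaminated` are hypothetical helpers, not the janitor.py API):

```
import re

def make_ngrams(text, n=13):
    # Step 2.1: ignore case and punctuation, split on whitespace.
    words = re.sub(r"[^\w\s]", "", text.lower()).split()
    return {" ".join(words[i : i + n]) for i in range(max(0, len(words) - n + 1))}

def is_contaminated(training_doc, benchmark_texts, n=13):
    # Step 2.2 begins with an n-gram set intersection; the real script then
    # removes a `window_to_remove` character window around each match.
    benchmark_ngrams = set()
    for text in benchmark_texts:
        benchmark_ngrams |= make_ngrams(text, n)
    return bool(make_ngrams(training_doc, n) & benchmark_ngrams)
```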
- 
+
 OpenAI used:
 ```
 ngram_n = 13
 window_to_remove = 200
 minimum_slice_length = 200
 too_dirty_cutoff = 10
 ```
@@ -31,4 +31,3 @@
 c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor
 ```
 If your compiler isn't linked to Python, you may need to add to the above `-undefined dynamic_lookup`
-
diff --git a/scripts/clean_training_data/compress_and_package.py b/scripts/clean_training_data/compress_and_package.py
index 041462a91b..dfa23e42d9 100644
--- a/scripts/clean_training_data/compress_and_package.py
+++ b/scripts/clean_training_data/compress_and_package.py
@@ -9,11 +9,15 @@
 import logging
 from tqdm_multiprocess.logger import setup_logger_tqdm
+
 logger = logging.getLogger(__name__)

-def process_task(working_directory, output_directory, bucket_file_path, tqdm_func, global_tqdm):
+
+def process_task(
+    working_directory, output_directory, bucket_file_path, tqdm_func, global_tqdm
+):
     command = f"zstd {bucket_file_path}"
-    logger.info(command) 
+    logger.info(command)
     subprocess.call(command, shell=True)

     compressed_file = bucket_file_path + ".zst"
@@ -23,32 +27,38 @@ def process_task(working_directory, output_directory, bucket_file_path, tqdm_fun
     os.remove(bucket_file_path)
     global_tqdm.update()

+
 def compress_and_move(working_directory, output_directory, process_count):
     os.makedirs(output_directory, exist_ok=True)

     original_info_file_path = os.path.join(working_directory, "info.json")
-    assert(os.path.exists(original_info_file_path))
+    assert os.path.exists(original_info_file_path)

     tasks = []
-    bucket_file_paths = glob.glob(os.path.join(working_directory, "output", f"*.bkt.txt.sorted"))
+    bucket_file_paths = glob.glob(
+        os.path.join(working_directory, "output", f"*.bkt.txt.sorted")
+    )
     for bucket_file_path in bucket_file_paths:
         task = (process_task, (working_directory, output_directory, bucket_file_path))
         tasks.append(task)

-    pool = TqdmMultiProcessPool(process_count) 
-    on_done = lambda _ : None
-    on_error = lambda _ : None
+    pool = TqdmMultiProcessPool(process_count)
+    on_done = lambda _: None
+    on_error = lambda _: None

-    global_progress = tqdm(total=len(bucket_file_paths), dynamic_ncols=True, unit="file")
+    global_progress = tqdm(
+        total=len(bucket_file_paths), dynamic_ncols=True, unit="file"
+    )
     _ = pool.map(global_progress, tasks, on_error, on_done)

     shutil.copy(original_info_file_path, os.path.join(output_directory, "info.json"))

-parser = argparse.ArgumentParser(description='sort 13gram buckets')
+
+parser = argparse.ArgumentParser(description="compress and package sorted 13-gram buckets")
 parser.add_argument("-dir", "--working_directory", required=True)
 parser.add_argument("-output", "--output_directory", required=True)
 parser.add_argument("-procs", "--process_count", type=int, default=8)

-if __name__ == '__main__':
+if __name__ == "__main__":
     version = 1.00
     print(f"Running version {version}")
@@ -56,4 +66,4 @@ def compress_and_move(working_directory, output_directory, process_count):
     setup_logger_tqdm(logfile_path)

     args = parser.parse_args()
-    compress_and_move(args.working_directory, args.output_directory, args.process_count)
\ No newline at end of file
+    compress_and_move(args.working_directory, args.output_directory, args.process_count)
diff --git a/scripts/clean_training_data/generate_13_grams.py
b/scripts/clean_training_data/generate_13_grams.py
index f72cf7cc55..27037e394d 100644
--- a/scripts/clean_training_data/generate_13_grams.py
+++ b/scripts/clean_training_data/generate_13_grams.py
@@ -1,9 +1,9 @@
 """
 Outputs all 13-grams found in The Pile.

-Loops through all documents and uses the logic found in janitor.py to extract 13-grams. 
-We bucket each 13-gram by hash into separate file buckets to allow easy parallel processing in the 
-next stage. We also include the current pile document_id with each ngram instance to allow the 
+Loops through all documents and uses the logic found in janitor.py to extract 13-grams.
+We bucket each 13-gram by hash into separate file buckets to allow easy parallel processing in the
+next stage. We also include the current pile document_id with each ngram instance to allow the
 filtering to exclude 13-grams that match more than 10 unique documents (done further down the pipeline).

 We didn't use lm_dataformat to output as it increases time 4x (slow jsonify) and makes
@@ -37,18 +37,24 @@
 import logging
 from tqdm_multiprocess.logger import setup_logger_tqdm
+
 logger = logging.getLogger(__name__)

 terminate = False
+
+
 def handler(signal_received, frame):
     global terminate
     terminate = True

+
 def yield_pile(start_offsets=None, checkpoint_offset=None):
     directory = "pile"

     if not os.path.exists(directory):
-        print("We expect the pile archives to be in the 'pile' directory, but this was not found.")
+        print(
+            "We expect the pile archives to be in the 'pile' directory, but this was not found."
+        )
         raise Exception("Pile directory not found.")

     files = list(sorted(glob.glob(os.path.join(directory, "*.jsonl.zst*"))))
@@ -63,10 +69,9 @@ def yield_pile(start_offsets=None, checkpoint_offset=None):
             start_file = file_i
             pile_global_offset = start_offset

-
     for file_i, file in enumerate(files):
         if file_i < start_file:
-            logger.info(f"Skipping file {file}") 
+            logger.info(f"Skipping file {file}")
             continue
         logger.info(f"Reading from pile file: {file}")
         reader = Reader()
@@ -74,12 +79,15 @@ def yield_pile(start_offsets=None, checkpoint_offset=None):
             yield (pile_global_offset, document)
             pile_global_offset += 1

+
 # Hash buckets > disk backed files. Supports file position checkpointing and resuming
 # Allows you to write continuously and checkpoint intermittently. If a failure occurs
 # the buckets are simply truncated at your last checkpoint.
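To make the comment above concrete, bucket routing amounts to hashing each ngram to a stable file index; a minimal sketch under assumed names (`bucket_index` is a hypothetical helper, not this module's API):

```
import hashlib

def bucket_index(ngram: str, num_buckets: int) -> int:
    # A stable digest (unlike Python's per-process randomized builtin hash)
    # keeps the ngram -> bucket mapping identical across restarts, which the
    # checkpoint/resume scheme described above depends on.
    digest = hashlib.sha1(ngram.encode("utf-8")).digest()
    return int.from_bytes(digest[:8], "big") % num_buckets
```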
class Buckets: def __init__(self, directory, num_buckets): - self.bucket_files = [os.path.join(directory, f"ngrams_{i}.bkt.txt") for i in range(num_buckets)] + self.bucket_files = [ + os.path.join(directory, f"ngrams_{i}.bkt.txt") for i in range(num_buckets) + ] self.buckets = list(map(TextArchive, self.bucket_files)) self.checkpoint_file = os.path.join(directory, f"bucket_offsets.ckpt") @@ -109,6 +117,7 @@ def close_buckets(self): for bucket in self.buckets: bucket.commit() + def do_ngrams_in_buckets(n_value, working_directory, bucket_count): pile_statistics = json.load(open("pile_statistics.json", "r")) @@ -129,7 +138,7 @@ def do_ngrams_in_buckets(n_value, working_directory, bucket_count): # Checkpoint checkpoint_file = os.path.join(working_directory, f"pile_offset.ckpt") if os.path.exists(checkpoint_file): - checkpoint_offset = pickle.load(open(checkpoint_file,"rb")) + checkpoint_offset = pickle.load(open(checkpoint_file, "rb")) iterate = True else: checkpoint_offset = 0 @@ -145,7 +154,7 @@ def do_ngrams_in_buckets(n_value, working_directory, bucket_count): with tqdm(total=checkpoint_offset, dynamic_ncols=True, unit="docs") as progress: for offset, document in yield_pile(start_offsets, checkpoint_offset): if iterate: - logger.info(f"Iterating to offset {checkpoint_offset} from {offset}") + logger.info(f"Iterating to offset {checkpoint_offset} from {offset}") progress.update(offset) iterate = False @@ -165,7 +174,7 @@ def do_ngrams_in_buckets(n_value, working_directory, bucket_count): progress.update(batch_size) batch_counter = 0 buckets.save_checkpoint() - pickle.dump(offset, open(checkpoint_file,"wb")) + pickle.dump(offset, open(checkpoint_file, "wb")) if terminate: buckets.close_buckets() return @@ -175,17 +184,17 @@ def do_ngrams_in_buckets(n_value, working_directory, bucket_count): buckets.add_data(ngram, f"{ngram} {offset}") batch_counter += 1 - + buckets.close_buckets() Path(done_file).touch() -parser = argparse.ArgumentParser(description='Generate 13 grams from Pile.') +parser = argparse.ArgumentParser(description="Generate 13 grams from Pile.") parser.add_argument("-dir", "--working_directory", default="") parser.add_argument("-n", "--n_value", type=int, default=13) parser.add_argument("-buckets", "--bucket_count", type=int, default=500) -if __name__ == '__main__': +if __name__ == "__main__": version = 1.00 print(f"Running version {version}") @@ -204,4 +213,4 @@ def do_ngrams_in_buckets(n_value, working_directory, bucket_count): info_dict = {"title": "dataset ngrams", "ngram_size": 13} info_dict_path = os.path.join(args.working_directory, "info.json") - json.dump(info_dict, open(info_dict_path, "w")) \ No newline at end of file + json.dump(info_dict, open(info_dict_path, "w")) diff --git a/scripts/clean_training_data/investigate_pile.py b/scripts/clean_training_data/investigate_pile.py index 806c379e9d..dd6bd11d06 100644 --- a/scripts/clean_training_data/investigate_pile.py +++ b/scripts/clean_training_data/investigate_pile.py @@ -7,6 +7,7 @@ from tqdm_multiprocess import TqdmMultiProcessPool + def get_file_stats(file_path, tqdm_func, global_tqdm): reader = Reader() total_documents = 0 @@ -14,13 +15,15 @@ def get_file_stats(file_path, tqdm_func, global_tqdm): update_frequency = 10000 current_file_position = 0 - with tqdm_func(total=os.path.getsize(file_path), dynamic_ncols=True, unit="byte", unit_scale=1) as progress: + with tqdm_func( + total=os.path.getsize(file_path), dynamic_ncols=True, unit="byte", unit_scale=1 + ) as progress: for document in reader.read(file_path, 
get_meta=True): total_size += len(document) total_documents += 1 if total_documents % update_frequency == 0: - new_file_pos = reader.fh.tell() + new_file_pos = reader.fh.tell() bytes_read = new_file_pos - current_file_position current_file_position = new_file_pos progress.update(bytes_read) @@ -28,27 +31,33 @@ def get_file_stats(file_path, tqdm_func, global_tqdm): return (total_documents, total_size) + def get_files(): directory = "pile" files = list(sorted(glob.glob(os.path.join(directory, "*.jsonl.zst*")))) print(files) return files + def get_stats(): files = get_files() total_size_bytes = sum(map(lambda x: os.path.getsize(x), files)) - + pool = TqdmMultiProcessPool(4) - global_tqdm = tqdm.tqdm(total=total_size_bytes, dynamic_ncols=True, unit="byte", unit_scale=1) + global_tqdm = tqdm.tqdm( + total=total_size_bytes, dynamic_ncols=True, unit="byte", unit_scale=1 + ) # Generate minhashes with pool tasks = [(get_file_stats, (file,)) for file in files] - on_done = lambda _ : None - on_error = lambda _ : None + on_done = lambda _: None + on_error = lambda _: None results = pool.map(global_tqdm, tasks, on_error, on_done) - total_documents, total_size = reduce(lambda x, y: (x[0]+y[0],x[1]+y[1]), results) + total_documents, total_size = reduce( + lambda x, y: (x[0] + y[0], x[1] + y[1]), results + ) start_offsets = [] current_offset = 0 @@ -58,7 +67,8 @@ def get_stats(): return (total_documents, total_size, start_offsets) -if __name__ == '__main__': + +if __name__ == "__main__": version = 1.01 print(f"Running version {version}") @@ -67,12 +77,13 @@ def get_stats(): stats = json.load(open(stats_file_path, "r")) else: document_count, total_document_size_chars, start_offsets = get_stats() - stats = {"Data": "Pile statistics", - "Document Count": document_count, - "Total Pile Characters": total_document_size_chars, - "File Start Offsets": start_offsets - } - json.dump(stats, open(stats_file_path, "w"), indent=4) + stats = { + "Data": "Pile statistics", + "Document Count": document_count, + "Total Pile Characters": total_document_size_chars, + "File Start Offsets": start_offsets, + } + json.dump(stats, open(stats_file_path, "w"), indent=4) print(f"document_count: {stats['Document Count']}") print(f"total_chars: {stats['Total Pile Characters']}") diff --git a/scripts/clean_training_data/janitor_util.cpp b/scripts/clean_training_data/janitor_util.cpp index 98fa08bea2..23961aa084 100644 --- a/scripts/clean_training_data/janitor_util.cpp +++ b/scripts/clean_training_data/janitor_util.cpp @@ -1,193 +1,208 @@ #include #include -#include +#include #include -#include #include -#include +#include +#include bool is_whitespace(char ch) noexcept { - // " \t\n\r\x0b\x0c" (python string.whitespace) - return ch == 32 or (9 <= ch and ch <= 13); -// return ch <= 32; // arguably too general, but slightly faster + // " \t\n\r\x0b\x0c" (python string.whitespace) + return ch == 32 or (9 <= ch and ch <= 13); + // return ch <= 32; // arguably too general, but slightly faster } bool is_punctuation(char c) noexcept { - // '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~' ascii values: 33-47, 58-64, 91-96, 123-126 - return (33 <= c and c <= 47) or (58 <= c and c <= 64) or (91 <= c and c <= 96) or (123 <= c and c <= 126); + // '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~' ascii values: 33-47, 58-64, + // 91-96, 123-126 + return (33 <= c and c <= 47) or (58 <= c and c <= 64) or + (91 <= c and c <= 96) or (123 <= c and c <= 126); } -// Takes a string and makes ngrams of length N, splitting grams on whitespace and ignoring ignored characters -// 
Returns a LARGE array of ngrams -std::vector clean_ngram( - std::string const & input, std::string const & ignore, size_t ngram_n -) noexcept { - - size_t num_grams = 0; - std::vector ngram_list; - std::vector gram_lengths; - std::string current_ngram; - - // Max gram length is set to 10 below. - current_ngram.reserve(11*ngram_n); - gram_lengths.reserve(ngram_n); - - bool started_gram = false; - gram_lengths.push_back(0); - - //for (size_t i=0; i 10) { - - // Skip all whitespace - while (++iter != input.end() && is_whitespace(*iter)); - iter--; - - if (started_gram){ - num_grams += 1; - - // Building 1grams is a special case - if (ngram_n == 1){ - ngram_list.push_back(current_ngram); - current_ngram = current_ngram.substr(gram_lengths.front()); - gram_lengths.back() = 0; - - // If there are enough grams to form an ngram, save - } else if (num_grams >= ngram_n){ - // Save the current ngram - ngram_list.push_back(current_ngram); - - // Start the next ngram by dropping the first gram and its space from the ngram - current_ngram = current_ngram.substr(gram_lengths.front() + 1); - current_ngram += ' '; - - // Drop the length of the first gram and prepare to record the length of the new gram - gram_lengths.erase(gram_lengths.begin()); - gram_lengths.push_back(0); - - // Otherwise, continute building - } else { - current_ngram += ' '; - gram_lengths.push_back(0); - } - - started_gram = false; - } +// Takes a string and makes ngrams of length N, splitting grams on whitespace +// and ignoring ignored characters Returns a LARGE array of ngrams +std::vector clean_ngram(std::string const &input, + std::string const &ignore, + size_t ngram_n) noexcept { + + size_t num_grams = 0; + std::vector ngram_list; + std::vector gram_lengths; + std::string current_ngram; + + // Max gram length is set to 10 below. 
+ current_ngram.reserve(11 * ngram_n); + gram_lengths.reserve(ngram_n); + + bool started_gram = false; + gram_lengths.push_back(0); + + // for (size_t i=0; i 10) { + + // Skip all whitespace + while (++iter != input.end() && is_whitespace(*iter)) + ; + iter--; + + if (started_gram) { + num_grams += 1; + + // Building 1grams is a special case + if (ngram_n == 1) { + ngram_list.push_back(current_ngram); + current_ngram = current_ngram.substr(gram_lengths.front()); + gram_lengths.back() = 0; + + // If there are enough grams to form an ngram, save + } else if (num_grams >= ngram_n) { + // Save the current ngram + ngram_list.push_back(current_ngram); + + // Start the next ngram by dropping the first gram and its space from + // the ngram + current_ngram = current_ngram.substr(gram_lengths.front() + 1); + current_ngram += ' '; + + // Drop the length of the first gram and prepare to record the length + // of the new gram + gram_lengths.erase(gram_lengths.begin()); + gram_lengths.push_back(0); + + // Otherwise, continute building + } else { + current_ngram += ' '; + gram_lengths.push_back(0); + } + started_gram = false; + } - // Skip ignored characters - // alternatively, (perhaps marginally) faster: if (is_punctuation(ch)) continue; - } else if (ignore.find(*iter) != std::string::npos) { - continue; - } + // Skip ignored characters + // alternatively, (perhaps marginally) faster: if (is_punctuation(ch)) + // continue; + } else if (ignore.find(*iter) != std::string::npos) { + continue; + } - // If it is a non-ignored character, add it to the ngram and update the last gram's length - else { - current_ngram += tolower(*iter); - gram_lengths.back() += 1; - started_gram = true; - } + // If it is a non-ignored character, add it to the ngram and update the last + // gram's length + else { + current_ngram += tolower(*iter); + gram_lengths.back() += 1; + started_gram = true; } + } - return ngram_list; + return ngram_list; } +// Takes a string and makes ngrams of length N, splitting grams on whitespace +// and ignoring ignored characters Returns a LARGE array of tuples of (ngram, +// start_idx, end_idx) +std::vector> +clean_ngram_with_indices(std::string const &input, std::string const &ignore, + size_t ngram_n) noexcept { + + size_t num_grams = 0; + std::vector> ngram_list; + std::vector gram_lengths; + std::vector gram_start_indices; + std::string current_ngram; + + // Max gram length is set to 10 below. 
+ current_ngram.reserve(11 * ngram_n); + + bool started_gram = false; + gram_lengths.push_back(0); + gram_start_indices.push_back(0); + + for (size_t i = 0; i < input.length(); i++) { + char ch = input[i]; + + // If whitespace, end the current ngram and start the next + if (is_whitespace(ch) || gram_lengths.back() > 10) { + + // Skip all whitespace + while (++i < input.length() && is_whitespace(input[i])) + ; + i--; + + if (started_gram) { + num_grams += 1; + + // Building 1grams is a special case + if (ngram_n == 1) { + ngram_list.push_back( + std::make_tuple(current_ngram, gram_start_indices.front(), i)); + current_ngram = current_ngram.substr(gram_lengths.front()); + gram_lengths.back() = 0; + gram_start_indices.back() = i + 1; + + // If there are enough grams to form an ngram, save + } else if (num_grams >= ngram_n) { + + // Save the current ngram + ngram_list.push_back( + std::make_tuple(current_ngram, gram_start_indices.front(), i)); + + // Start the next ngram by dropping the first gram and its space from + // the ngram + current_ngram = current_ngram.substr(gram_lengths.front() + 1); + current_ngram += ' '; + + // Drop the length of the first gram and prepare to record the length + // of the new gram + gram_lengths.erase(gram_lengths.begin()); + gram_lengths.push_back(0); + + gram_start_indices.erase(gram_start_indices.begin()); + gram_start_indices.push_back(i + 1); + + // Otherwise, continute building + } else { + current_ngram += ' '; + gram_lengths.push_back(0); + gram_start_indices.push_back(i + 1); + } -// Takes a string and makes ngrams of length N, splitting grams on whitespace and ignoring ignored characters -// Returns a LARGE array of tuples of (ngram, start_idx, end_idx) -std::vector > clean_ngram_with_indices( - std::string const & input, std::string const & ignore, size_t ngram_n -) noexcept { - - size_t num_grams = 0; - std::vector > ngram_list; - std::vector gram_lengths; - std::vector gram_start_indices; - std::string current_ngram; - - // Max gram length is set to 10 below. 
- current_ngram.reserve(11*ngram_n); - - bool started_gram = false; - gram_lengths.push_back(0); - gram_start_indices.push_back(0); - - for (size_t i=0; i 10) { - - // Skip all whitespace - while (++i < input.length() && is_whitespace(input[i])); - i--; - - if (started_gram){ - num_grams += 1; - - // Building 1grams is a special case - if (ngram_n == 1){ - ngram_list.push_back(std::make_tuple(current_ngram, gram_start_indices.front(), i)); - current_ngram = current_ngram.substr(gram_lengths.front()); - gram_lengths.back() = 0; - gram_start_indices.back() = i+1; - - // If there are enough grams to form an ngram, save - } else if (num_grams >= ngram_n){ - - // Save the current ngram - ngram_list.push_back( - std::make_tuple(current_ngram, gram_start_indices.front(), i) - ); - - // Start the next ngram by dropping the first gram and its space from the ngram - current_ngram = current_ngram.substr(gram_lengths.front() + 1); - current_ngram += ' '; - - // Drop the length of the first gram and prepare to record the length of the new gram - gram_lengths.erase(gram_lengths.begin()); - gram_lengths.push_back(0); - - gram_start_indices.erase(gram_start_indices.begin()); - gram_start_indices.push_back(i+1); - - // Otherwise, continute building - } else { - current_ngram += ' '; - gram_lengths.push_back(0); - gram_start_indices.push_back(i+1); - } - - started_gram = false; - } + started_gram = false; + } - // Skip ignored characters - } else if (ignore.find(*iter) != std::string::npos) { - continue; + // Skip ignored characters + } else if (ignore.find(*iter) != std::string::npos) { + continue; - // If it is a non-ignored character, add it to the ngram and update the last gram's length - } else { - current_ngram += tolower(ch); - gram_lengths.back() += 1; - started_gram = true; - } + // If it is a non-ignored character, add it to the ngram and update the + // last gram's length + } else { + current_ngram += tolower(ch); + gram_lengths.back() += 1; + started_gram = true; } + } - return ngram_list; + return ngram_list; } - PYBIND11_MODULE(janitor_util, m) { - m.doc() = "pybind11 example plugin"; // optional module docstring -// m.def("add", &add, "A function which adds two numbers"); // example function - m.def("clean_ngram", &clean_ngram, "Create ngrams of words, ignoring some characters"); - m.def("clean_ngram_with_indices", &clean_ngram_with_indices, "Create ngrams of words with indices, ignoring some characters"); + m.doc() = "pybind11 example plugin"; // optional module docstring + // m.def("add", &add, "A function which adds two numbers"); // example + // function + m.def("clean_ngram", &clean_ngram, + "Create ngrams of words, ignoring some characters"); + m.def("clean_ngram_with_indices", &clean_ngram_with_indices, + "Create ngrams of words with indices, ignoring some characters"); } // Example compile -// c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) -// If python and gcc aren't linked, append to the above: -undefined dynamic_lookup \ No newline at end of file +// c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) +// janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) If +// python and gcc aren't linked, append to the above: -undefined +// dynamic_lookup diff --git a/scripts/clean_training_data/process_sorted_buckets.py b/scripts/clean_training_data/process_sorted_buckets.py index 63f35cad77..d4eb3fa202 100644 --- 
a/scripts/clean_training_data/process_sorted_buckets.py
+++ b/scripts/clean_training_data/process_sorted_buckets.py
@@ -27,25 +27,32 @@
 import logging
 from tqdm_multiprocess.logger import setup_logger_tqdm
+
 logger = logging.getLogger(__name__)

 # Multiprocessed
-def process_bucket(bucket_file_path, processed_directory, move_dir, tqdm_func, global_tqdm):
+def process_bucket(
+    bucket_file_path, processed_directory, move_dir, tqdm_func, global_tqdm
+):
     bucket_id = re.sub(r"\D", "", os.path.basename(bucket_file_path))
-    done_file = os.path.join(processed_directory, f"ngram_bucket_processing_{bucket_id}.done")
+    done_file = os.path.join(
+        processed_directory, f"ngram_bucket_processing_{bucket_id}.done"
+    )
     if os.path.exists(done_file):
         logger.info(f"bucket {bucket_id} already processed, skipping")
         return

     # For managing tqdm
     file_size = os.path.getsize(bucket_file_path)
-    bucket_progress = tqdm_func(total=file_size, dynamic_ncols=True, unit="byte", unit_scale=1)
+    bucket_progress = tqdm_func(
+        total=file_size, dynamic_ncols=True, unit="byte", unit_scale=1
+    )
     current_file_position = 0
-    update_frequency = 100 * 1000000 # 100mb
+    update_frequency = 100 * 1000000  # 100mb
     update_counter = 0

-    # Iterate through and output ngrams which occur in more then 10 documents 
+    # Iterate through and output ngrams which occur in more than 10 documents
     bucket = TextReader(bucket_file_path)

     output_file_path = bucket_file_path + ".processed"
@@ -59,7 +66,9 @@
         # Write ngram if more than 10 unique document occurrences
         if ngram != current_ngram:
             if len(current_ngram_document_ids) > 10:
-                output_archive.add_data(f"{current_ngram} {len(current_ngram_document_ids)}")
+                output_archive.add_data(
+                    f"{current_ngram} {len(current_ngram_document_ids)}"
+                )
                 current_ngram = ngram
                 current_ngram_document_ids = set()
@@ -84,28 +93,33 @@
     global_tqdm.update()

+
 def process_sorted_buckets(working_directory, move_dir, process_count):
     bucket_file_paths = glob.glob(os.path.join(working_directory, f"*.bkt.txt.sorted"))
     processed_directory = os.path.join(working_directory, "processed")
     os.makedirs(processed_directory, exist_ok=True)

-    pool = TqdmMultiProcessPool(process_count) 
-    tasks = [(process_bucket, (bucket_file, processed_directory, move_dir)) for bucket_file in bucket_file_paths]
+    pool = TqdmMultiProcessPool(process_count)
+    tasks = [
+        (process_bucket, (bucket_file, processed_directory, move_dir))
+        for bucket_file in bucket_file_paths
+    ]

     global_tqdm = tqdm(total=len(bucket_file_paths), dynamic_ncols=True, unit="bucket")

-    on_done = lambda _ : None
-    on_error = lambda _ : None
+    on_done = lambda _: None
+    on_error = lambda _: None
     _ = pool.map(global_tqdm, tasks, on_error, on_done)

-parser = argparse.ArgumentParser(description='Process 13 grams from sorted buckets.')
+
+parser = argparse.ArgumentParser(description="Process 13 grams from sorted buckets.")
 parser.add_argument("-dir", "--working_directory", default="")
 parser.add_argument("-move", "--move_dir", default="")
 parser.add_argument("-procs", "--process_count", type=int, default=4)

-if __name__ == '__main__':
+if __name__ == "__main__":
     logfile_path = "process13grams.log"
     setup_logger_tqdm(logfile_path)

     args = parser.parse_args()
-    process_sorted_buckets(args.working_directory, args.move_dir, args.process_count)
\ No newline at end of file
+    process_sorted_buckets(args.working_directory, args.move_dir,
args.process_count) diff --git a/scripts/clean_training_data/sort_13_gram_buckets.py b/scripts/clean_training_data/sort_13_gram_buckets.py index 8ac67fec90..07a2eedcd0 100644 --- a/scripts/clean_training_data/sort_13_gram_buckets.py +++ b/scripts/clean_training_data/sort_13_gram_buckets.py @@ -19,20 +19,24 @@ import logging from tqdm_multiprocess.logger import setup_logger_tqdm + logger = logging.getLogger(__name__) terminate = False + + def handler(signal_received, frame): global terminate terminate = True + def sort_13_gram_buckets(working_directory): - bucket_file_paths = glob.glob(os.path.join(working_directory, f"*.bkt.txt")) + bucket_file_paths = glob.glob(os.path.join(working_directory, f"*.bkt.txt")) for bucket_file_path in tqdm(bucket_file_paths, dynamic_ncols=True): sorted_file_path = bucket_file_path + ".sorted" command = f"sort {bucket_file_path} > {sorted_file_path}" - logger.info(command) + logger.info(command) subprocess.call(command, shell=True) if terminate: @@ -40,10 +44,11 @@ def sort_13_gram_buckets(working_directory): os.remove(bucket_file_path) -parser = argparse.ArgumentParser(description='sort 13gram buckets') + +parser = argparse.ArgumentParser(description="sort 13gram buckets") parser.add_argument("-dir", "--working_directory", default="") -if __name__ == '__main__': +if __name__ == "__main__": version = 1.00 print(f"Running version {version}") diff --git a/scripts/cost_estimate.py b/scripts/cost_estimate.py index d2e60bfa0d..30528982b8 100644 --- a/scripts/cost_estimate.py +++ b/scripts/cost_estimate.py @@ -7,7 +7,7 @@ class DryrunLM(LM): def __init__(self): self.tokencost = 0 - self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2') + self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2") self.tokenizer.pad_token = "<|endoftext|>" @classmethod @@ -16,16 +16,16 @@ def create_from_arg_string(cls, arg_string): def loglikelihood(self, requests): res = [] - + for ctx, cont in requests: res.append((-random.random(), False)) self.tokencost += len(self.tokenizer.tokenize(ctx + cont)) return res - + def greedy_until(self, requests): res = [] - + for ctx, until in requests: res.append("lol") @@ -33,11 +33,11 @@ def greedy_until(self, requests): self.tokencost += len(self.tokenizer.tokenize(ctx)) + 256 return res - + def loglikelihood_rolling(self, requests): res = [] - - for s, in requests: + + for (s,) in requests: # assume worst case: extra full context self.tokencost += len(self.tokenizer.tokenize(s)) + 2048 @@ -46,7 +46,7 @@ def loglikelihood_rolling(self, requests): def main(): lm = DryrunLM() - + task_list = "arc_challenge,arc_easy,boolq,cola,copa,headqa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,record,rte,sciq,sst,triviaqa,webqs,wic,wikitext,winogrande,wnli,wsc" values = [] for taskname in task_list.split(","): @@ -57,11 +57,20 @@ def main(): num_fewshot=0, limit=None, bootstrap_iters=10, - description_dict=None + description_dict=None, ) print(taskname, lm.tokencost) - values.append([taskname, lm.tokencost, lm.tokencost / 1000 * 0.0008, lm.tokencost / 1000 * 0.0012, lm.tokencost / 1000 * 0.006, lm.tokencost / 1000 * 0.06]) + values.append( + [ + taskname, + lm.tokencost, + lm.tokencost / 1000 * 0.0008, + lm.tokencost / 1000 * 0.0012, + lm.tokencost / 1000 * 0.006, + lm.tokencost / 1000 * 0.06, + ] + ) from pytablewriter import MarkdownTableWriter writer = MarkdownTableWriter() @@ -69,10 +78,21 @@ def main(): values.sort(key=lambda x: -x[1]) totcost = sum([x[1] for x in 
values]) - values.append(["**Total**", totcost, totcost / 1000 * 0.0008, totcost / 1000 * 0.0012, totcost / 1000 * 0.006, totcost / 1000 * 0.06]) + values.append( + [ + "**Total**", + totcost, + totcost / 1000 * 0.0008, + totcost / 1000 * 0.0012, + totcost / 1000 * 0.006, + totcost / 1000 * 0.06, + ] + ) writer.value_matrix = values print(writer.dumps()) + + if __name__ == "__main__": main() diff --git a/scripts/get_prompts.py b/scripts/get_prompts.py index 56a9ff79f4..06e2f89c13 100644 --- a/scripts/get_prompts.py +++ b/scripts/get_prompts.py @@ -3,16 +3,21 @@ ct = 3 -for tname, Task in tasks.TASK_REGISTRY.items():#[('record', tasks.superglue.ReCoRD)]:# +for ( + tname, + Task, +) in tasks.TASK_REGISTRY.items(): # [('record', tasks.superglue.ReCoRD)]:# task = Task() - print('#', tname) - docs = islice(task.validation_docs() if task.has_validation_docs() else task.test_docs(), ct) + print("#", tname) + docs = islice( + task.validation_docs() if task.has_validation_docs() else task.test_docs(), ct + ) print() for i in range(ct): print() doc = next(docs) print("**Context**:", "\n```\n" + task.doc_to_text(doc) + "\n```\n") print() - print('**Target**:', "\n```\n" + task.doc_to_target(doc) + "\n```\n") + print("**Target**:", "\n```\n" + task.doc_to_target(doc) + "\n```\n") print() diff --git a/scripts/make_gpt2_test_cases.py b/scripts/make_gpt2_test_cases.py index f1108b2457..361bc2ecd6 100644 --- a/scripts/make_gpt2_test_cases.py +++ b/scripts/make_gpt2_test_cases.py @@ -10,7 +10,7 @@ data = [ "A multilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)", "The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons (with threshold activation); see § Terminology", - "Multilayer perceptrons are sometimes colloquially referred to as \"vanilla\" neural networks, especially when they have a single hidden layer.[1]", + 'Multilayer perceptrons are sometimes colloquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]', "An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear activation function.", "MLP utilizes a supervised learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]", "Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. 
", @@ -20,22 +20,28 @@ ] -model = transformers.GPT2LMHeadModel.from_pretrained('gpt2') -tok = transformers.GPT2Tokenizer.from_pretrained('gpt2') +model = transformers.GPT2LMHeadModel.from_pretrained("gpt2") +tok = transformers.GPT2Tokenizer.from_pretrained("gpt2") tgs = [] for dat in data: random.seed(dat) - #print(model(tok.encode(dat, return_tensors="pt"))[0][0]) + # print(model(tok.encode(dat, return_tensors="pt"))[0][0]) toks = tok.encode(dat, return_tensors="pt") - ind = random.randrange(len(toks[0])-1) + ind = random.randrange(len(toks[0]) - 1) logits = F.log_softmax(model(toks)[0], dim=-1)[:, :-1] # [batch, seq, vocab] res = torch.gather(logits, 2, toks[:, 1:].unsqueeze(-1)).squeeze(-1)[0] - tgs.append( float(res[ind:].sum())) - print(r'("""' + tok.decode(toks[0, :ind+1]) + r'""", """' + tok.decode(toks[0, ind+1:]) + r'"""), ') + tgs.append(float(res[ind:].sum())) + print( + r'("""' + + tok.decode(toks[0, : ind + 1]) + + r'""", """' + + tok.decode(toks[0, ind + 1 :]) + + r'"""), ' + ) -print(tgs) \ No newline at end of file +print(tgs) diff --git a/scripts/make_table_tasks.py b/scripts/make_table_tasks.py index e4f239d258..5a9de2b193 100644 --- a/scripts/make_table_tasks.py +++ b/scripts/make_table_tasks.py @@ -2,23 +2,32 @@ from pytablewriter import MarkdownTableWriter writer = MarkdownTableWriter() -writer.headers = ["Task Name", "Train", "Val", "Test","Val/Test Docs", "Metrics"] +writer.headers = ["Task Name", "Train", "Val", "Test", "Val/Test Docs", "Metrics"] values = [] + def chk(tf): if tf: - return '✓' + return "✓" else: - return ' ' + return " " + for tname, Task in tasks.TASK_REGISTRY.items(): task = Task() - v = [tname,chk(task.has_training_docs()),chk(task.has_validation_docs()),chk(task.has_test_docs()), len(list(task.test_docs() if task.has_test_docs() else task.validation_docs())),', '.join(task.aggregation().keys())] + v = [ + tname, + chk(task.has_training_docs()), + chk(task.has_validation_docs()), + chk(task.has_test_docs()), + len(list(task.test_docs() if task.has_test_docs() else task.validation_docs())), + ", ".join(task.aggregation().keys()), + ] print(v) values.append(v) writer.value_matrix = values -print(writer.dumps()) \ No newline at end of file +print(writer.dumps()) diff --git a/scripts/write_out.py b/scripts/write_out.py index 2039d3934f..bfcfc0b607 100644 --- a/scripts/write_out.py +++ b/scripts/write_out.py @@ -11,14 +11,14 @@ def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument('--output_base_path', required=True) - parser.add_argument('--tasks', default="all_tasks") - parser.add_argument('--provide_description', action="store_true") - parser.add_argument('--sets', type=str, default="val") # example: val,test - parser.add_argument('--num_fewshot', type=int, default=1) - parser.add_argument('--seed', type=int, default=42) - parser.add_argument('--num_examples', type=int, default=1) - parser.add_argument('--description_dict_path', default=None) + parser.add_argument("--output_base_path", required=True) + parser.add_argument("--tasks", default="all_tasks") + parser.add_argument("--provide_description", action="store_true") + parser.add_argument("--sets", type=str, default="val") # example: val,test + parser.add_argument("--num_fewshot", type=int, default=1) + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--num_examples", type=int, default=1) + parser.add_argument("--description_dict_path", default=None) return parser.parse_args() @@ -34,7 +34,7 @@ def main(): description_dict = {} if 
args.description_dict_path: - with open(args.description_dict_path, 'r') as f: + with open(args.description_dict_path, "r") as f: description_dict = json.load(f) os.makedirs(args.output_base_path, exist_ok=True) @@ -45,26 +45,34 @@ def main(): iters = [] for set in args.sets.split(","): - if set == 'train' and task.has_training_docs(): + if set == "train" and task.has_training_docs(): docs = task.training_docs() - if set == 'val' and task.has_validation_docs(): + if set == "val" and task.has_validation_docs(): docs = task.validation_docs() - if set == 'test' and task.has_test_docs(): + if set == "test" and task.has_test_docs(): docs = task.test_docs() iters.append(docs) docs = join_iters(iters) - description = description_dict[task_name] if description_dict and task_name in description_dict else "" + description = ( + description_dict[task_name] + if description_dict and task_name in description_dict + else "" + ) with open(os.path.join(args.output_base_path, task_name), "w") as f: - for i, doc in zip(range(args.num_examples), docs) if args.num_examples > 0 else enumerate(docs): + for i, doc in ( + zip(range(args.num_examples), docs) + if args.num_examples > 0 + else enumerate(docs) + ): f.write(EXAMPLE_DIVIDER.format(i=i)) ctx = task.fewshot_context( doc=doc, num_fewshot=args.num_fewshot, rnd=rnd, - description=description + description=description, ) f.write(ctx + "\n") diff --git a/setup.py b/setup.py index b6430d8d81..f932589028 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ], - python_requires='>=3.6', + python_requires=">=3.6", install_requires=[ "datasets>=2.0.0", "click>=7.1", @@ -40,10 +40,10 @@ "openai==0.6.4", "jieba==0.42.1", "nagisa==0.2.7", - "bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt" + "bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt", ], dependency_links=[ "https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt", ], - extras_require={'dev': [ 'pytest', 'black' ]} + extras_require={"dev": ["pytest", "black"]}, ) diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py index 363384a05c..ee80ccfdff 100644 --- a/tests/test_evaluator.py +++ b/tests/test_evaluator.py @@ -10,23 +10,24 @@ # TODO: more fine grained unit tests rather than this big honking integration # test once we break evaluator into smaller, more manageable pieces + @pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items()) def test_evaluator(taskname, task_class): task_dict = tasks.get_task_dict([taskname]) os.system("rm test_cache.db") - lm = base.CachingLM(models.get_model('dummy')(), "test_cache.db") + lm = base.CachingLM(models.get_model("dummy")(), "test_cache.db") def ll_fn(reqs): for ctx, cont in reqs: if len(ctx) == 0: continue # space convention - assert ctx[-1] != ' ' - assert cont[0] == ' ' or ctx[-1] == '\n' - + assert ctx[-1] != " " + assert cont[0] == " " or ctx[-1] == "\n" + res = [] - + random.seed(42) for _ in reqs: res.append((-random.random(), False)) @@ -34,7 +35,7 @@ def ll_fn(reqs): return res def ll_perp_fn(reqs): - for string, in reqs: + for (string,) in reqs: assert isinstance(string, str) res = [] @@ -49,20 +50,20 @@ def ll_perp_fn(reqs): limit = 10 e1 = evaluator.evaluate( - lm=lm, - task_dict=task_dict, - num_fewshot=0, - limit=limit, - bootstrap_iters=10, - description_dict=None + lm=lm, + 
task_dict=task_dict, + num_fewshot=0, + limit=limit, + bootstrap_iters=10, + description_dict=None, ) e2 = evaluator.evaluate( - lm=lm, - task_dict=task_dict, - num_fewshot=0, - limit=limit, - bootstrap_iters=10, - description_dict=None + lm=lm, + task_dict=task_dict, + num_fewshot=0, + limit=limit, + bootstrap_iters=10, + description_dict=None, ) # check that caching is working diff --git a/tests/test_generate_13_grams.py b/tests/test_generate_13_grams.py index 8b54d70fe8..5c6757e67a 100644 --- a/tests/test_generate_13_grams.py +++ b/tests/test_generate_13_grams.py @@ -8,17 +8,19 @@ from lm_eval.decontamination.archiver import Archive, TextReader import logging + logger = logging.getLogger(__name__) + def test_generate_13_grams_1(caplog): - data = """A goose (plural geese) is a bird of any of several waterfowl species in the family Anatidae. - This group comprises the genera Anser (the grey geese and white geese) and Branta (the black geese). - Some other birds, mostly related to the shelducks, have "goose" as part of their names. - More distantly related members of the family Anatidae are swans, most of which are larger - than true geese, and ducks, which are smaller. The term "goose" may refer to either a male - or female bird, but when paired with "gander", refers specifically to a female one (the latter referring - to a male). Young birds before fledging are called goslings. The collective noun for a group of - geese on the ground is a gaggle; when in flight, they are called a skein, a team, or a wedge; when + data = """A goose (plural geese) is a bird of any of several waterfowl species in the family Anatidae. + This group comprises the genera Anser (the grey geese and white geese) and Branta (the black geese). + Some other birds, mostly related to the shelducks, have "goose" as part of their names. + More distantly related members of the family Anatidae are swans, most of which are larger + than true geese, and ducks, which are smaller. The term "goose" may refer to either a male + or female bird, but when paired with "gander", refers specifically to a female one (the latter referring + to a male). Young birds before fledging are called goslings. 
The collective noun for a group of + geese on the ground is a gaggle; when in flight, they are called a skein, a team, or a wedge; when flying close together, they are called a plump.""" data = data + data @@ -26,7 +28,7 @@ def test_generate_13_grams_1(caplog): # Simple Generation print("simple generation") n = 13 - janitor = Janitor() + janitor = Janitor() ngrams = word_ngrams(janitor.normalize_string(data), n) comparison = list(ngrams) comparison_counter = Counter(comparison) @@ -42,7 +44,7 @@ def test_generate_13_grams_1(caplog): pass os.makedirs(test_working_directory) - assert(not os.path.exists("pile")) + assert not os.path.exists("pile") os.makedirs("pile") archive = Archive(os.path.join("pile", "test.jsonl.zst")) archive.add_data(data) @@ -54,20 +56,22 @@ def test_generate_13_grams_1(caplog): # Rebuild from buckets print("rebuild") rebuilt_ngrams = [] - bucket_file_paths = glob.glob(os.path.join(test_working_directory, "output", f"*.bkt.txt")) + bucket_file_paths = glob.glob( + os.path.join(test_working_directory, "output", f"*.bkt.txt") + ) for bucket_file_path in bucket_file_paths: reader = TextReader(bucket_file_path) - + for line in reader.read(): [ngram, document_id] = line.rsplit(" ", 1) rebuilt_ngrams.append(ngram) # Compare - print("compare") + print("compare") result_counter = Counter(rebuilt_ngrams) # print(len(result_counter)) # print(len(comparison_counter)) - assert(len(result_counter) == len(comparison_counter)) + assert len(result_counter) == len(comparison_counter) # print(result_counter) - # print(comparison_counter) - assert(comparison_counter == result_counter) \ No newline at end of file + # print(comparison_counter) + assert comparison_counter == result_counter diff --git a/tests/test_gpt3.py b/tests/test_gpt3.py index aaa27638a2..52156e8ad9 100644 --- a/tests/test_gpt3.py +++ b/tests/test_gpt3.py @@ -12,40 +12,78 @@ def mock_completion(**kwargs): # Mock completion function # Loads from a cached+pickled response if it exists, otherwise it will actually try to ping os.makedirs("tests/testdata", exist_ok=True) - hash = hashlib.sha256(json.dumps(kwargs, sort_keys=True).encode('utf-8')).hexdigest() + hash = hashlib.sha256( + json.dumps(kwargs, sort_keys=True).encode("utf-8") + ).hexdigest() fname = f"tests/testdata/gpt3_test_{hash}.pkl" if os.path.exists(fname): - with open(fname, 'rb') as fh: + with open(fname, "rb") as fh: return pickle.load(fh) ret = openai.Completion.create(**kwargs) ret.api_key = "" - with open(fname, 'wb') as fh: + with open(fname, "wb") as fh: pickle.dump(ret, fh) return ret @mock.patch("lm_eval.models.gpt3.oa_completion", new=mock_completion) def test_gpt3(): - if "OPENAI_API_SECRET_KEY" not in os.environ: os.environ["OPENAI_API_SECRET_KEY"] = "" - gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada") - (ll_dog, ig_dog), (ll_cat, ig_cat), (_, ll_max_0), (_, ll_max_1), (_, ll_max_2), *vals = gpt3.loglikelihood([ - ('The quick brown fox jumps over the lazy', ' dog'), - ('The quick brown fox jumps over the lazy', ' cat'), - ('The quick brown fox jumps over the lazy', ', lazy dog'), - ('The quick brown fox jumps over the lazy', ', lazy fox'), - ('The quick brown fox jumps over the lazy', ', lazy fox and they both fall to the ground'), - - ("""A mult""", """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)"""), - ("""The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons""", """ (with threshold 
activation); see § Terminology"""), - ("""Multilayer perceptrons are sometimes coll""", """oquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]"""), - ("""An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear""", """ activation function."""), - ("""MLP utilizes a supervised""", """ learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]"""), - ("""Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic""", """ in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. """), - ("""Specifically, we train GPT-3, an autoregressive language model with 175""", """ billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. 
We discuss broader societal impacts of this finding and of GPT-3 in general."""), - ("""A mult""", """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)"""), - ("""Hello""", """ World"""), - ]) + if "OPENAI_API_SECRET_KEY" not in os.environ: + os.environ["OPENAI_API_SECRET_KEY"] = "" + gpt3 = models.get_model("gpt3").create_from_arg_string("engine=ada") + ( + (ll_dog, ig_dog), + (ll_cat, ig_cat), + (_, ll_max_0), + (_, ll_max_1), + (_, ll_max_2), + *vals, + ) = gpt3.loglikelihood( + [ + ("The quick brown fox jumps over the lazy", " dog"), + ("The quick brown fox jumps over the lazy", " cat"), + ("The quick brown fox jumps over the lazy", ", lazy dog"), + ("The quick brown fox jumps over the lazy", ", lazy fox"), + ( + "The quick brown fox jumps over the lazy", + ", lazy fox and they both fall to the ground", + ), + ( + """A mult""", + """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)""", + ), + ( + """The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons""", + """ (with threshold activation); see § Terminology""", + ), + ( + """Multilayer perceptrons are sometimes coll""", + """oquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]""", + ), + ( + """An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear""", + """ activation function.""", + ), + ( + """MLP utilizes a supervised""", + """ learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]""", + ), + ( + """Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic""", + """ in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. """, + ), + ( + """Specifically, we train GPT-3, an autoregressive language model with 175""", + """ billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. 
Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.""", + ), + ( + """A mult""", + """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)""", + ), + ("""Hello""", """ World"""), + ] + ) assert ll_dog > ll_cat assert not ig_cat @@ -56,19 +94,26 @@ def test_gpt3(): assert not ll_max_2 # test empty context - gpt3.loglikelihood([('', 'test')]) + gpt3.loglikelihood([("", "test")]) - gen, = gpt3.greedy_until([ - ('The quick brown fox jumps over the lazy', ['.', '\n']) - ]) + (gen,) = gpt3.greedy_until( + [("The quick brown fox jumps over the lazy", [".", "\n"])] + ) - assert gen == ' dog' + assert gen == " dog" print([x[0] for x in vals]) targets = [ - -34.848301606999996, -47.148329679999996, -45.44380149599999, -5.285246016, -133.97821690686004, - -321.2616693239001, -658.0299524401041, -34.848301606999996, -7.525115, + -34.848301606999996, + -47.148329679999996, + -45.44380149599999, + -5.285246016, + -133.97821690686004, + -321.2616693239001, + -658.0299524401041, + -34.848301606999996, + -7.525115, ] for (pred, _), tgt in zip(vals, targets): @@ -77,17 +122,20 @@ def test_gpt3(): @mock.patch("lm_eval.models.gpt3.oa_completion", new=mock_completion) def test_gpt3_perplexity(): - if "OPENAI_API_SECRET_KEY" not in os.environ: os.environ["OPENAI_API_SECRET_KEY"] = "" - gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada") + if "OPENAI_API_SECRET_KEY" not in os.environ: + os.environ["OPENAI_API_SECRET_KEY"] = "" + gpt3 = models.get_model("gpt3").create_from_arg_string("engine=ada") test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss." perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0] tgt = -84.38819608 assert perplexity == pytest.approx(tgt, rel=1e-3) # Hack: modify gpt3 to have shorter context length to induce rolling windows - with mock.patch.object(models.gpt3.GPT3LM, 'max_length', new_callable=mock.PropertyMock) as mock_max_length: + with mock.patch.object( + models.gpt3.GPT3LM, "max_length", new_callable=mock.PropertyMock + ) as mock_max_length: mock_max_length.return_value = 5 - gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada") + gpt3 = models.get_model("gpt3").create_from_arg_string("engine=ada") perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0] tgt = -101.81967209999999 assert perplexity == pytest.approx(tgt, rel=1e-3) diff --git a/tests/test_janitor.py b/tests/test_janitor.py index c80959f62b..784198825b 100644 --- a/tests/test_janitor.py +++ b/tests/test_janitor.py @@ -3,6 +3,7 @@ from lm_eval.decontamination.janitor import * + def simple_ngram(sequence, n): ngrams = list() ngram = [] @@ -16,8 +17,10 @@ def simple_ngram(sequence, n): def test_form_ngrams(): - sequence = "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some" \ - " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much." + sequence = ( + "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some" + " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much." 
+ ) n_values = [1, 2, 3, 5, 13] for n in n_values: @@ -26,9 +29,12 @@ def test_form_ngrams(): assert len(comparison) == len(result_to_test) assert comparison == result_to_test + def test_word_ngrams(): - sequence = "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some" \ - " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much." + sequence = ( + "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some" + " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much." + ) words = sequence.split() @@ -40,9 +46,12 @@ def test_word_ngrams(): assert len(comparison) == len(result_to_test) assert result_to_test == comparison + def test_split_indices(): - sequence = "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some" \ - " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much." + sequence = ( + "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some" + " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much." + ) comparison = [] current_word = "" @@ -55,17 +64,22 @@ def test_split_indices(): current_word = "" if current_word: - comparison.append((current_word, (len(sequence) - len(current_word), len(sequence) - 1))) - current_word = "" + comparison.append( + (current_word, (len(sequence) - len(current_word), len(sequence) - 1)) + ) + current_word = "" result_to_test = list(split_indices(sequence)) assert len(comparison) == len(result_to_test) - assert(comparison == result_to_test) + assert comparison == result_to_test + def test_word_ngrams_indices(): - sequence = "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some" \ - " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much." + sequence = ( + "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some" + " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much." + ) n_values = [1, 2, 3, 5, 13] @@ -76,55 +90,62 @@ def test_word_ngrams_indices(): for ngram in ngrams: while True: start = sequence.find(ngram, tracker[ngram]) - assert start != -1 # testing the test + assert start != -1 # testing the test end = start + len(ngram) - 1 tracker[ngram] = end + 1 # ignore partial word matches - if (start != 0 and sequence[start - 1] != " ") or \ - (end != len(sequence) - 1 and sequence[end + 1] != " "): + if (start != 0 and sequence[start - 1] != " ") or ( + end != len(sequence) - 1 and sequence[end + 1] != " " + ): pass else: break comparison.append((ngram, (start, end))) - result_to_test = list(word_ngrams_indices(sequence, n)) + result_to_test = list(word_ngrams_indices(sequence, n)) assert len(result_to_test) == len(comparison) assert result_to_test == comparison + # Assumptions from GPT3 Paper: # the 200 characters to remove include punctuation and is actually a half-window # All tests below initially test without any registered contaminants, expecting the same sequence back. 
def test_janitor1(): - # First test using a 1gram and expected the first block before the filth to have some remaining + # First test using a 1gram and expected the first block before the filth to have some remaining # characters, but the second block should be completely removed. - sequence = "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "FILTH. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " - + sequence = ( + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "FILTH. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + ) filth = "filth" - expected_result = "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing " + expected_result = ( + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. 
" + "This is a @line #containing " + ) - janitor = Janitor(ngram_n=1, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200) + janitor = Janitor( + ngram_n=1, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200 + ) result = janitor.clean_python(sequence) result = "".join(result) assert result == sequence @@ -133,42 +154,47 @@ def test_janitor1(): assert janitor.dirt_ngrams == {filth} result = janitor.clean_python(sequence) - result = "".join(result) + result = "".join(result) assert result == expected_result + def test_janitor2(): - # Second test using a 1gram and expected the first block before the filth to have some remaining + # Second test using a 1gram and expected the first block before the filth to have some remaining # characters, and the second block is longer then 200 characters so should also have some remaining. - sequence = "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "FILTH. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " - - + sequence = ( + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "FILTH. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + ) filth = "filth" - expected_result = "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing " \ - " characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. 
" - - janitor = Janitor(ngram_n=1, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200) + expected_result = ( + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing " + " characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + ) + + janitor = Janitor( + ngram_n=1, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200 + ) result = janitor.clean_python(sequence) result = "".join(result) assert result == sequence @@ -180,37 +206,43 @@ def test_janitor2(): result = "".join(result) assert result == expected_result + def test_janitor3(): # Same test as above but with a 6gram. - sequence = "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "FILTH. lots of dirty filtHy FIlTh " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " - + sequence = ( + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "FILTH. lots of dirty filtHy FIlTh " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + ) filth = "filth lots of dirty filthy filth" - expected_result = "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing " \ - " characters, 76 to be exact. 
" \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " - - janitor = Janitor(ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200) + expected_result = ( + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing " + " characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + ) + + janitor = Janitor( + ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200 + ) result = janitor.clean_python(sequence) result = "".join(result) assert result == sequence @@ -222,45 +254,51 @@ def test_janitor3(): result = "".join(result) assert result == expected_result + def test_janitor4(): # This test adds another block to that from the previous. The middle block should be entirely # removed as the 200 characters are removed from each side. - sequence = "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "FILTH. lots of dirty filtHy FIlTh " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "FILTH. lots of dirty filtHy FIlTh " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " - + sequence = ( + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "FILTH. 
lots of dirty filtHy FIlTh " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "FILTH. lots of dirty filtHy FIlTh " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + ) filth = "filth lots of dirty filthy filth" - expected_result = "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing " \ - " characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " - - janitor = Janitor(ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200) + expected_result = ( + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing " + " characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + ) + + janitor = Janitor( + ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200 + ) result = janitor.clean_python(sequence) result = "".join(result) assert result == sequence @@ -272,49 +310,55 @@ def test_janitor4(): result = "".join(result) assert result == expected_result + def test_janitor5(): # Same as above but using multiple different filth 6grams. - sequence = "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "FILTH. lots of dirty filtHy FIlTh " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. 
" \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "FILTH. lots of filtHy dirty FIlTh " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " - - - filths = ["filth lots of dirty filthy filth", "filth lots of filthy dirty filth"] - - expected_result = "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing " \ - " characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " - - janitor = Janitor(ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200) + sequence = ( + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "FILTH. lots of dirty filtHy FIlTh " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "FILTH. lots of filtHy dirty FIlTh " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + ) + + filths = ["filth lots of dirty filthy filth", "filth lots of filthy dirty filth"] + + expected_result = ( + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing " + " characters, 76 to be exact. 
" + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + ) + + janitor = Janitor( + ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200 + ) result = janitor.clean_python(sequence) result = "".join(result) assert result == sequence - for filth in filths: + for filth in filths: janitor.register_contaminant(filth) assert janitor.dirt_ngrams == set(filths) @@ -322,57 +366,63 @@ def test_janitor5(): result = "".join(result) assert result == expected_result + def test_janitor6(): # Same as above but now we add 10 filths and expect the same result, the following test does 11. - sequence = "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "FILTH. lots of dirty filtHy FIlTh " \ - "FILTH. lots of dirty filtHy FIlTh " \ - "FILTH. lots of dirty filtHy FIlTh " \ - "FILTH. lots of dirty filtHy FIlTh " \ - "FILTH. lots of dirty filtHy FIlTh " \ - "FILTH. lots of dirty filtHy FIlTh " \ - "FILTH. lots of dirty filtHy FIlTh " \ - "FILTH. lots of dirty filtHy FIlTh " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "FILTH. lots of filtHy dirty FIlTh " \ - "FILTH. lots of filtHy dirty FIlTh " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " - - - filths = ["filth lots of dirty filthy filth", "filth lots of filthy dirty filth"] - - expected_result = "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing " \ - " characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. 
" - - janitor = Janitor(ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200) + sequence = ( + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "FILTH. lots of dirty filtHy FIlTh " + "FILTH. lots of dirty filtHy FIlTh " + "FILTH. lots of dirty filtHy FIlTh " + "FILTH. lots of dirty filtHy FIlTh " + "FILTH. lots of dirty filtHy FIlTh " + "FILTH. lots of dirty filtHy FIlTh " + "FILTH. lots of dirty filtHy FIlTh " + "FILTH. lots of dirty filtHy FIlTh " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "FILTH. lots of filtHy dirty FIlTh " + "FILTH. lots of filtHy dirty FIlTh " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + ) + + filths = ["filth lots of dirty filthy filth", "filth lots of filthy dirty filth"] + + expected_result = ( + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing " + " characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + ) + + janitor = Janitor( + ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200 + ) result = janitor.clean_python(sequence) result = "".join(result) assert result == sequence - for filth in filths: + for filth in filths: janitor.register_contaminant(filth) assert janitor.dirt_ngrams == set(filths) @@ -380,51 +430,55 @@ def test_janitor6(): result = "".join(result) assert result == expected_result + def test_janitor7(): # Same as above but now we add 9 filths and expect the same result, the following test does 10. - sequence = "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. 
" \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "FILTH. lots of dirty filtHy FIlTh " \ - "FILTH. lots of dirty filtHy FIlTh " \ - "FILTH. lots of dirty filtHy FIlTh " \ - "FILTH. lots of dirty filtHy FIlTh " \ - "FILTH. lots of dirty filtHy FIlTh " \ - "FILTH. lots of dirty filtHy FIlTh " \ - "FILTH. lots of dirty filtHy FIlTh " \ - "FILTH. lots of dirty filtHy FIlTh " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "FILTH. lots of filtHy dirty FIlTh " \ - "FILTH. lots of filtHy dirty FIlTh " \ - "FILTH. lots of filtHy dirty FIlTh " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " \ - "This is a @line #containing a certain number of characters, 76 to be exact. " - - - filths = ["filth lots of dirty filthy filth", "filth lots of filthy dirty filth"] + sequence = ( + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "FILTH. lots of dirty filtHy FIlTh " + "FILTH. lots of dirty filtHy FIlTh " + "FILTH. lots of dirty filtHy FIlTh " + "FILTH. lots of dirty filtHy FIlTh " + "FILTH. lots of dirty filtHy FIlTh " + "FILTH. lots of dirty filtHy FIlTh " + "FILTH. lots of dirty filtHy FIlTh " + "FILTH. lots of dirty filtHy FIlTh " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "FILTH. lots of filtHy dirty FIlTh " + "FILTH. lots of filtHy dirty FIlTh " + "FILTH. lots of filtHy dirty FIlTh " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. " + "This is a @line #containing a certain number of characters, 76 to be exact. 
" + "This is a @line #containing a certain number of characters, 76 to be exact. " + ) + + filths = ["filth lots of dirty filthy filth", "filth lots of filthy dirty filth"] expected_result = "" - janitor = Janitor(ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200) + janitor = Janitor( + ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200 + ) result = janitor.clean_python(sequence) result = "".join(result) assert result == sequence - for filth in filths: + for filth in filths: janitor.register_contaminant(filth) assert janitor.dirt_ngrams == set(filths) @@ -453,23 +507,3 @@ def test_janitor8(): # cleaned = " ".join(jan.clean(source)) # for contam in jan.dirt_ngrams: # assert contam not in cleaned, contam - - - - - - - - - - - - - - - - - - - - diff --git a/tests/test_models.py b/tests/test_models.py index d66949180a..49fdce11e6 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -4,24 +4,59 @@ def test_gpt2(): - gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu") - (ll_dog, ig_dog), (ll_cat, ig_cat), (_, ll_max_0), (_, ll_max_1), (_, ll_max_2), *vals = gpt2.loglikelihood([ - ('The quick brown fox jumps over the lazy', ' dog'), - ('The quick brown fox jumps over the lazy', ' cat'), - ('The quick brown fox jumps over the lazy', ', lazy dog'), - ('The quick brown fox jumps over the lazy', ', lazy fox'), - ('The quick brown fox jumps over the lazy', ', lazy fox and they both fall to the ground'), - - ("""A mult""", """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)"""), - ("""The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons""", """ (with threshold activation); see § Terminology"""), - ("""Multilayer perceptrons are sometimes coll""", """oquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]"""), - ("""An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear""", """ activation function."""), - ("""MLP utilizes a supervised""", """ learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]"""), - ("""Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic""", """ in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. """), - ("""Specifically, we train GPT-3, an autoregressive language model with 175""", """ billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. 
For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general."""), - ("""A mult""", """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)"""), - ("""Hello""", """ World"""), - ]) + gpt2 = models.get_model("gpt2").create_from_arg_string("device=cpu") + ( + (ll_dog, ig_dog), + (ll_cat, ig_cat), + (_, ll_max_0), + (_, ll_max_1), + (_, ll_max_2), + *vals, + ) = gpt2.loglikelihood( + [ + ("The quick brown fox jumps over the lazy", " dog"), + ("The quick brown fox jumps over the lazy", " cat"), + ("The quick brown fox jumps over the lazy", ", lazy dog"), + ("The quick brown fox jumps over the lazy", ", lazy fox"), + ( + "The quick brown fox jumps over the lazy", + ", lazy fox and they both fall to the ground", + ), + ( + """A mult""", + """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)""", + ), + ( + """The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons""", + """ (with threshold activation); see § Terminology""", + ), + ( + """Multilayer perceptrons are sometimes coll""", + """oquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]""", + ), + ( + """An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear""", + """ activation function.""", + ), + ( + """MLP utilizes a supervised""", + """ learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]""", + ), + ( + """Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic""", + """ in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. """, + ), + ( + """Specifically, we train GPT-3, an autoregressive language model with 175""", + """ billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. 
For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.""", + ), + ( + """A mult""", + """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)""", + ), + ("""Hello""", """ World"""), + ] + ) assert ll_dog > ll_cat assert not ig_cat @@ -31,17 +66,24 @@ def test_gpt2(): assert ll_max_2 # test empty context - gpt2.loglikelihood([('', 'test')]) + gpt2.loglikelihood([("", "test")]) - gen, = gpt2.greedy_until([ - ('The quick brown fox jumps over the lazy', ['.', '\n']) - ]) + (gen,) = gpt2.greedy_until( + [("The quick brown fox jumps over the lazy", [".", "\n"])] + ) - assert gen == ', lazy fox and they both fall to the ground' + assert gen == ", lazy fox and they both fall to the ground" targets = [ - -61.60536193847656, -56.57843780517578, -62.131004333496094, -9.799489974975586, -153.96334838867188, - -341.222900390625, -731.1475830078125, -61.60536193847656, -8.682319641113281 + -61.60536193847656, + -56.57843780517578, + -62.131004333496094, + -9.799489974975586, + -153.96334838867188, + -341.222900390625, + -731.1475830078125, + -61.60536193847656, + -8.682319641113281, ] for (pred, _), tgt in zip(vals, targets): @@ -49,21 +91,57 @@ def test_gpt2(): def test_gpt2_perplexity(): - gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu") + gpt2 = models.get_model("gpt2").create_from_arg_string("device=cpu") test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss." 
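# --- Editor's note: a minimal sketch, not part of the patch, of the request
# conventions the reformatted test above relies on. Everything here is taken
# from the test: loglikelihood consumes (context, continuation) pairs and
# yields (logprob, is_greedy) tuples; greedy_until consumes
# (context, stop_sequences) pairs and stops at the first stop string.
from lm_eval import models  # import path assumed from the test's usage

gpt2 = models.get_model("gpt2").create_from_arg_string("device=cpu")

(ll_dog, greedy_dog), (ll_cat, greedy_cat) = gpt2.loglikelihood(
    [
        ("The quick brown fox jumps over the lazy", " dog"),
        ("The quick brown fox jumps over the lazy", " cat"),
    ]
)
assert ll_dog > ll_cat  # " dog" is the likelier continuation
assert not greedy_cat   # greedy decoding would not produce " cat"

# Generation halts at the first stop sequence ("." or "\n" here):
(generation,) = gpt2.greedy_until(
    [("The quick brown fox jumps over the lazy", [".", "\n"])]
)
assert generation == ", lazy fox and they both fall to the ground"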
perplexity = gpt2.loglikelihood_rolling([(test_string,)])[0] - tgt = sum([ - -4.9599953, -8.069298, -8.308624, -10.178513, -8.906924, -1.9318912, -7.745445, -7.146077, -5.2072, - -3.5882986, -1.9957212, -8.044922, -0.20841774, -5.1096807, -0.099879116, -8.888423, -4.6180487, - ]) + tgt = sum( + [ + -4.9599953, + -8.069298, + -8.308624, + -10.178513, + -8.906924, + -1.9318912, + -7.745445, + -7.146077, + -5.2072, + -3.5882986, + -1.9957212, + -8.044922, + -0.20841774, + -5.1096807, + -0.099879116, + -8.888423, + -4.6180487, + ] + ) assert perplexity == pytest.approx(tgt, rel=1e-3) - with mock.patch.object(models.gpt2.HFLM, 'max_length', new_callable=mock.PropertyMock) as mock_max_length: + with mock.patch.object( + models.gpt2.HFLM, "max_length", new_callable=mock.PropertyMock + ) as mock_max_length: mock_max_length.return_value = 5 - gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu") + gpt2 = models.get_model("gpt2").create_from_arg_string("device=cpu") perplexity = gpt2.loglikelihood_rolling([(test_string,)])[0] - tgt = sum([ - -4.96001, -8.069275, -8.308612, -10.178482, -8.90691, -4.037338, -8.09261, -11.662385, -10.206891, - -4.425003, -2.2563353, -7.909143, -1.9304147, -7.3610134, -2.3120654, -7.3229, -2.1643813, - ]) + tgt = sum( + [ + -4.96001, + -8.069275, + -8.308612, + -10.178482, + -8.90691, + -4.037338, + -8.09261, + -11.662385, + -10.206891, + -4.425003, + -2.2563353, + -7.909143, + -1.9304147, + -7.3610134, + -2.3120654, + -7.3229, + -2.1643813, + ] + ) assert perplexity == pytest.approx(tgt, rel=1e-3) diff --git a/tests/test_tasks.py b/tests/test_tasks.py index 46812798a9..ad4ecf9616 100644 --- a/tests/test_tasks.py +++ b/tests/test_tasks.py @@ -6,7 +6,7 @@ @pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items()) def test_basic_interface(taskname, task_class): - print('Evaluating task', taskname) + print("Evaluating task", taskname) # dl = task_class.download # task_class.download = MagicMock() task = task_class() @@ -42,7 +42,7 @@ def test_basic_interface(taskname, task_class): reqs = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr] reqs2 = [task2.construct_requests(doc, task2.doc_to_text(doc)) for doc in arr2] - + assert reqs == reqs2 if task.has_test_docs(): @@ -53,7 +53,7 @@ def test_basic_interface(taskname, task_class): reqs = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr] reqs2 = [task2.construct_requests(doc, task2.doc_to_text(doc)) for doc in arr2] - + assert reqs == reqs2 if task.has_training_docs(): @@ -64,13 +64,13 @@ def test_basic_interface(taskname, task_class): reqs = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr] reqs2 = [task2.construct_requests(doc, task2.doc_to_text(doc)) for doc in arr2] - + assert reqs == reqs2 @pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items()) def test_documents_and_requests(taskname, task_class): - print('Evaluating task', taskname) + print("Evaluating task", taskname) task = task_class() fns = [] if task.has_training_docs(): @@ -83,21 +83,21 @@ def test_documents_and_requests(taskname, task_class): for fn in fns: # print(list(islice(fn(), 10))) for doc in islice(fn(), 10): - + txt = task.doc_to_text(doc) tgt = task.doc_to_target(doc) assert isinstance(txt, str) assert isinstance(tgt, str) - + # space convention # allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on if len(txt) != 0: - assert txt[-1] != ' ' - assert tgt[0] == ' ' or txt[-1] == '\n' + assert txt[-1] != 
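# --- Editor's note: a minimal sketch, not part of the patch, of the rolling
# log-likelihood contract exercised above. Grounded in the test: each request
# is a 1-tuple (no conditioning context) and each return value is the summed
# per-token log-probability (the test's `perplexity` variable holds that sum,
# which pytest.approx compares against sum(tgt)).
from lm_eval import models  # import path assumed from the test's usage

gpt2 = models.get_model("gpt2").create_from_arg_string("device=cpu")
text = (
    "We study empirical scaling laws for language model performance "
    "on the cross-entropy loss."
)
(total_logprob,) = gpt2.loglikelihood_rolling([(text,)])

# With HFLM.max_length mocked down to 5 (as above), the string no longer fits
# in one pass, so it is scored in overlapping windows and the summed value
# changes -- which is exactly what the second tgt list pins down.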
" " + assert tgt[0] == " " or txt[-1] == "\n" reqs = task.construct_requests(doc, txt) - + # construct_requests can return just one request if not isinstance(reqs, (list, tuple)): reqs = [reqs] diff --git a/tests/test_utils.py b/tests/test_utils.py index 3d9547048c..8c219e4414 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -5,8 +5,14 @@ def test_get_rolling_token_windows_v1(): gold = [ ([-100, 0, 1, 2, 3, 4, 5, 6, 7, 8], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), - ([9, 10, 11, 12, 13, 14, 15, 16, 17, 18], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]), - ([19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]), + ( + [9, 10, 11, 12, 13, 14, 15, 16, 17, 18], + [10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + ), + ( + [19, 20, 21, 22, 23, 24, 25, 26, 27, 28], + [20, 21, 22, 23, 24, 25, 26, 27, 28, 29], + ), ([23, 24, 25, 26, 27, 28, 29, 30, 31, 32], [30, 31, 32, 33]), ] x = list(range(34)) @@ -123,7 +129,6 @@ def test_get_rolling_token_windows_v4(): ([17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [27]), ([18, 19, 20, 21, 22, 23, 24, 25, 26, 27], [28]), ([19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [29]), - ] x = list(range(30)) generator = get_rolling_token_windows( @@ -145,8 +150,14 @@ def test_get_rolling_token_windows_v4(): def test_get_rolling_token_windows_v5(): gold = [ ([-100, 0, 1, 2, 3, 4, 5, 6, 7, 8], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), - ([9, 10, 11, 12, 13, 14, 15, 16, 17, 18], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]), - ([19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]), + ( + [9, 10, 11, 12, 13, 14, 15, 16, 17, 18], + [10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + ), + ( + [19, 20, 21, 22, 23, 24, 25, 26, 27, 28], + [20, 21, 22, 23, 24, 25, 26, 27, 28, 29], + ), ] x = list(range(30)) generator = get_rolling_token_windows( @@ -203,5 +214,8 @@ def test_get_rolling_token_windows_empty(): def test_make_disjoint_window(): - assert make_disjoint_window(([1,2,3,4,5], [2,3,4,5,6])) == ([1], [2,3,4,5,6]) - assert make_disjoint_window(([1,2,3,4,5], [4,5,6])) == ([1,2,3], [4,5,6]) \ No newline at end of file + assert make_disjoint_window(([1, 2, 3, 4, 5], [2, 3, 4, 5, 6])) == ( + [1], + [2, 3, 4, 5, 6], + ) + assert make_disjoint_window(([1, 2, 3, 4, 5], [4, 5, 6])) == ([1, 2, 3], [4, 5, 6]) diff --git a/tests/test_version_stable.py b/tests/test_version_stable.py index 7dd36a94b6..bcdb765c1f 100644 --- a/tests/test_version_stable.py +++ b/tests/test_version_stable.py @@ -16,13 +16,14 @@ def assert_target(name, ob): fname = f"tests/testdata/{name}.json" if os.path.exists(fname): with open(fname) as fh: - # Use relative tolerance of 1e-5 and absolute tolerance of 1e-8 - # assuming most metrics work on `float32` values, which is the common + # Use relative tolerance of 1e-5 and absolute tolerance of 1e-8 + # assuming most metrics work on `float32` values, which is the common # default floating type across popular libraries (PyTorch, Tensorflow, and JAX). 
assert flatten(json.load(fh)) == pytest.approx( - flatten(json.loads(json.dumps(ob, sort_keys=True))), rel=1e-5, abs=1e-8) + flatten(json.loads(json.dumps(ob, sort_keys=True))), rel=1e-5, abs=1e-8 + ) else: - with open(fname, 'w') as fh: + with open(fname, "w") as fh: json.dump(ob, fh, sort_keys=True) @@ -30,14 +31,23 @@ def assert_target_hashed(name, ob): fname = f"tests/testdata/{name}" if os.path.exists(fname): with open(fname) as fh: - assert fh.read() == hashlib.sha256(json.dumps(ob, sort_keys=True).encode('utf-8')).hexdigest() + assert ( + fh.read() + == hashlib.sha256( + json.dumps(ob, sort_keys=True).encode("utf-8") + ).hexdigest() + ) else: - with open(fname, 'w') as fh: - fh.write(hashlib.sha256(json.dumps(ob, sort_keys=True).encode('utf-8')).hexdigest()) + with open(fname, "w") as fh: + fh.write( + hashlib.sha256( + json.dumps(ob, sort_keys=True).encode("utf-8") + ).hexdigest() + ) + - # from https://stackoverflow.com/a/6027615 -def flatten(d, parent_key='', sep='.'): +def flatten(d, parent_key="", sep="."): items = [] for k, v in d.items(): new_key = parent_key + sep + k if parent_key else k @@ -47,24 +57,26 @@ def flatten(d, parent_key='', sep='.'): items.append((new_key, v)) return dict(items) + # make sure eval results for a task version are stable + @pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items()) def test_versions_stable(taskname, task_class): task_dict = tasks.get_task_dict([taskname]) - lm = models.get_model('dummy')() + lm = models.get_model("dummy")() def ll_fn(reqs): for ctx, cont in reqs: if len(ctx) == 0: continue # space convention - assert ctx[-1] != ' ' - assert cont[0] == ' ' or ctx[-1] == '\n' - + assert ctx[-1] != " " + assert cont[0] == " " or ctx[-1] == "\n" + assert_target_hashed(f"{taskname}-v{task_class.VERSION}-loglikelihood", reqs) res = [] - + random.seed(42) for _ in reqs: res.append((-random.random(), False)) @@ -72,10 +84,12 @@ def ll_fn(reqs): return res def ll_perp_fn(reqs): - for string, in reqs: + for (string,) in reqs: assert isinstance(string, str) - assert_target_hashed(f"{taskname}-v{task_class.VERSION}-loglikelihood_rolling", reqs) + assert_target_hashed( + f"{taskname}-v{task_class.VERSION}-loglikelihood_rolling", reqs + ) res = [] random.seed(42) @@ -83,14 +97,14 @@ def ll_perp_fn(reqs): res.append(-random.random()) return res - + def greedy_until(reqs): res = [] assert_target_hashed(f"{taskname}-v{task_class.VERSION}-greedy_until", reqs) - + for ctx, _ in reqs: res.append("lol") - assert ctx.strip() != '' + assert ctx.strip() != "" return res @@ -100,12 +114,12 @@ def greedy_until(reqs): limit = None result = evaluator.evaluate( - lm=lm, - task_dict=task_dict, - num_fewshot=0, - limit=limit, - bootstrap_iters=10, - description_dict=None + lm=lm, + task_dict=task_dict, + num_fewshot=0, + limit=limit, + bootstrap_iters=10, + description_dict=None, ) assert_target(f"{taskname}-v{task_class.VERSION}-res", result) diff --git a/tests/testdata/anagrams1-v0-greedy_until b/tests/testdata/anagrams1-v0-greedy_until index 5536425002..2195ebfbf5 100644 --- a/tests/testdata/anagrams1-v0-greedy_until +++ b/tests/testdata/anagrams1-v0-greedy_until @@ -1 +1 @@ -7c0c5246d3f751f39119a5629ac1d4b2c6fd2a315f78d6de9b2c387e24e3fef1 \ No newline at end of file +7c0c5246d3f751f39119a5629ac1d4b2c6fd2a315f78d6de9b2c387e24e3fef1 diff --git a/tests/testdata/anagrams1-v0-res.json b/tests/testdata/anagrams1-v0-res.json index c89528892a..1dde182b14 100644 --- a/tests/testdata/anagrams1-v0-res.json +++ b/tests/testdata/anagrams1-v0-res.json @@ 
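# --- Editor's note: a sketch, not part of the patch, of the version-stability
# scheme behind the testdata fixtures that follow. The sha256 construction is
# copied from assert_target_hashed above; the dotted-key flattening assumes
# the branch elided by the hunk boundary recurses into nested dicts. Note the
# ll_fn hook also re-asserts the harness's space convention: a context never
# ends in a space, and the continuation starts with one unless the context
# ends in "\n".
import hashlib
import json


def request_digest(ob):
    # Same expression as assert_target_hashed: stable JSON, then sha256.
    return hashlib.sha256(
        json.dumps(ob, sort_keys=True).encode("utf-8")
    ).hexdigest()


# Any change to a task's prompts shows up as a changed one-line fixture under
# tests/testdata/, e.g. the *-loglikelihood and *-greedy_until files below.
print(request_digest([("The quick brown fox jumps over the lazy", " dog")]))

# Results are compared after flattening, so nested metrics become dotted keys:
# {"results": {"anagrams1": {"acc": 0.0}}} -> {"results.anagrams1.acc": 0.0}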
-1 +1 @@ -{"results": {"anagrams1": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"anagrams1": 0}} \ No newline at end of file +{"results": {"anagrams1": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"anagrams1": 0}} diff --git a/tests/testdata/anagrams2-v0-greedy_until b/tests/testdata/anagrams2-v0-greedy_until index 9db9d158dc..6349c22e73 100644 --- a/tests/testdata/anagrams2-v0-greedy_until +++ b/tests/testdata/anagrams2-v0-greedy_until @@ -1 +1 @@ -6700a3c44e48abe8337238dcbe3b54cf4abafe0c204c52d921e590872fbd05e7 \ No newline at end of file +6700a3c44e48abe8337238dcbe3b54cf4abafe0c204c52d921e590872fbd05e7 diff --git a/tests/testdata/anagrams2-v0-res.json b/tests/testdata/anagrams2-v0-res.json index f74887fe16..cdf7e295e6 100644 --- a/tests/testdata/anagrams2-v0-res.json +++ b/tests/testdata/anagrams2-v0-res.json @@ -1 +1 @@ -{"results": {"anagrams2": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"anagrams2": 0}} \ No newline at end of file +{"results": {"anagrams2": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"anagrams2": 0}} diff --git a/tests/testdata/anli_r1-v0-loglikelihood b/tests/testdata/anli_r1-v0-loglikelihood index 4450c0628e..b48619eddb 100644 --- a/tests/testdata/anli_r1-v0-loglikelihood +++ b/tests/testdata/anli_r1-v0-loglikelihood @@ -1 +1 @@ -3a84baf2f170e138c6ce0bc9f06f905def35d705fa2b8781f10c87aef404c4cb \ No newline at end of file +3a84baf2f170e138c6ce0bc9f06f905def35d705fa2b8781f10c87aef404c4cb diff --git a/tests/testdata/anli_r1-v0-res.json b/tests/testdata/anli_r1-v0-res.json index b6f6b35018..7b2c32c2b9 100644 --- a/tests/testdata/anli_r1-v0-res.json +++ b/tests/testdata/anli_r1-v0-res.json @@ -1 +1 @@ -{"results": {"anli_r1": {"acc": 0.334, "acc_stderr": 0.014922019523732967}}, "versions": {"anli_r1": 0}} \ No newline at end of file +{"results": {"anli_r1": {"acc": 0.334, "acc_stderr": 0.014922019523732967}}, "versions": {"anli_r1": 0}} diff --git a/tests/testdata/anli_r2-v0-loglikelihood b/tests/testdata/anli_r2-v0-loglikelihood index 4a437fc8a8..1f312526f2 100644 --- a/tests/testdata/anli_r2-v0-loglikelihood +++ b/tests/testdata/anli_r2-v0-loglikelihood @@ -1 +1 @@ -d0ea3c3e09d533982c15b4c034439896d6af4bbafb2254d305e20215534a251d \ No newline at end of file +d0ea3c3e09d533982c15b4c034439896d6af4bbafb2254d305e20215534a251d diff --git a/tests/testdata/anli_r2-v0-res.json b/tests/testdata/anli_r2-v0-res.json index 6dc08ebbaa..81e68ebb47 100644 --- a/tests/testdata/anli_r2-v0-res.json +++ b/tests/testdata/anli_r2-v0-res.json @@ -1 +1 @@ -{"results": {"anli_r2": {"acc": 0.356, "acc_stderr": 0.015149042659306628}}, "versions": {"anli_r2": 0}} \ No newline at end of file +{"results": {"anli_r2": {"acc": 0.356, "acc_stderr": 0.015149042659306628}}, "versions": {"anli_r2": 0}} diff --git a/tests/testdata/anli_r3-v0-loglikelihood b/tests/testdata/anli_r3-v0-loglikelihood index 29d3d67c8b..75bd1be817 100644 --- a/tests/testdata/anli_r3-v0-loglikelihood +++ b/tests/testdata/anli_r3-v0-loglikelihood @@ -1 +1 @@ -6b6e5c6a794f2fbff78b7aa24fe0c90156039334bbd1cb34f7af9fc6e6183845 \ No newline at end of file +6b6e5c6a794f2fbff78b7aa24fe0c90156039334bbd1cb34f7af9fc6e6183845 diff --git a/tests/testdata/anli_r3-v0-res.json b/tests/testdata/anli_r3-v0-res.json index 548dea1e22..b41cf014b7 100644 --- a/tests/testdata/anli_r3-v0-res.json +++ b/tests/testdata/anli_r3-v0-res.json @@ -1 +1 @@ -{"results": {"anli_r3": {"acc": 0.31916666666666665, "acc_stderr": 0.01346230971200514}}, "versions": {"anli_r3": 0}} \ No newline at end of file +{"results": {"anli_r3": {"acc": 
0.31916666666666665, "acc_stderr": 0.01346230971200514}}, "versions": {"anli_r3": 0}} diff --git a/tests/testdata/arc_challenge-v0-loglikelihood b/tests/testdata/arc_challenge-v0-loglikelihood index 91a3560635..9722e1bd9b 100644 --- a/tests/testdata/arc_challenge-v0-loglikelihood +++ b/tests/testdata/arc_challenge-v0-loglikelihood @@ -1 +1 @@ -41c34c96cca8ace661911d0033d630c554b283f5a3953bcdc50720ae6b00a9c1 \ No newline at end of file +41c34c96cca8ace661911d0033d630c554b283f5a3953bcdc50720ae6b00a9c1 diff --git a/tests/testdata/arc_challenge-v0-res.json b/tests/testdata/arc_challenge-v0-res.json index 49f34a7306..e6b74ebaec 100644 --- a/tests/testdata/arc_challenge-v0-res.json +++ b/tests/testdata/arc_challenge-v0-res.json @@ -1 +1 @@ -{"results": {"arc_challenge": {"acc": 0.24488054607508533, "acc_norm": 0.2440273037542662, "acc_norm_stderr": 0.012551447627856257, "acc_stderr": 0.012566273985131354}}, "versions": {"arc_challenge": 0}} \ No newline at end of file +{"results": {"arc_challenge": {"acc": 0.24488054607508533, "acc_norm": 0.2440273037542662, "acc_norm_stderr": 0.012551447627856257, "acc_stderr": 0.012566273985131354}}, "versions": {"arc_challenge": 0}} diff --git a/tests/testdata/arc_easy-v0-loglikelihood b/tests/testdata/arc_easy-v0-loglikelihood index d82be433ab..090cb9eb6c 100644 --- a/tests/testdata/arc_easy-v0-loglikelihood +++ b/tests/testdata/arc_easy-v0-loglikelihood @@ -1 +1 @@ -ffa6e39a35a16299dcb015f17f986aaa598ad8b4840c4cebe0339a7042232741 \ No newline at end of file +ffa6e39a35a16299dcb015f17f986aaa598ad8b4840c4cebe0339a7042232741 diff --git a/tests/testdata/arc_easy-v0-res.json b/tests/testdata/arc_easy-v0-res.json index f217448594..de15549b7a 100644 --- a/tests/testdata/arc_easy-v0-res.json +++ b/tests/testdata/arc_easy-v0-res.json @@ -1 +1 @@ -{"results": {"arc_easy": {"acc": 0.2474747474747475, "acc_norm": 0.24074074074074073, "acc_norm_stderr": 0.008772796145221907, "acc_stderr": 0.008855114414834707}}, "versions": {"arc_easy": 0}} \ No newline at end of file +{"results": {"arc_easy": {"acc": 0.2474747474747475, "acc_norm": 0.24074074074074073, "acc_norm_stderr": 0.008772796145221907, "acc_stderr": 0.008855114414834707}}, "versions": {"arc_easy": 0}} diff --git a/tests/testdata/arithmetic_1dc-v0-loglikelihood b/tests/testdata/arithmetic_1dc-v0-loglikelihood index 01756b4d47..4e4b4919ac 100644 --- a/tests/testdata/arithmetic_1dc-v0-loglikelihood +++ b/tests/testdata/arithmetic_1dc-v0-loglikelihood @@ -1 +1 @@ -04c3a63a6b3c579bd3775d92b3076ba9130041d5ce7cf9244d3f86e95c804387 \ No newline at end of file +04c3a63a6b3c579bd3775d92b3076ba9130041d5ce7cf9244d3f86e95c804387 diff --git a/tests/testdata/arithmetic_1dc-v0-res.json b/tests/testdata/arithmetic_1dc-v0-res.json index 29e447d578..6c8b47d066 100644 --- a/tests/testdata/arithmetic_1dc-v0-res.json +++ b/tests/testdata/arithmetic_1dc-v0-res.json @@ -1 +1 @@ -{"results": {"arithmetic_1dc": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_1dc": 0}} \ No newline at end of file +{"results": {"arithmetic_1dc": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_1dc": 0}} diff --git a/tests/testdata/arithmetic_2da-v0-loglikelihood b/tests/testdata/arithmetic_2da-v0-loglikelihood index fd95bb231e..0792aacf28 100644 --- a/tests/testdata/arithmetic_2da-v0-loglikelihood +++ b/tests/testdata/arithmetic_2da-v0-loglikelihood @@ -1 +1 @@ -6ca1ca6ebd7cac4420d5005f7f35b0edbc921377f5e4f8874cc176e4fb6d79d4 \ No newline at end of file +6ca1ca6ebd7cac4420d5005f7f35b0edbc921377f5e4f8874cc176e4fb6d79d4 diff --git 
a/tests/testdata/arithmetic_2da-v0-res.json b/tests/testdata/arithmetic_2da-v0-res.json index 874256a0b8..34a9d37d6a 100644 --- a/tests/testdata/arithmetic_2da-v0-res.json +++ b/tests/testdata/arithmetic_2da-v0-res.json @@ -1 +1 @@ -{"results": {"arithmetic_2da": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_2da": 0}} \ No newline at end of file +{"results": {"arithmetic_2da": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_2da": 0}} diff --git a/tests/testdata/arithmetic_2dm-v0-loglikelihood b/tests/testdata/arithmetic_2dm-v0-loglikelihood index 7b7adaf862..e9d660a0c7 100644 --- a/tests/testdata/arithmetic_2dm-v0-loglikelihood +++ b/tests/testdata/arithmetic_2dm-v0-loglikelihood @@ -1 +1 @@ -14ac5e510cdf82967d6827a9ca059906ee1db2e347be1b17f36403a157e73552 \ No newline at end of file +14ac5e510cdf82967d6827a9ca059906ee1db2e347be1b17f36403a157e73552 diff --git a/tests/testdata/arithmetic_2dm-v0-res.json b/tests/testdata/arithmetic_2dm-v0-res.json index 8fc5d47310..086c2ce6e1 100644 --- a/tests/testdata/arithmetic_2dm-v0-res.json +++ b/tests/testdata/arithmetic_2dm-v0-res.json @@ -1 +1 @@ -{"results": {"arithmetic_2dm": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_2dm": 0}} \ No newline at end of file +{"results": {"arithmetic_2dm": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_2dm": 0}} diff --git a/tests/testdata/arithmetic_2ds-v0-loglikelihood b/tests/testdata/arithmetic_2ds-v0-loglikelihood index 28f32c92c6..ec2090bcb9 100644 --- a/tests/testdata/arithmetic_2ds-v0-loglikelihood +++ b/tests/testdata/arithmetic_2ds-v0-loglikelihood @@ -1 +1 @@ -66f7ff3b40251ee38fadcbee658e309a200224356fc3efa07d0a490a2c24bfa3 \ No newline at end of file +66f7ff3b40251ee38fadcbee658e309a200224356fc3efa07d0a490a2c24bfa3 diff --git a/tests/testdata/arithmetic_2ds-v0-res.json b/tests/testdata/arithmetic_2ds-v0-res.json index a18e6eec6e..79209c8edd 100644 --- a/tests/testdata/arithmetic_2ds-v0-res.json +++ b/tests/testdata/arithmetic_2ds-v0-res.json @@ -1 +1 @@ -{"results": {"arithmetic_2ds": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_2ds": 0}} \ No newline at end of file +{"results": {"arithmetic_2ds": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_2ds": 0}} diff --git a/tests/testdata/arithmetic_3da-v0-loglikelihood b/tests/testdata/arithmetic_3da-v0-loglikelihood index 6c99dece22..1048cb0a8a 100644 --- a/tests/testdata/arithmetic_3da-v0-loglikelihood +++ b/tests/testdata/arithmetic_3da-v0-loglikelihood @@ -1 +1 @@ -c421f9cd5a5001b80e528441da925128177a04db8526ebcdab543a90b33c9ce2 \ No newline at end of file +c421f9cd5a5001b80e528441da925128177a04db8526ebcdab543a90b33c9ce2 diff --git a/tests/testdata/arithmetic_3da-v0-res.json b/tests/testdata/arithmetic_3da-v0-res.json index 1bbb3eb0c2..596733ef8a 100644 --- a/tests/testdata/arithmetic_3da-v0-res.json +++ b/tests/testdata/arithmetic_3da-v0-res.json @@ -1 +1 @@ -{"results": {"arithmetic_3da": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_3da": 0}} \ No newline at end of file +{"results": {"arithmetic_3da": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_3da": 0}} diff --git a/tests/testdata/arithmetic_3ds-v0-loglikelihood b/tests/testdata/arithmetic_3ds-v0-loglikelihood index 6bc029c520..ad55700d82 100644 --- a/tests/testdata/arithmetic_3ds-v0-loglikelihood +++ b/tests/testdata/arithmetic_3ds-v0-loglikelihood @@ -1 +1 @@ -d3d8bad8827d4530945a1d8b3c7589c0235bbed0bc89e7561a6fdac678f6ce5c \ No newline at end of file 
+d3d8bad8827d4530945a1d8b3c7589c0235bbed0bc89e7561a6fdac678f6ce5c diff --git a/tests/testdata/arithmetic_3ds-v0-res.json b/tests/testdata/arithmetic_3ds-v0-res.json index d76cc9bdf5..1d84cdbd0b 100644 --- a/tests/testdata/arithmetic_3ds-v0-res.json +++ b/tests/testdata/arithmetic_3ds-v0-res.json @@ -1 +1 @@ -{"results": {"arithmetic_3ds": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_3ds": 0}} \ No newline at end of file +{"results": {"arithmetic_3ds": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_3ds": 0}} diff --git a/tests/testdata/arithmetic_4da-v0-loglikelihood b/tests/testdata/arithmetic_4da-v0-loglikelihood index b52790c74b..4b14518f94 100644 --- a/tests/testdata/arithmetic_4da-v0-loglikelihood +++ b/tests/testdata/arithmetic_4da-v0-loglikelihood @@ -1 +1 @@ -d3557beb8b9e5704122c2fc6362b11fbe2c3f2f3cb72aed4462b208767c40e01 \ No newline at end of file +d3557beb8b9e5704122c2fc6362b11fbe2c3f2f3cb72aed4462b208767c40e01 diff --git a/tests/testdata/arithmetic_4da-v0-res.json b/tests/testdata/arithmetic_4da-v0-res.json index 57ce0e3007..698c584bbd 100644 --- a/tests/testdata/arithmetic_4da-v0-res.json +++ b/tests/testdata/arithmetic_4da-v0-res.json @@ -1 +1 @@ -{"results": {"arithmetic_4da": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_4da": 0}} \ No newline at end of file +{"results": {"arithmetic_4da": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_4da": 0}} diff --git a/tests/testdata/arithmetic_4ds-v0-loglikelihood b/tests/testdata/arithmetic_4ds-v0-loglikelihood index 154cf9c594..2c5e3bcae3 100644 --- a/tests/testdata/arithmetic_4ds-v0-loglikelihood +++ b/tests/testdata/arithmetic_4ds-v0-loglikelihood @@ -1 +1 @@ -d915830b8621e66331383bb2ae4c60acebf008e2f94741092ef4c33ea5441037 \ No newline at end of file +d915830b8621e66331383bb2ae4c60acebf008e2f94741092ef4c33ea5441037 diff --git a/tests/testdata/arithmetic_4ds-v0-res.json b/tests/testdata/arithmetic_4ds-v0-res.json index 4321db2604..4408839724 100644 --- a/tests/testdata/arithmetic_4ds-v0-res.json +++ b/tests/testdata/arithmetic_4ds-v0-res.json @@ -1 +1 @@ -{"results": {"arithmetic_4ds": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_4ds": 0}} \ No newline at end of file +{"results": {"arithmetic_4ds": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_4ds": 0}} diff --git a/tests/testdata/arithmetic_5da-v0-loglikelihood b/tests/testdata/arithmetic_5da-v0-loglikelihood index a751332bc6..c977e8d084 100644 --- a/tests/testdata/arithmetic_5da-v0-loglikelihood +++ b/tests/testdata/arithmetic_5da-v0-loglikelihood @@ -1 +1 @@ -49edb1e735660631ea6cc309721e6c0b80b7106a613a6959514852ca48f1130e \ No newline at end of file +49edb1e735660631ea6cc309721e6c0b80b7106a613a6959514852ca48f1130e diff --git a/tests/testdata/arithmetic_5da-v0-res.json b/tests/testdata/arithmetic_5da-v0-res.json index fb9a5671e8..44816832f5 100644 --- a/tests/testdata/arithmetic_5da-v0-res.json +++ b/tests/testdata/arithmetic_5da-v0-res.json @@ -1 +1 @@ -{"results": {"arithmetic_5da": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_5da": 0}} \ No newline at end of file +{"results": {"arithmetic_5da": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_5da": 0}} diff --git a/tests/testdata/arithmetic_5ds-v0-loglikelihood b/tests/testdata/arithmetic_5ds-v0-loglikelihood index 0f959c21f6..d83afd50b3 100644 --- a/tests/testdata/arithmetic_5ds-v0-loglikelihood +++ b/tests/testdata/arithmetic_5ds-v0-loglikelihood @@ -1 +1 @@ 
-2888d6d098a5ef8c1e7f0d8295ba80826e2e04e431f57508dfb71d53e1cd4604 \ No newline at end of file +2888d6d098a5ef8c1e7f0d8295ba80826e2e04e431f57508dfb71d53e1cd4604 diff --git a/tests/testdata/arithmetic_5ds-v0-res.json b/tests/testdata/arithmetic_5ds-v0-res.json index c7773f373d..f5580eb045 100644 --- a/tests/testdata/arithmetic_5ds-v0-res.json +++ b/tests/testdata/arithmetic_5ds-v0-res.json @@ -1 +1 @@ -{"results": {"arithmetic_5ds": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_5ds": 0}} \ No newline at end of file +{"results": {"arithmetic_5ds": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_5ds": 0}} diff --git a/tests/testdata/boolq-v0-loglikelihood b/tests/testdata/boolq-v0-loglikelihood index 14c1bf5f5e..9e546c722d 100644 --- a/tests/testdata/boolq-v0-loglikelihood +++ b/tests/testdata/boolq-v0-loglikelihood @@ -1 +1 @@ -de5aa6f77a2e0fd050b9c272f10c4d5d5581e4f75ffa60926f79e60ae1738960 \ No newline at end of file +de5aa6f77a2e0fd050b9c272f10c4d5d5581e4f75ffa60926f79e60ae1738960 diff --git a/tests/testdata/boolq-v0-res.json b/tests/testdata/boolq-v0-res.json index 2b459d8b28..9c858c4ea5 100644 --- a/tests/testdata/boolq-v0-res.json +++ b/tests/testdata/boolq-v0-res.json @@ -1 +1 @@ -{"results": {"boolq": {"acc": 0.5048929663608562, "acc_stderr": 0.00874463623355505}}, "versions": {"boolq": 0}} \ No newline at end of file +{"results": {"boolq": {"acc": 0.5048929663608562, "acc_stderr": 0.00874463623355505}}, "versions": {"boolq": 0}} diff --git a/tests/testdata/boolq-v1-loglikelihood b/tests/testdata/boolq-v1-loglikelihood index 7811121c9f..ebd0f5ec32 100644 --- a/tests/testdata/boolq-v1-loglikelihood +++ b/tests/testdata/boolq-v1-loglikelihood @@ -1 +1 @@ -6577e0d88572772ef08e64f624c0e3df0953286ae1f118ccef15623b59ffeabf \ No newline at end of file +6577e0d88572772ef08e64f624c0e3df0953286ae1f118ccef15623b59ffeabf diff --git a/tests/testdata/boolq-v1-res.json b/tests/testdata/boolq-v1-res.json index 291b9f122d..ec53d0b932 100644 --- a/tests/testdata/boolq-v1-res.json +++ b/tests/testdata/boolq-v1-res.json @@ -1 +1 @@ -{"results": {"boolq": {"acc": 0.5048929663608562, "acc_stderr": 0.00874463623355505}}, "versions": {"boolq": 1}} \ No newline at end of file +{"results": {"boolq": {"acc": 0.5048929663608562, "acc_stderr": 0.00874463623355505}}, "versions": {"boolq": 1}} diff --git a/tests/testdata/cb-v0-loglikelihood b/tests/testdata/cb-v0-loglikelihood index 6fa6f6dae6..01e69edb0c 100644 --- a/tests/testdata/cb-v0-loglikelihood +++ b/tests/testdata/cb-v0-loglikelihood @@ -1 +1 @@ -ec3b1bbb9561e39c43c6f77a23b4060b15c606141c5346e3d0791b3e92aaa5d0 \ No newline at end of file +ec3b1bbb9561e39c43c6f77a23b4060b15c606141c5346e3d0791b3e92aaa5d0 diff --git a/tests/testdata/cb-v0-res.json b/tests/testdata/cb-v0-res.json index ba386fd6c7..6adeae3439 100644 --- a/tests/testdata/cb-v0-res.json +++ b/tests/testdata/cb-v0-res.json @@ -1 +1 @@ -{"results": {"cb": {"acc": 0.3392857142857143, "acc_stderr": 0.06384226561930825, "f1": 0.2819143819143819}}, "versions": {"cb": 0}} \ No newline at end of file +{"results": {"cb": {"acc": 0.3392857142857143, "acc_stderr": 0.06384226561930825, "f1": 0.2819143819143819}}, "versions": {"cb": 0}} diff --git a/tests/testdata/cb-v1-loglikelihood b/tests/testdata/cb-v1-loglikelihood index ad7e928fe6..6052306670 100644 --- a/tests/testdata/cb-v1-loglikelihood +++ b/tests/testdata/cb-v1-loglikelihood @@ -1 +1 @@ -77b11f4348eb8a7f57faf95c531fda01ab4bf0e729f91a82451ed8e71ec8e66d \ No newline at end of file 
+77b11f4348eb8a7f57faf95c531fda01ab4bf0e729f91a82451ed8e71ec8e66d diff --git a/tests/testdata/cb-v1-res.json b/tests/testdata/cb-v1-res.json index 1cff410b2c..44cca02ed9 100644 --- a/tests/testdata/cb-v1-res.json +++ b/tests/testdata/cb-v1-res.json @@ -1 +1 @@ -{"results": {"cb": {"acc": 0.3392857142857143, "acc_stderr": 0.06384226561930825, "f1": 0.2819143819143819}}, "versions": {"cb": 1}} \ No newline at end of file +{"results": {"cb": {"acc": 0.3392857142857143, "acc_stderr": 0.06384226561930825, "f1": 0.2819143819143819}}, "versions": {"cb": 1}} diff --git a/tests/testdata/cola-v0-loglikelihood b/tests/testdata/cola-v0-loglikelihood index 45737909e7..396e1f1560 100644 --- a/tests/testdata/cola-v0-loglikelihood +++ b/tests/testdata/cola-v0-loglikelihood @@ -1 +1 @@ -e8635578ed8ee70b707a666d35e468b9321db24470f80c92080651e2bfa01751 \ No newline at end of file +e8635578ed8ee70b707a666d35e468b9321db24470f80c92080651e2bfa01751 diff --git a/tests/testdata/cola-v0-res.json b/tests/testdata/cola-v0-res.json index 462e5d9401..11d13c0d8c 100644 --- a/tests/testdata/cola-v0-res.json +++ b/tests/testdata/cola-v0-res.json @@ -1 +1 @@ -{"results": {"cola": {"mcc": -0.04538802810223175, "mcc_stderr": 0.023100371589225246}}, "versions": {"cola": 0}} \ No newline at end of file +{"results": {"cola": {"mcc": -0.04538802810223175, "mcc_stderr": 0.023100371589225246}}, "versions": {"cola": 0}} diff --git a/tests/testdata/copa-v0-loglikelihood b/tests/testdata/copa-v0-loglikelihood index ebe4c6512a..9636e9f8bd 100644 --- a/tests/testdata/copa-v0-loglikelihood +++ b/tests/testdata/copa-v0-loglikelihood @@ -1 +1 @@ -66276b9045b5300cba4b81340db06f674f031fa0b8883714ad0d03be464cd799 \ No newline at end of file +66276b9045b5300cba4b81340db06f674f031fa0b8883714ad0d03be464cd799 diff --git a/tests/testdata/copa-v0-res.json b/tests/testdata/copa-v0-res.json index 9a537ec768..659b3bddce 100644 --- a/tests/testdata/copa-v0-res.json +++ b/tests/testdata/copa-v0-res.json @@ -1 +1 @@ -{"results": {"copa": {"acc": 0.48, "acc_stderr": 0.050211673156867795}}, "versions": {"copa": 0}} \ No newline at end of file +{"results": {"copa": {"acc": 0.48, "acc_stderr": 0.050211673156867795}}, "versions": {"copa": 0}} diff --git a/tests/testdata/coqa-v0-greedy_until b/tests/testdata/coqa-v0-greedy_until index c1a9e165a7..fc8eff9390 100644 --- a/tests/testdata/coqa-v0-greedy_until +++ b/tests/testdata/coqa-v0-greedy_until @@ -1 +1 @@ -4a8605d5deed0423ec095700251ed93325b45d320aca35d4ce1e94702094435e \ No newline at end of file +4a8605d5deed0423ec095700251ed93325b45d320aca35d4ce1e94702094435e diff --git a/tests/testdata/coqa-v0-res.json b/tests/testdata/coqa-v0-res.json index 9ca8024e3b..d91e4e6113 100644 --- a/tests/testdata/coqa-v0-res.json +++ b/tests/testdata/coqa-v0-res.json @@ -1 +1 @@ -{"results": {"coqa": {"em": 0.0, "em_stderr": 0.0, "f1": 0.0, "f1_stderr": 0.0}}, "versions": {"coqa": 0}} \ No newline at end of file +{"results": {"coqa": {"em": 0.0, "em_stderr": 0.0, "f1": 0.0, "f1_stderr": 0.0}}, "versions": {"coqa": 0}} diff --git a/tests/testdata/coqa-v1-greedy_until b/tests/testdata/coqa-v1-greedy_until index f6e3f64b18..0669bc24c5 100644 --- a/tests/testdata/coqa-v1-greedy_until +++ b/tests/testdata/coqa-v1-greedy_until @@ -1 +1 @@ -57581470b921435d40da97872bb1cfda6ecf963ccc4b0240a3b04e3fea8c8e3a \ No newline at end of file +57581470b921435d40da97872bb1cfda6ecf963ccc4b0240a3b04e3fea8c8e3a diff --git a/tests/testdata/coqa-v1-res.json b/tests/testdata/coqa-v1-res.json index 7941ad6299..4778173060 100644 --- 
a/tests/testdata/coqa-v1-res.json +++ b/tests/testdata/coqa-v1-res.json @@ -1 +1 @@ -{"results": {"coqa": {"em": 0.0, "em_stderr": 0.0, "f1": 0.0, "f1_stderr": 0.0}}, "versions": {"coqa": 1}} \ No newline at end of file +{"results": {"coqa": {"em": 0.0, "em_stderr": 0.0, "f1": 0.0, "f1_stderr": 0.0}}, "versions": {"coqa": 1}} diff --git a/tests/testdata/cycle_letters-v0-greedy_until b/tests/testdata/cycle_letters-v0-greedy_until index 9068a24ef5..5dcb1c46f0 100644 --- a/tests/testdata/cycle_letters-v0-greedy_until +++ b/tests/testdata/cycle_letters-v0-greedy_until @@ -1 +1 @@ -eb23f7d5de7528eefd8ed5f8054c402ff947319cccfef7195995946f99389201 \ No newline at end of file +eb23f7d5de7528eefd8ed5f8054c402ff947319cccfef7195995946f99389201 diff --git a/tests/testdata/cycle_letters-v0-res.json b/tests/testdata/cycle_letters-v0-res.json index 5b05a9430e..0048e6b709 100644 --- a/tests/testdata/cycle_letters-v0-res.json +++ b/tests/testdata/cycle_letters-v0-res.json @@ -1 +1 @@ -{"results": {"cycle_letters": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"cycle_letters": 0}} \ No newline at end of file +{"results": {"cycle_letters": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"cycle_letters": 0}} diff --git a/tests/testdata/drop-v0-greedy_until b/tests/testdata/drop-v0-greedy_until index 6470b349d2..6a6f4da4ea 100644 --- a/tests/testdata/drop-v0-greedy_until +++ b/tests/testdata/drop-v0-greedy_until @@ -1 +1 @@ -ca566c630d8ac853d5785d4b5c40a5137172c34b48af3350e1f79e6d548b36ba \ No newline at end of file +ca566c630d8ac853d5785d4b5c40a5137172c34b48af3350e1f79e6d548b36ba diff --git a/tests/testdata/drop-v0-res.json b/tests/testdata/drop-v0-res.json index 9384ca72fe..a60d623006 100644 --- a/tests/testdata/drop-v0-res.json +++ b/tests/testdata/drop-v0-res.json @@ -1 +1 @@ -{"results": {"drop": {"em": 0.0, "em_stderr": 0.0, "f1": 0.0, "f1_stderr": 0.0}}, "versions": {"drop": 0}} \ No newline at end of file +{"results": {"drop": {"em": 0.0, "em_stderr": 0.0, "f1": 0.0, "f1_stderr": 0.0}}, "versions": {"drop": 0}} diff --git a/tests/testdata/drop-v1-greedy_until b/tests/testdata/drop-v1-greedy_until index 3b2b697c91..7695a37e7c 100644 --- a/tests/testdata/drop-v1-greedy_until +++ b/tests/testdata/drop-v1-greedy_until @@ -1 +1 @@ -a670f911ab2999d72db15f534b22703d19e7837edbda4f9f199ad587f7aae6b2 \ No newline at end of file +a670f911ab2999d72db15f534b22703d19e7837edbda4f9f199ad587f7aae6b2 diff --git a/tests/testdata/drop-v1-res.json b/tests/testdata/drop-v1-res.json index 8f397b410d..d11936576c 100644 --- a/tests/testdata/drop-v1-res.json +++ b/tests/testdata/drop-v1-res.json @@ -1 +1 @@ -{"results": {"drop": {"em": 0.0, "em_stderr": 0.0, "f1": 0.0, "f1_stderr": 0.0}}, "versions": {"drop": 1}} \ No newline at end of file +{"results": {"drop": {"em": 0.0, "em_stderr": 0.0, "f1": 0.0, "f1_stderr": 0.0}}, "versions": {"drop": 1}} diff --git a/tests/testdata/ethics_cm-v0-loglikelihood b/tests/testdata/ethics_cm-v0-loglikelihood index 69289144e0..208dbc2829 100644 --- a/tests/testdata/ethics_cm-v0-loglikelihood +++ b/tests/testdata/ethics_cm-v0-loglikelihood @@ -1 +1 @@ -92d136ebb2bd86cd036e61699ad9a1417dbb48651f0a3afa5045cf57cef5a3f6 \ No newline at end of file +92d136ebb2bd86cd036e61699ad9a1417dbb48651f0a3afa5045cf57cef5a3f6 diff --git a/tests/testdata/ethics_cm-v0-res.json b/tests/testdata/ethics_cm-v0-res.json index f81a700903..5234987304 100644 --- a/tests/testdata/ethics_cm-v0-res.json +++ b/tests/testdata/ethics_cm-v0-res.json @@ -1 +1 @@ -{"results": {"ethics_cm": {"acc": 0.49987129987129986, 
"acc_stderr": 0.008022881531793336}}, "versions": {"ethics_cm": 0}} \ No newline at end of file +{"results": {"ethics_cm": {"acc": 0.49987129987129986, "acc_stderr": 0.008022881531793336}}, "versions": {"ethics_cm": 0}} diff --git a/tests/testdata/ethics_deontology-v0-loglikelihood b/tests/testdata/ethics_deontology-v0-loglikelihood index ab01349737..94ba432a0c 100644 --- a/tests/testdata/ethics_deontology-v0-loglikelihood +++ b/tests/testdata/ethics_deontology-v0-loglikelihood @@ -1 +1 @@ -74ecebe322457d70afc16fde848978410a09b854dc65c47f428d100bd1593248 \ No newline at end of file +74ecebe322457d70afc16fde848978410a09b854dc65c47f428d100bd1593248 diff --git a/tests/testdata/ethics_deontology-v0-res.json b/tests/testdata/ethics_deontology-v0-res.json index 3af24f414a..c8988b2188 100644 --- a/tests/testdata/ethics_deontology-v0-res.json +++ b/tests/testdata/ethics_deontology-v0-res.json @@ -1 +1 @@ -{"results": {"ethics_deontology": {"acc": 0.503615127919911, "acc_stderr": 0.008338908432085105, "em": 0.07119021134593993}}, "versions": {"ethics_deontology": 0}} \ No newline at end of file +{"results": {"ethics_deontology": {"acc": 0.503615127919911, "acc_stderr": 0.008338908432085105, "em": 0.07119021134593993}}, "versions": {"ethics_deontology": 0}} diff --git a/tests/testdata/ethics_justice-v0-loglikelihood b/tests/testdata/ethics_justice-v0-loglikelihood index cc18a7e67b..4688ceb090 100644 --- a/tests/testdata/ethics_justice-v0-loglikelihood +++ b/tests/testdata/ethics_justice-v0-loglikelihood @@ -1 +1 @@ -d7dfc44fea507b5c5c3a8218f79ed8197da8599ebb396d85feb91c25512126b6 \ No newline at end of file +d7dfc44fea507b5c5c3a8218f79ed8197da8599ebb396d85feb91c25512126b6 diff --git a/tests/testdata/ethics_justice-v0-res.json b/tests/testdata/ethics_justice-v0-res.json index 39efbc506a..3a82972c3b 100644 --- a/tests/testdata/ethics_justice-v0-res.json +++ b/tests/testdata/ethics_justice-v0-res.json @@ -1 +1 @@ -{"results": {"ethics_justice": {"acc": 0.49556213017751477, "acc_stderr": 0.009616784279885177, "em": 0.057692307692307696}}, "versions": {"ethics_justice": 0}} \ No newline at end of file +{"results": {"ethics_justice": {"acc": 0.49556213017751477, "acc_stderr": 0.009616784279885177, "em": 0.057692307692307696}}, "versions": {"ethics_justice": 0}} diff --git a/tests/testdata/ethics_utilitarianism-v0-loglikelihood b/tests/testdata/ethics_utilitarianism-v0-loglikelihood index 0c01f54880..ba1de12b7b 100644 --- a/tests/testdata/ethics_utilitarianism-v0-loglikelihood +++ b/tests/testdata/ethics_utilitarianism-v0-loglikelihood @@ -1 +1 @@ -88872f1ed1b203f9649a4ced4fb4627d18c17af455d713de6e17c05eced4ec60 \ No newline at end of file +88872f1ed1b203f9649a4ced4fb4627d18c17af455d713de6e17c05eced4ec60 diff --git a/tests/testdata/ethics_utilitarianism-v0-res.json b/tests/testdata/ethics_utilitarianism-v0-res.json index 857af346b4..a2bb3786bb 100644 --- a/tests/testdata/ethics_utilitarianism-v0-res.json +++ b/tests/testdata/ethics_utilitarianism-v0-res.json @@ -1 +1 @@ -{"results": {"ethics_utilitarianism": {"acc": 0.49771214642262895, "acc_stderr": 0.007211546310787838}}, "versions": {"ethics_utilitarianism": 0}} \ No newline at end of file +{"results": {"ethics_utilitarianism": {"acc": 0.49771214642262895, "acc_stderr": 0.007211546310787838}}, "versions": {"ethics_utilitarianism": 0}} diff --git a/tests/testdata/ethics_utilitarianism_original-v0-loglikelihood b/tests/testdata/ethics_utilitarianism_original-v0-loglikelihood index bd3ff6c459..34f7c0b13a 100644 --- 
a/tests/testdata/ethics_utilitarianism_original-v0-loglikelihood +++ b/tests/testdata/ethics_utilitarianism_original-v0-loglikelihood @@ -1 +1 @@ -5b42ba1faf5ece6a6ec9a3976ce79c1fac8df5b98272aab85457188c2142693c \ No newline at end of file +5b42ba1faf5ece6a6ec9a3976ce79c1fac8df5b98272aab85457188c2142693c diff --git a/tests/testdata/ethics_utilitarianism_original-v0-res.json b/tests/testdata/ethics_utilitarianism_original-v0-res.json index 16940c8f5a..b16cc34831 100644 --- a/tests/testdata/ethics_utilitarianism_original-v0-res.json +++ b/tests/testdata/ethics_utilitarianism_original-v0-res.json @@ -1 +1 @@ -{"results": {"ethics_utilitarianism_original": {"acc": 0.5214226289517471, "acc_stderr": 0.007204999520618661}}, "versions": {"ethics_utilitarianism_original": 0}} \ No newline at end of file +{"results": {"ethics_utilitarianism_original": {"acc": 0.5214226289517471, "acc_stderr": 0.007204999520618661}}, "versions": {"ethics_utilitarianism_original": 0}} diff --git a/tests/testdata/ethics_virtue-v0-loglikelihood b/tests/testdata/ethics_virtue-v0-loglikelihood index 48652c4689..eb892a6cf6 100644 --- a/tests/testdata/ethics_virtue-v0-loglikelihood +++ b/tests/testdata/ethics_virtue-v0-loglikelihood @@ -1 +1 @@ -8021db8de46850090ddae6e6ec2d382029c3027b7c69884607503f916d09b709 \ No newline at end of file +8021db8de46850090ddae6e6ec2d382029c3027b7c69884607503f916d09b709 diff --git a/tests/testdata/ethics_virtue-v0-res.json b/tests/testdata/ethics_virtue-v0-res.json index cf3e02d826..cb98c99e62 100644 --- a/tests/testdata/ethics_virtue-v0-res.json +++ b/tests/testdata/ethics_virtue-v0-res.json @@ -1 +1 @@ -{"results": {"ethics_virtue": {"acc": 0.5035175879396985, "acc_stderr": 0.0070893491553555765, "em": 0.036180904522613064}}, "versions": {"ethics_virtue": 0}} \ No newline at end of file +{"results": {"ethics_virtue": {"acc": 0.5035175879396985, "acc_stderr": 0.0070893491553555765, "em": 0.036180904522613064}}, "versions": {"ethics_virtue": 0}} diff --git a/tests/testdata/gsm8k-v0-greedy_until b/tests/testdata/gsm8k-v0-greedy_until index d49400007f..43a57cab39 100644 --- a/tests/testdata/gsm8k-v0-greedy_until +++ b/tests/testdata/gsm8k-v0-greedy_until @@ -1 +1 @@ -e7292dbdd7fd8419ba954f2e0701e04c8d0e8842fe053dbf2fe47d926630e35e \ No newline at end of file +e7292dbdd7fd8419ba954f2e0701e04c8d0e8842fe053dbf2fe47d926630e35e diff --git a/tests/testdata/gsm8k-v0-res.json b/tests/testdata/gsm8k-v0-res.json index fb6514a0e7..f542395c78 100644 --- a/tests/testdata/gsm8k-v0-res.json +++ b/tests/testdata/gsm8k-v0-res.json @@ -1 +1 @@ -{"results": {"gsm8k": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"gsm8k": 0}} \ No newline at end of file +{"results": {"gsm8k": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"gsm8k": 0}} diff --git a/tests/testdata/headqa-v0-loglikelihood b/tests/testdata/headqa-v0-loglikelihood index 9129d834b6..e991bf5a0b 100644 --- a/tests/testdata/headqa-v0-loglikelihood +++ b/tests/testdata/headqa-v0-loglikelihood @@ -1 +1 @@ -767ca34d9714edd9fb030ddbcc35a64e5180d1e247b0cb557fbb22fdf971ad1f \ No newline at end of file +767ca34d9714edd9fb030ddbcc35a64e5180d1e247b0cb557fbb22fdf971ad1f diff --git a/tests/testdata/headqa-v0-res.json b/tests/testdata/headqa-v0-res.json index adc093cf62..dba0624600 100644 --- a/tests/testdata/headqa-v0-res.json +++ b/tests/testdata/headqa-v0-res.json @@ -1 +1 @@ -{"results": {"headqa": {"acc": 0.23559445660102116, "acc_norm": 0.25018234865062, "acc_norm_stderr": 0.008272783230806014, "acc_stderr": 0.008105688874297972}}, "versions": {"headqa": 0}} \ 
No newline at end of file +{"results": {"headqa": {"acc": 0.23559445660102116, "acc_norm": 0.25018234865062, "acc_norm_stderr": 0.008272783230806014, "acc_stderr": 0.008105688874297972}}, "versions": {"headqa": 0}} diff --git a/tests/testdata/headqa_en-v0-loglikelihood b/tests/testdata/headqa_en-v0-loglikelihood index 11f07878fb..c8d26b86e2 100644 --- a/tests/testdata/headqa_en-v0-loglikelihood +++ b/tests/testdata/headqa_en-v0-loglikelihood @@ -1 +1 @@ -09da45119b12a0144e3081f8fb790c2a22af7b9c3aac42f54423d348a711fbf5 \ No newline at end of file +09da45119b12a0144e3081f8fb790c2a22af7b9c3aac42f54423d348a711fbf5 diff --git a/tests/testdata/headqa_en-v0-res.json b/tests/testdata/headqa_en-v0-res.json index 6ac5a9c0b8..b2353a4d44 100644 --- a/tests/testdata/headqa_en-v0-res.json +++ b/tests/testdata/headqa_en-v0-res.json @@ -1 +1 @@ -{"results": {"headqa_en": {"acc": 0.23559445660102116, "acc_norm": 0.2447118891320204, "acc_norm_stderr": 0.008211629406841468, "acc_stderr": 0.008105688874297972}}, "versions": {"headqa_en": 0}} \ No newline at end of file +{"results": {"headqa_en": {"acc": 0.23559445660102116, "acc_norm": 0.2447118891320204, "acc_norm_stderr": 0.008211629406841468, "acc_stderr": 0.008105688874297972}}, "versions": {"headqa_en": 0}} diff --git a/tests/testdata/headqa_es-v0-loglikelihood b/tests/testdata/headqa_es-v0-loglikelihood index 9129d834b6..e991bf5a0b 100644 --- a/tests/testdata/headqa_es-v0-loglikelihood +++ b/tests/testdata/headqa_es-v0-loglikelihood @@ -1 +1 @@ -767ca34d9714edd9fb030ddbcc35a64e5180d1e247b0cb557fbb22fdf971ad1f \ No newline at end of file +767ca34d9714edd9fb030ddbcc35a64e5180d1e247b0cb557fbb22fdf971ad1f diff --git a/tests/testdata/headqa_es-v0-res.json b/tests/testdata/headqa_es-v0-res.json index 0964db9bbb..878dd5197a 100644 --- a/tests/testdata/headqa_es-v0-res.json +++ b/tests/testdata/headqa_es-v0-res.json @@ -1 +1 @@ -{"results": {"headqa_es": {"acc": 0.23559445660102116, "acc_norm": 0.25018234865062, "acc_norm_stderr": 0.008272783230806014, "acc_stderr": 0.008105688874297972}}, "versions": {"headqa_es": 0}} \ No newline at end of file +{"results": {"headqa_es": {"acc": 0.23559445660102116, "acc_norm": 0.25018234865062, "acc_norm_stderr": 0.008272783230806014, "acc_stderr": 0.008105688874297972}}, "versions": {"headqa_es": 0}} diff --git a/tests/testdata/hellaswag-v0-loglikelihood b/tests/testdata/hellaswag-v0-loglikelihood index c679a3e311..96ad6cac9a 100644 --- a/tests/testdata/hellaswag-v0-loglikelihood +++ b/tests/testdata/hellaswag-v0-loglikelihood @@ -1 +1 @@ -abb808c97d6529eda6c11067837a132c62d25cba0394d720f80cca6df9f7196e \ No newline at end of file +abb808c97d6529eda6c11067837a132c62d25cba0394d720f80cca6df9f7196e diff --git a/tests/testdata/hellaswag-v0-res.json b/tests/testdata/hellaswag-v0-res.json index 6be94a6409..8f24fceed4 100644 --- a/tests/testdata/hellaswag-v0-res.json +++ b/tests/testdata/hellaswag-v0-res.json @@ -1 +1 @@ -{"results": {"hellaswag": {"acc": 0.24965146385182235, "acc_norm": 0.24756024696275641, "acc_norm_stderr": 0.004307128573285236, "acc_stderr": 0.004319267432460666}}, "versions": {"hellaswag": 0}} \ No newline at end of file +{"results": {"hellaswag": {"acc": 0.24965146385182235, "acc_norm": 0.24756024696275641, "acc_norm_stderr": 0.004307128573285236, "acc_stderr": 0.004319267432460666}}, "versions": {"hellaswag": 0}} diff --git a/tests/testdata/hendrycksTest-abstract_algebra-v0-loglikelihood b/tests/testdata/hendrycksTest-abstract_algebra-v0-loglikelihood index d0d0fe872b..7563b14750 100644 --- 
a/tests/testdata/hendrycksTest-abstract_algebra-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-abstract_algebra-v0-loglikelihood
@@ -1 +1 @@
-e35d1eeb356ac1084d4e9773f028cb3c81ba1c6e5574d598ac4a78aa467cd797
\ No newline at end of file
+e35d1eeb356ac1084d4e9773f028cb3c81ba1c6e5574d598ac4a78aa467cd797
diff --git a/tests/testdata/hendrycksTest-abstract_algebra-v0-res.json b/tests/testdata/hendrycksTest-abstract_algebra-v0-res.json
index dc2c9a0d7d..7fc82d2528 100644
--- a/tests/testdata/hendrycksTest-abstract_algebra-v0-res.json
+++ b/tests/testdata/hendrycksTest-abstract_algebra-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-abstract_algebra": {"acc": 0.32, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695235, "acc_stderr": 0.04688261722621504}}, "versions": {"hendrycksTest-abstract_algebra": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-abstract_algebra": {"acc": 0.32, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695235, "acc_stderr": 0.04688261722621504}}, "versions": {"hendrycksTest-abstract_algebra": 0}}
diff --git a/tests/testdata/hendrycksTest-anatomy-v0-loglikelihood b/tests/testdata/hendrycksTest-anatomy-v0-loglikelihood
index a7ae5fa705..fe9b56c6eb 100644
--- a/tests/testdata/hendrycksTest-anatomy-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-anatomy-v0-loglikelihood
@@ -1 +1 @@
-bf05e04ed8cf61cf3aad294ed3f5a16137775ffdd20f1b129022ddffc1251768
\ No newline at end of file
+bf05e04ed8cf61cf3aad294ed3f5a16137775ffdd20f1b129022ddffc1251768
diff --git a/tests/testdata/hendrycksTest-anatomy-v0-res.json b/tests/testdata/hendrycksTest-anatomy-v0-res.json
index 67bc2e7be6..32086e1f03 100644
--- a/tests/testdata/hendrycksTest-anatomy-v0-res.json
+++ b/tests/testdata/hendrycksTest-anatomy-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-anatomy": {"acc": 0.2222222222222222, "acc_norm": 0.23703703703703705, "acc_norm_stderr": 0.03673731683969506, "acc_stderr": 0.0359144408419697}}, "versions": {"hendrycksTest-anatomy": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-anatomy": {"acc": 0.2222222222222222, "acc_norm": 0.23703703703703705, "acc_norm_stderr": 0.03673731683969506, "acc_stderr": 0.0359144408419697}}, "versions": {"hendrycksTest-anatomy": 0}}
diff --git a/tests/testdata/hendrycksTest-astronomy-v0-loglikelihood b/tests/testdata/hendrycksTest-astronomy-v0-loglikelihood
index 8ecb637cfe..1a2082a1cb 100644
--- a/tests/testdata/hendrycksTest-astronomy-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-astronomy-v0-loglikelihood
@@ -1 +1 @@
-bed1e47127cc2893c6aef63b9a0909cca31aa351a703da2a166b01cae03c3311
\ No newline at end of file
+bed1e47127cc2893c6aef63b9a0909cca31aa351a703da2a166b01cae03c3311
diff --git a/tests/testdata/hendrycksTest-astronomy-v0-res.json b/tests/testdata/hendrycksTest-astronomy-v0-res.json
index d3626ccf80..c9754d0619 100644
--- a/tests/testdata/hendrycksTest-astronomy-v0-res.json
+++ b/tests/testdata/hendrycksTest-astronomy-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-astronomy": {"acc": 0.2565789473684211, "acc_norm": 0.29605263157894735, "acc_norm_stderr": 0.03715062154998904, "acc_stderr": 0.0355418036802569}}, "versions": {"hendrycksTest-astronomy": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-astronomy": {"acc": 0.2565789473684211, "acc_norm": 0.29605263157894735, "acc_norm_stderr": 0.03715062154998904, "acc_stderr": 0.0355418036802569}}, "versions": {"hendrycksTest-astronomy": 0}}
diff --git a/tests/testdata/hendrycksTest-business_ethics-v0-loglikelihood b/tests/testdata/hendrycksTest-business_ethics-v0-loglikelihood
index a0f8b7c09b..6a4bc72e2a 100644
--- a/tests/testdata/hendrycksTest-business_ethics-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-business_ethics-v0-loglikelihood
@@ -1 +1 @@
-b3b27e9dbad587377d3c8cab1072782de883e245da93a563bd8b3099017b1fc0
\ No newline at end of file
+b3b27e9dbad587377d3c8cab1072782de883e245da93a563bd8b3099017b1fc0
diff --git a/tests/testdata/hendrycksTest-business_ethics-v0-res.json b/tests/testdata/hendrycksTest-business_ethics-v0-res.json
index dcc5116204..b57f8c3e87 100644
--- a/tests/testdata/hendrycksTest-business_ethics-v0-res.json
+++ b/tests/testdata/hendrycksTest-business_ethics-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-business_ethics": {"acc": 0.29, "acc_norm": 0.27, "acc_norm_stderr": 0.044619604333847394, "acc_stderr": 0.045604802157206845}}, "versions": {"hendrycksTest-business_ethics": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-business_ethics": {"acc": 0.29, "acc_norm": 0.27, "acc_norm_stderr": 0.044619604333847394, "acc_stderr": 0.045604802157206845}}, "versions": {"hendrycksTest-business_ethics": 0}}
diff --git a/tests/testdata/hendrycksTest-clinical_knowledge-v0-loglikelihood b/tests/testdata/hendrycksTest-clinical_knowledge-v0-loglikelihood
index 86f54245d5..6734238740 100644
--- a/tests/testdata/hendrycksTest-clinical_knowledge-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-clinical_knowledge-v0-loglikelihood
@@ -1 +1 @@
-fbcb7ce507e0675d811e71e10a67c8d05a6605e29036f46776e04a6588cefbda
\ No newline at end of file
+fbcb7ce507e0675d811e71e10a67c8d05a6605e29036f46776e04a6588cefbda
diff --git a/tests/testdata/hendrycksTest-clinical_knowledge-v0-res.json b/tests/testdata/hendrycksTest-clinical_knowledge-v0-res.json
index 596bb28a93..7277e99c86 100644
--- a/tests/testdata/hendrycksTest-clinical_knowledge-v0-res.json
+++ b/tests/testdata/hendrycksTest-clinical_knowledge-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-clinical_knowledge": {"acc": 0.23773584905660378, "acc_norm": 0.27169811320754716, "acc_norm_stderr": 0.027377706624670713, "acc_stderr": 0.02619980880756191}}, "versions": {"hendrycksTest-clinical_knowledge": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-clinical_knowledge": {"acc": 0.23773584905660378, "acc_norm": 0.27169811320754716, "acc_norm_stderr": 0.027377706624670713, "acc_stderr": 0.02619980880756191}}, "versions": {"hendrycksTest-clinical_knowledge": 0}}
diff --git a/tests/testdata/hendrycksTest-college_biology-v0-loglikelihood b/tests/testdata/hendrycksTest-college_biology-v0-loglikelihood
index 7f665ef4a1..0041165ecb 100644
--- a/tests/testdata/hendrycksTest-college_biology-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-college_biology-v0-loglikelihood
@@ -1 +1 @@
-c29e4e67ff91af29b9434884874414d1b1b32ccc32903c6b1639469b19907419
\ No newline at end of file
+c29e4e67ff91af29b9434884874414d1b1b32ccc32903c6b1639469b19907419
diff --git a/tests/testdata/hendrycksTest-college_biology-v0-res.json b/tests/testdata/hendrycksTest-college_biology-v0-res.json
index 6705b9cad2..fb3cd80392 100644
--- a/tests/testdata/hendrycksTest-college_biology-v0-res.json
+++ b/tests/testdata/hendrycksTest-college_biology-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-college_biology": {"acc": 0.24305555555555555, "acc_norm": 0.2361111111111111, "acc_norm_stderr": 0.03551446610810826, "acc_stderr": 0.03586879280080341}}, "versions": {"hendrycksTest-college_biology": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-college_biology": {"acc": 0.24305555555555555, "acc_norm": 0.2361111111111111, "acc_norm_stderr": 0.03551446610810826, "acc_stderr": 0.03586879280080341}}, "versions": {"hendrycksTest-college_biology": 0}}
diff --git a/tests/testdata/hendrycksTest-college_chemistry-v0-loglikelihood b/tests/testdata/hendrycksTest-college_chemistry-v0-loglikelihood
index 52a255e82a..c35a97a4dc 100644
--- a/tests/testdata/hendrycksTest-college_chemistry-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-college_chemistry-v0-loglikelihood
@@ -1 +1 @@
-044752b21540db95118b8cbe7e75c4c9b8758e27df56543deaeadec7f749a28d
\ No newline at end of file
+044752b21540db95118b8cbe7e75c4c9b8758e27df56543deaeadec7f749a28d
diff --git a/tests/testdata/hendrycksTest-college_chemistry-v0-res.json b/tests/testdata/hendrycksTest-college_chemistry-v0-res.json
index 4dc95a151a..91bbb16f1a 100644
--- a/tests/testdata/hendrycksTest-college_chemistry-v0-res.json
+++ b/tests/testdata/hendrycksTest-college_chemistry-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-college_chemistry": {"acc": 0.28, "acc_norm": 0.26, "acc_norm_stderr": 0.04408440022768078, "acc_stderr": 0.04512608598542127}}, "versions": {"hendrycksTest-college_chemistry": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-college_chemistry": {"acc": 0.28, "acc_norm": 0.26, "acc_norm_stderr": 0.04408440022768078, "acc_stderr": 0.04512608598542127}}, "versions": {"hendrycksTest-college_chemistry": 0}}
diff --git a/tests/testdata/hendrycksTest-college_computer_science-v0-loglikelihood b/tests/testdata/hendrycksTest-college_computer_science-v0-loglikelihood
index 695bc8c315..c9076ffdd8 100644
--- a/tests/testdata/hendrycksTest-college_computer_science-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-college_computer_science-v0-loglikelihood
@@ -1 +1 @@
-4ea26ad780290429ac5a3317559c154848d662bd40532c966458ba6f2a32d0a3
\ No newline at end of file
+4ea26ad780290429ac5a3317559c154848d662bd40532c966458ba6f2a32d0a3
diff --git a/tests/testdata/hendrycksTest-college_computer_science-v0-res.json b/tests/testdata/hendrycksTest-college_computer_science-v0-res.json
index aea595c09f..82ba6d8d7f 100644
--- a/tests/testdata/hendrycksTest-college_computer_science-v0-res.json
+++ b/tests/testdata/hendrycksTest-college_computer_science-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-college_computer_science": {"acc": 0.22, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909282, "acc_stderr": 0.041633319989322695}}, "versions": {"hendrycksTest-college_computer_science": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-college_computer_science": {"acc": 0.22, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909282, "acc_stderr": 0.041633319989322695}}, "versions": {"hendrycksTest-college_computer_science": 0}}
diff --git a/tests/testdata/hendrycksTest-college_mathematics-v0-loglikelihood b/tests/testdata/hendrycksTest-college_mathematics-v0-loglikelihood
index a840b6b642..69d6270c27 100644
--- a/tests/testdata/hendrycksTest-college_mathematics-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-college_mathematics-v0-loglikelihood
@@ -1 +1 @@
-e9fe80752686527281f834d2397875b4580581434b94799f9de6aaa450bd73ff
\ No newline at end of file
+e9fe80752686527281f834d2397875b4580581434b94799f9de6aaa450bd73ff
diff --git a/tests/testdata/hendrycksTest-college_mathematics-v0-res.json b/tests/testdata/hendrycksTest-college_mathematics-v0-res.json
index 766b3388ed..d3b2fce1ed 100644
--- a/tests/testdata/hendrycksTest-college_mathematics-v0-res.json
+++ b/tests/testdata/hendrycksTest-college_mathematics-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-college_mathematics": {"acc": 0.18, "acc_norm": 0.2, "acc_norm_stderr": 0.04020151261036844, "acc_stderr": 0.038612291966536955}}, "versions": {"hendrycksTest-college_mathematics": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-college_mathematics": {"acc": 0.18, "acc_norm": 0.2, "acc_norm_stderr": 0.04020151261036844, "acc_stderr": 0.038612291966536955}}, "versions": {"hendrycksTest-college_mathematics": 0}}
diff --git a/tests/testdata/hendrycksTest-college_medicine-v0-loglikelihood b/tests/testdata/hendrycksTest-college_medicine-v0-loglikelihood
index 2fb96497d1..8ce2673f48 100644
--- a/tests/testdata/hendrycksTest-college_medicine-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-college_medicine-v0-loglikelihood
@@ -1 +1 @@
-dd6e0a9be1407890e9f8cd4434fb6aa4752ab3d2473837fd465ad99f60ad685e
\ No newline at end of file
+dd6e0a9be1407890e9f8cd4434fb6aa4752ab3d2473837fd465ad99f60ad685e
diff --git a/tests/testdata/hendrycksTest-college_medicine-v0-res.json b/tests/testdata/hendrycksTest-college_medicine-v0-res.json
index 524552c9bb..2045d09d11 100644
--- a/tests/testdata/hendrycksTest-college_medicine-v0-res.json
+++ b/tests/testdata/hendrycksTest-college_medicine-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-college_medicine": {"acc": 0.27167630057803466, "acc_norm": 0.2543352601156069, "acc_norm_stderr": 0.0332055644308557, "acc_stderr": 0.03391750322321659}}, "versions": {"hendrycksTest-college_medicine": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-college_medicine": {"acc": 0.27167630057803466, "acc_norm": 0.2543352601156069, "acc_norm_stderr": 0.0332055644308557, "acc_stderr": 0.03391750322321659}}, "versions": {"hendrycksTest-college_medicine": 0}}
diff --git a/tests/testdata/hendrycksTest-college_physics-v0-loglikelihood b/tests/testdata/hendrycksTest-college_physics-v0-loglikelihood
index 7c2e2f4bf7..176fd28e6b 100644
--- a/tests/testdata/hendrycksTest-college_physics-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-college_physics-v0-loglikelihood
@@ -1 +1 @@
-704a7671ef981fb95594782bc446dd632e87ebdbe89436a0603b714fb5786c75
\ No newline at end of file
+704a7671ef981fb95594782bc446dd632e87ebdbe89436a0603b714fb5786c75
diff --git a/tests/testdata/hendrycksTest-college_physics-v0-res.json b/tests/testdata/hendrycksTest-college_physics-v0-res.json
index 97e56f2ae6..5b31788208 100644
--- a/tests/testdata/hendrycksTest-college_physics-v0-res.json
+++ b/tests/testdata/hendrycksTest-college_physics-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-college_physics": {"acc": 0.23529411764705882, "acc_norm": 0.23529411764705882, "acc_norm_stderr": 0.04220773659171453, "acc_stderr": 0.04220773659171452}}, "versions": {"hendrycksTest-college_physics": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-college_physics": {"acc": 0.23529411764705882, "acc_norm": 0.23529411764705882, "acc_norm_stderr": 0.04220773659171453, "acc_stderr": 0.04220773659171452}}, "versions": {"hendrycksTest-college_physics": 0}}
diff --git a/tests/testdata/hendrycksTest-computer_security-v0-loglikelihood b/tests/testdata/hendrycksTest-computer_security-v0-loglikelihood
index d4c0ee2d78..149f9afbfd 100644
--- a/tests/testdata/hendrycksTest-computer_security-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-computer_security-v0-loglikelihood
@@ -1 +1 @@
-a8a1892d1906cc3e7ffd321043f0a60f3b8b69ef76e5c6ff03c6ea41dc87d0cb
\ No newline at end of file
+a8a1892d1906cc3e7ffd321043f0a60f3b8b69ef76e5c6ff03c6ea41dc87d0cb
diff --git a/tests/testdata/hendrycksTest-computer_security-v0-res.json b/tests/testdata/hendrycksTest-computer_security-v0-res.json
index 60f02eba9c..e624212b1d 100644
--- a/tests/testdata/hendrycksTest-computer_security-v0-res.json
+++ b/tests/testdata/hendrycksTest-computer_security-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-computer_security": {"acc": 0.24, "acc_norm": 0.27, "acc_norm_stderr": 0.044619604333847394, "acc_stderr": 0.042923469599092816}}, "versions": {"hendrycksTest-computer_security": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-computer_security": {"acc": 0.24, "acc_norm": 0.27, "acc_norm_stderr": 0.044619604333847394, "acc_stderr": 0.042923469599092816}}, "versions": {"hendrycksTest-computer_security": 0}}
diff --git a/tests/testdata/hendrycksTest-conceptual_physics-v0-loglikelihood b/tests/testdata/hendrycksTest-conceptual_physics-v0-loglikelihood
index 05c4db0e22..bffd2ada62 100644
--- a/tests/testdata/hendrycksTest-conceptual_physics-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-conceptual_physics-v0-loglikelihood
@@ -1 +1 @@
-622f191ccfc7a597d99f39897ebe3f95a9ddce0e662fcfb411aa554b289bb355
\ No newline at end of file
+622f191ccfc7a597d99f39897ebe3f95a9ddce0e662fcfb411aa554b289bb355
diff --git a/tests/testdata/hendrycksTest-conceptual_physics-v0-res.json b/tests/testdata/hendrycksTest-conceptual_physics-v0-res.json
index 1388bcdcd9..d60c387ab7 100644
--- a/tests/testdata/hendrycksTest-conceptual_physics-v0-res.json
+++ b/tests/testdata/hendrycksTest-conceptual_physics-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-conceptual_physics": {"acc": 0.2680851063829787, "acc_norm": 0.2553191489361702, "acc_norm_stderr": 0.028504856470514185, "acc_stderr": 0.028957342788342347}}, "versions": {"hendrycksTest-conceptual_physics": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-conceptual_physics": {"acc": 0.2680851063829787, "acc_norm": 0.2553191489361702, "acc_norm_stderr": 0.028504856470514185, "acc_stderr": 0.028957342788342347}}, "versions": {"hendrycksTest-conceptual_physics": 0}}
diff --git a/tests/testdata/hendrycksTest-econometrics-v0-loglikelihood b/tests/testdata/hendrycksTest-econometrics-v0-loglikelihood
index ed3332edda..80b70a8300 100644
--- a/tests/testdata/hendrycksTest-econometrics-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-econometrics-v0-loglikelihood
@@ -1 +1 @@
-cde76ba2c7382b4876e17136c94f52aca2774e50342ab757b2a2d18da370dcb6
\ No newline at end of file
+cde76ba2c7382b4876e17136c94f52aca2774e50342ab757b2a2d18da370dcb6
diff --git a/tests/testdata/hendrycksTest-econometrics-v0-res.json b/tests/testdata/hendrycksTest-econometrics-v0-res.json
index 4656fac3c3..d56eb5a560 100644
--- a/tests/testdata/hendrycksTest-econometrics-v0-res.json
+++ b/tests/testdata/hendrycksTest-econometrics-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-econometrics": {"acc": 0.24561403508771928, "acc_norm": 0.24561403508771928, "acc_norm_stderr": 0.04049339297748142, "acc_stderr": 0.040493392977481425}}, "versions": {"hendrycksTest-econometrics": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-econometrics": {"acc": 0.24561403508771928, "acc_norm": 0.24561403508771928, "acc_norm_stderr": 0.04049339297748142, "acc_stderr": 0.040493392977481425}}, "versions": {"hendrycksTest-econometrics": 0}}
diff --git a/tests/testdata/hendrycksTest-electrical_engineering-v0-loglikelihood b/tests/testdata/hendrycksTest-electrical_engineering-v0-loglikelihood
index 9c9e72efdf..7311e57c5a 100644
--- a/tests/testdata/hendrycksTest-electrical_engineering-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-electrical_engineering-v0-loglikelihood
@@ -1 +1 @@
-b9b5d8b8bb02696302ec6bc2a99bf987a5504d3bae0e529d2c8f263538c97518
\ No newline at end of file
+b9b5d8b8bb02696302ec6bc2a99bf987a5504d3bae0e529d2c8f263538c97518
diff --git a/tests/testdata/hendrycksTest-electrical_engineering-v0-res.json b/tests/testdata/hendrycksTest-electrical_engineering-v0-res.json
index 13b76c1d5f..2dacd09ebb 100644
--- a/tests/testdata/hendrycksTest-electrical_engineering-v0-res.json
+++ b/tests/testdata/hendrycksTest-electrical_engineering-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-electrical_engineering": {"acc": 0.2689655172413793, "acc_norm": 0.2827586206896552, "acc_norm_stderr": 0.037528339580033376, "acc_stderr": 0.036951833116502325}}, "versions": {"hendrycksTest-electrical_engineering": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-electrical_engineering": {"acc": 0.2689655172413793, "acc_norm": 0.2827586206896552, "acc_norm_stderr": 0.037528339580033376, "acc_stderr": 0.036951833116502325}}, "versions": {"hendrycksTest-electrical_engineering": 0}}
diff --git a/tests/testdata/hendrycksTest-elementary_mathematics-v0-loglikelihood b/tests/testdata/hendrycksTest-elementary_mathematics-v0-loglikelihood
index e281f72feb..cac4a7eaa1 100644
--- a/tests/testdata/hendrycksTest-elementary_mathematics-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-elementary_mathematics-v0-loglikelihood
@@ -1 +1 @@
-6b21f5cd5606268421a667152ec989424b66905c02adbab8d4ff6bb9d21b77d1
\ No newline at end of file
+6b21f5cd5606268421a667152ec989424b66905c02adbab8d4ff6bb9d21b77d1
diff --git a/tests/testdata/hendrycksTest-elementary_mathematics-v0-res.json b/tests/testdata/hendrycksTest-elementary_mathematics-v0-res.json
index 84cd983ee9..19947fd21e 100644
--- a/tests/testdata/hendrycksTest-elementary_mathematics-v0-res.json
+++ b/tests/testdata/hendrycksTest-elementary_mathematics-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-elementary_mathematics": {"acc": 0.2724867724867725, "acc_norm": 0.2830687830687831, "acc_norm_stderr": 0.023201392938194978, "acc_stderr": 0.022930973071633345}}, "versions": {"hendrycksTest-elementary_mathematics": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-elementary_mathematics": {"acc": 0.2724867724867725, "acc_norm": 0.2830687830687831, "acc_norm_stderr": 0.023201392938194978, "acc_stderr": 0.022930973071633345}}, "versions": {"hendrycksTest-elementary_mathematics": 0}}
diff --git a/tests/testdata/hendrycksTest-formal_logic-v0-loglikelihood b/tests/testdata/hendrycksTest-formal_logic-v0-loglikelihood
index ef6bec3f70..8ac034a4e5 100644
--- a/tests/testdata/hendrycksTest-formal_logic-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-formal_logic-v0-loglikelihood
@@ -1 +1 @@
-c0d0f0c008a5f3faf2f6f4268d87bbc09c40bb66ae08cf38eea0bf2e519c5a59
\ No newline at end of file
+c0d0f0c008a5f3faf2f6f4268d87bbc09c40bb66ae08cf38eea0bf2e519c5a59
diff --git a/tests/testdata/hendrycksTest-formal_logic-v0-res.json b/tests/testdata/hendrycksTest-formal_logic-v0-res.json
index acde01d4d7..3ee6766b7f 100644
--- a/tests/testdata/hendrycksTest-formal_logic-v0-res.json
+++ b/tests/testdata/hendrycksTest-formal_logic-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-formal_logic": {"acc": 0.25396825396825395, "acc_norm": 0.2698412698412698, "acc_norm_stderr": 0.03970158273235172, "acc_stderr": 0.03893259610604674}}, "versions": {"hendrycksTest-formal_logic": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-formal_logic": {"acc": 0.25396825396825395, "acc_norm": 0.2698412698412698, "acc_norm_stderr": 0.03970158273235172, "acc_stderr": 0.03893259610604674}}, "versions": {"hendrycksTest-formal_logic": 0}}
diff --git a/tests/testdata/hendrycksTest-global_facts-v0-loglikelihood b/tests/testdata/hendrycksTest-global_facts-v0-loglikelihood
index a4751fdbfa..8c92f96a48 100644
--- a/tests/testdata/hendrycksTest-global_facts-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-global_facts-v0-loglikelihood
@@ -1 +1 @@
-9fdc85240b8170839278b1e883ee0868611d84dce202cb8aa037c841ec76d089
\ No newline at end of file
+9fdc85240b8170839278b1e883ee0868611d84dce202cb8aa037c841ec76d089
diff --git a/tests/testdata/hendrycksTest-global_facts-v0-res.json b/tests/testdata/hendrycksTest-global_facts-v0-res.json
index d2fff47bcb..94a30256e1 100644
--- a/tests/testdata/hendrycksTest-global_facts-v0-res.json
+++ b/tests/testdata/hendrycksTest-global_facts-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-global_facts": {"acc": 0.23, "acc_norm": 0.23, "acc_norm_stderr": 0.04229525846816507, "acc_stderr": 0.04229525846816507}}, "versions": {"hendrycksTest-global_facts": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-global_facts": {"acc": 0.23, "acc_norm": 0.23, "acc_norm_stderr": 0.04229525846816507, "acc_stderr": 0.04229525846816507}}, "versions": {"hendrycksTest-global_facts": 0}}
diff --git a/tests/testdata/hendrycksTest-high_school_biology-v0-loglikelihood b/tests/testdata/hendrycksTest-high_school_biology-v0-loglikelihood
index 1e2c01e2b1..d38bb991d9 100644
--- a/tests/testdata/hendrycksTest-high_school_biology-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-high_school_biology-v0-loglikelihood
@@ -1 +1 @@
-d4dc051f37a49dc75c218741e87bc826fd44f31ee1309b55e0f33bd191c1bc78
\ No newline at end of file
+d4dc051f37a49dc75c218741e87bc826fd44f31ee1309b55e0f33bd191c1bc78
diff --git a/tests/testdata/hendrycksTest-high_school_biology-v0-res.json b/tests/testdata/hendrycksTest-high_school_biology-v0-res.json
index a666d9ce9c..c581516ebb 100644
--- a/tests/testdata/hendrycksTest-high_school_biology-v0-res.json
+++ b/tests/testdata/hendrycksTest-high_school_biology-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-high_school_biology": {"acc": 0.23870967741935484, "acc_norm": 0.2709677419354839, "acc_norm_stderr": 0.025284416114900152, "acc_stderr": 0.024251071262208834}}, "versions": {"hendrycksTest-high_school_biology": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-high_school_biology": {"acc": 0.23870967741935484, "acc_norm": 0.2709677419354839, "acc_norm_stderr": 0.025284416114900152, "acc_stderr": 0.024251071262208834}}, "versions": {"hendrycksTest-high_school_biology": 0}}
diff --git a/tests/testdata/hendrycksTest-high_school_chemistry-v0-loglikelihood b/tests/testdata/hendrycksTest-high_school_chemistry-v0-loglikelihood
index d0ca97d6a5..a519bee854 100644
--- a/tests/testdata/hendrycksTest-high_school_chemistry-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-high_school_chemistry-v0-loglikelihood
@@ -1 +1 @@
-f4f338e45415c4b5ee7f1d249155bcd910c8401bd1436760a5ec61cb6bb211b6
\ No newline at end of file
+f4f338e45415c4b5ee7f1d249155bcd910c8401bd1436760a5ec61cb6bb211b6
diff --git a/tests/testdata/hendrycksTest-high_school_chemistry-v0-res.json b/tests/testdata/hendrycksTest-high_school_chemistry-v0-res.json
index 2d81594963..00a28f1755 100644
--- a/tests/testdata/hendrycksTest-high_school_chemistry-v0-res.json
+++ b/tests/testdata/hendrycksTest-high_school_chemistry-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-high_school_chemistry": {"acc": 0.2857142857142857, "acc_norm": 0.2660098522167488, "acc_norm_stderr": 0.031089826002937523, "acc_stderr": 0.031785297106427496}}, "versions": {"hendrycksTest-high_school_chemistry": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-high_school_chemistry": {"acc": 0.2857142857142857, "acc_norm": 0.2660098522167488, "acc_norm_stderr": 0.031089826002937523, "acc_stderr": 0.031785297106427496}}, "versions": {"hendrycksTest-high_school_chemistry": 0}}
diff --git a/tests/testdata/hendrycksTest-high_school_computer_science-v0-loglikelihood b/tests/testdata/hendrycksTest-high_school_computer_science-v0-loglikelihood
index a421564657..8d175c549b 100644
--- a/tests/testdata/hendrycksTest-high_school_computer_science-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-high_school_computer_science-v0-loglikelihood
@@ -1 +1 @@
-870d5a6300c527077aaf6baa3e750e75fa840b41657cf82549f39b768b14862d
\ No newline at end of file
+870d5a6300c527077aaf6baa3e750e75fa840b41657cf82549f39b768b14862d
diff --git a/tests/testdata/hendrycksTest-high_school_computer_science-v0-res.json b/tests/testdata/hendrycksTest-high_school_computer_science-v0-res.json
index bbc2dacf5f..2a27c1641d 100644
--- a/tests/testdata/hendrycksTest-high_school_computer_science-v0-res.json
+++ b/tests/testdata/hendrycksTest-high_school_computer_science-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-high_school_computer_science": {"acc": 0.2, "acc_norm": 0.22, "acc_norm_stderr": 0.04163331998932269, "acc_stderr": 0.04020151261036845}}, "versions": {"hendrycksTest-high_school_computer_science": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-high_school_computer_science": {"acc": 0.2, "acc_norm": 0.22, "acc_norm_stderr": 0.04163331998932269, "acc_stderr": 0.04020151261036845}}, "versions": {"hendrycksTest-high_school_computer_science": 0}}
diff --git a/tests/testdata/hendrycksTest-high_school_european_history-v0-loglikelihood b/tests/testdata/hendrycksTest-high_school_european_history-v0-loglikelihood
index eec5858ef9..c900c590d6 100644
--- a/tests/testdata/hendrycksTest-high_school_european_history-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-high_school_european_history-v0-loglikelihood
@@ -1 +1 @@
-d8070e113be9d420fef5578cb69c70df4ea5118f9b18553023fd9efd5ff0b7f4
\ No newline at end of file
+d8070e113be9d420fef5578cb69c70df4ea5118f9b18553023fd9efd5ff0b7f4
diff --git a/tests/testdata/hendrycksTest-high_school_european_history-v0-res.json b/tests/testdata/hendrycksTest-high_school_european_history-v0-res.json
index b5cea9cbe3..0dcb282bcd 100644
--- a/tests/testdata/hendrycksTest-high_school_european_history-v0-res.json
+++ b/tests/testdata/hendrycksTest-high_school_european_history-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-high_school_european_history": {"acc": 0.23636363636363636, "acc_norm": 0.24242424242424243, "acc_norm_stderr": 0.03346409881055953, "acc_stderr": 0.033175059300091805}}, "versions": {"hendrycksTest-high_school_european_history": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-high_school_european_history": {"acc": 0.23636363636363636, "acc_norm": 0.24242424242424243, "acc_norm_stderr": 0.03346409881055953, "acc_stderr": 0.033175059300091805}}, "versions": {"hendrycksTest-high_school_european_history": 0}}
diff --git a/tests/testdata/hendrycksTest-high_school_geography-v0-loglikelihood b/tests/testdata/hendrycksTest-high_school_geography-v0-loglikelihood
index ac80d17880..5d28be4d50 100644
--- a/tests/testdata/hendrycksTest-high_school_geography-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-high_school_geography-v0-loglikelihood
@@ -1 +1 @@
-add45970ea3865be7c7a31f788a835949f6937ac73f699b122ca56a3431e95f8
\ No newline at end of file
+add45970ea3865be7c7a31f788a835949f6937ac73f699b122ca56a3431e95f8
diff --git a/tests/testdata/hendrycksTest-high_school_geography-v0-res.json b/tests/testdata/hendrycksTest-high_school_geography-v0-res.json
index 0fb76aa9ba..2c18a78811 100644
--- a/tests/testdata/hendrycksTest-high_school_geography-v0-res.json
+++ b/tests/testdata/hendrycksTest-high_school_geography-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-high_school_geography": {"acc": 0.2474747474747475, "acc_norm": 0.2777777777777778, "acc_norm_stderr": 0.03191178226713547, "acc_stderr": 0.03074630074212452}}, "versions": {"hendrycksTest-high_school_geography": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-high_school_geography": {"acc": 0.2474747474747475, "acc_norm": 0.2777777777777778, "acc_norm_stderr": 0.03191178226713547, "acc_stderr": 0.03074630074212452}}, "versions": {"hendrycksTest-high_school_geography": 0}}
diff --git a/tests/testdata/hendrycksTest-high_school_government_and_politics-v0-loglikelihood b/tests/testdata/hendrycksTest-high_school_government_and_politics-v0-loglikelihood
index 12ea726b4b..462d8186bd 100644
--- a/tests/testdata/hendrycksTest-high_school_government_and_politics-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-high_school_government_and_politics-v0-loglikelihood
@@ -1 +1 @@
-11f40d8f48ba5cd739e21d54c3c04d3761f81df5cb7ddd77df868d24ced44b49
\ No newline at end of file
+11f40d8f48ba5cd739e21d54c3c04d3761f81df5cb7ddd77df868d24ced44b49
diff --git a/tests/testdata/hendrycksTest-high_school_government_and_politics-v0-res.json b/tests/testdata/hendrycksTest-high_school_government_and_politics-v0-res.json
index 16cc02ff0a..4cf14d721d 100644
--- a/tests/testdata/hendrycksTest-high_school_government_and_politics-v0-res.json
+++ b/tests/testdata/hendrycksTest-high_school_government_and_politics-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-high_school_government_and_politics": {"acc": 0.24352331606217617, "acc_norm": 0.23834196891191708, "acc_norm_stderr": 0.03074890536390988, "acc_stderr": 0.030975436386845436}}, "versions": {"hendrycksTest-high_school_government_and_politics": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-high_school_government_and_politics": {"acc": 0.24352331606217617, "acc_norm": 0.23834196891191708, "acc_norm_stderr": 0.03074890536390988, "acc_stderr": 0.030975436386845436}}, "versions": {"hendrycksTest-high_school_government_and_politics": 0}}
diff --git a/tests/testdata/hendrycksTest-high_school_macroeconomics-v0-loglikelihood b/tests/testdata/hendrycksTest-high_school_macroeconomics-v0-loglikelihood
index c0106d373d..ef61269b60 100644
--- a/tests/testdata/hendrycksTest-high_school_macroeconomics-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-high_school_macroeconomics-v0-loglikelihood
@@ -1 +1 @@
-ce4faae2fb6628caa48f6fc74cbc848880db49e6ff51079392778a2322bcefef
\ No newline at end of file
+ce4faae2fb6628caa48f6fc74cbc848880db49e6ff51079392778a2322bcefef
diff --git a/tests/testdata/hendrycksTest-high_school_macroeconomics-v0-res.json b/tests/testdata/hendrycksTest-high_school_macroeconomics-v0-res.json
index fb6835039c..9d55ab39a1 100644
--- a/tests/testdata/hendrycksTest-high_school_macroeconomics-v0-res.json
+++ b/tests/testdata/hendrycksTest-high_school_macroeconomics-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-high_school_macroeconomics": {"acc": 0.2230769230769231, "acc_norm": 0.22564102564102564, "acc_norm_stderr": 0.021193632525148522, "acc_stderr": 0.021107730127244}}, "versions": {"hendrycksTest-high_school_macroeconomics": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-high_school_macroeconomics": {"acc": 0.2230769230769231, "acc_norm": 0.22564102564102564, "acc_norm_stderr": 0.021193632525148522, "acc_stderr": 0.021107730127244}}, "versions": {"hendrycksTest-high_school_macroeconomics": 0}}
diff --git a/tests/testdata/hendrycksTest-high_school_mathematics-v0-loglikelihood b/tests/testdata/hendrycksTest-high_school_mathematics-v0-loglikelihood
index dc86769fa9..085c71f415 100644
--- a/tests/testdata/hendrycksTest-high_school_mathematics-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-high_school_mathematics-v0-loglikelihood
@@ -1 +1 @@
-ab368d16fc4648ad27940f71abd266366663f51db612f732a0b9b0eea28de9f8
\ No newline at end of file
+ab368d16fc4648ad27940f71abd266366663f51db612f732a0b9b0eea28de9f8
diff --git a/tests/testdata/hendrycksTest-high_school_mathematics-v0-res.json b/tests/testdata/hendrycksTest-high_school_mathematics-v0-res.json
index cb3a3ec068..711db35dd1 100644
--- a/tests/testdata/hendrycksTest-high_school_mathematics-v0-res.json
+++ b/tests/testdata/hendrycksTest-high_school_mathematics-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-high_school_mathematics": {"acc": 0.22592592592592592, "acc_norm": 0.24814814814814815, "acc_norm_stderr": 0.0263357394040558, "acc_stderr": 0.025497532639609553}}, "versions": {"hendrycksTest-high_school_mathematics": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-high_school_mathematics": {"acc": 0.22592592592592592, "acc_norm": 0.24814814814814815, "acc_norm_stderr": 0.0263357394040558, "acc_stderr": 0.025497532639609553}}, "versions": {"hendrycksTest-high_school_mathematics": 0}}
diff --git a/tests/testdata/hendrycksTest-high_school_microeconomics-v0-loglikelihood b/tests/testdata/hendrycksTest-high_school_microeconomics-v0-loglikelihood
index 37962bf9fb..5a806df0b3 100644
--- a/tests/testdata/hendrycksTest-high_school_microeconomics-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-high_school_microeconomics-v0-loglikelihood
@@ -1 +1 @@
-513b998585ebc1ebdefca6435b7c84fd73dc36fc80321a22503467f04efed23e
\ No newline at end of file
+513b998585ebc1ebdefca6435b7c84fd73dc36fc80321a22503467f04efed23e
diff --git a/tests/testdata/hendrycksTest-high_school_microeconomics-v0-res.json b/tests/testdata/hendrycksTest-high_school_microeconomics-v0-res.json
index cf698d181c..f5bd4ff6d0 100644
--- a/tests/testdata/hendrycksTest-high_school_microeconomics-v0-res.json
+++ b/tests/testdata/hendrycksTest-high_school_microeconomics-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-high_school_microeconomics": {"acc": 0.24369747899159663, "acc_norm": 0.22268907563025211, "acc_norm_stderr": 0.027025433498882378, "acc_stderr": 0.027886828078380558}}, "versions": {"hendrycksTest-high_school_microeconomics": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-high_school_microeconomics": {"acc": 0.24369747899159663, "acc_norm": 0.22268907563025211, "acc_norm_stderr": 0.027025433498882378, "acc_stderr": 0.027886828078380558}}, "versions": {"hendrycksTest-high_school_microeconomics": 0}}
diff --git a/tests/testdata/hendrycksTest-high_school_physics-v0-loglikelihood b/tests/testdata/hendrycksTest-high_school_physics-v0-loglikelihood
index 49a780bc97..7a2e1602f4 100644
--- a/tests/testdata/hendrycksTest-high_school_physics-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-high_school_physics-v0-loglikelihood
@@ -1 +1 @@
-dae59e82d3d4d8dec82239d9620b72cc47bb6efbe2f1c2f9b9d23e849c9c5e32
\ No newline at end of file
+dae59e82d3d4d8dec82239d9620b72cc47bb6efbe2f1c2f9b9d23e849c9c5e32
diff --git a/tests/testdata/hendrycksTest-high_school_physics-v0-res.json b/tests/testdata/hendrycksTest-high_school_physics-v0-res.json
index b6b3bb9d01..3b49922213 100644
--- a/tests/testdata/hendrycksTest-high_school_physics-v0-res.json
+++ b/tests/testdata/hendrycksTest-high_school_physics-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-high_school_physics": {"acc": 0.2582781456953642, "acc_norm": 0.271523178807947, "acc_norm_stderr": 0.03631329803969653, "acc_stderr": 0.035737053147634576}}, "versions": {"hendrycksTest-high_school_physics": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-high_school_physics": {"acc": 0.2582781456953642, "acc_norm": 0.271523178807947, "acc_norm_stderr": 0.03631329803969653, "acc_stderr": 0.035737053147634576}}, "versions": {"hendrycksTest-high_school_physics": 0}}
diff --git a/tests/testdata/hendrycksTest-high_school_psychology-v0-loglikelihood b/tests/testdata/hendrycksTest-high_school_psychology-v0-loglikelihood
index 0f39ddfde7..cb2c6e48b5 100644
--- a/tests/testdata/hendrycksTest-high_school_psychology-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-high_school_psychology-v0-loglikelihood
@@ -1 +1 @@
-0e4c8d13806d3696167e40544d2d114c557c10c74bc61fcb9c51bbfced0266ef
\ No newline at end of file
+0e4c8d13806d3696167e40544d2d114c557c10c74bc61fcb9c51bbfced0266ef
diff --git a/tests/testdata/hendrycksTest-high_school_psychology-v0-res.json b/tests/testdata/hendrycksTest-high_school_psychology-v0-res.json
index 42b781149b..a4fadea7ea 100644
--- a/tests/testdata/hendrycksTest-high_school_psychology-v0-res.json
+++ b/tests/testdata/hendrycksTest-high_school_psychology-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-high_school_psychology": {"acc": 0.24587155963302754, "acc_norm": 0.23302752293577983, "acc_norm_stderr": 0.018125669180861493, "acc_stderr": 0.018461940968708436}}, "versions": {"hendrycksTest-high_school_psychology": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-high_school_psychology": {"acc": 0.24587155963302754, "acc_norm": 0.23302752293577983, "acc_norm_stderr": 0.018125669180861493, "acc_stderr": 0.018461940968708436}}, "versions": {"hendrycksTest-high_school_psychology": 0}}
diff --git a/tests/testdata/hendrycksTest-high_school_statistics-v0-loglikelihood b/tests/testdata/hendrycksTest-high_school_statistics-v0-loglikelihood
index 8a915ef7fc..7b0eb829b0 100644
--- a/tests/testdata/hendrycksTest-high_school_statistics-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-high_school_statistics-v0-loglikelihood
@@ -1 +1 @@
-33d1d6eaaa2c3a944bf49d3f220a4efc328d7c3b3465b7cec40ae36d8984b75f
\ No newline at end of file
+33d1d6eaaa2c3a944bf49d3f220a4efc328d7c3b3465b7cec40ae36d8984b75f
diff --git a/tests/testdata/hendrycksTest-high_school_statistics-v0-res.json b/tests/testdata/hendrycksTest-high_school_statistics-v0-res.json
index 4c6a21d7da..77ca941749 100644
--- a/tests/testdata/hendrycksTest-high_school_statistics-v0-res.json
+++ b/tests/testdata/hendrycksTest-high_school_statistics-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-high_school_statistics": {"acc": 0.2962962962962963, "acc_norm": 0.3055555555555556, "acc_norm_stderr": 0.03141554629402544, "acc_stderr": 0.03114144782353604}}, "versions": {"hendrycksTest-high_school_statistics": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-high_school_statistics": {"acc": 0.2962962962962963, "acc_norm": 0.3055555555555556, "acc_norm_stderr": 0.03141554629402544, "acc_stderr": 0.03114144782353604}}, "versions": {"hendrycksTest-high_school_statistics": 0}}
diff --git a/tests/testdata/hendrycksTest-high_school_us_history-v0-loglikelihood b/tests/testdata/hendrycksTest-high_school_us_history-v0-loglikelihood
index e05b91503e..386bedb860 100644
--- a/tests/testdata/hendrycksTest-high_school_us_history-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-high_school_us_history-v0-loglikelihood
@@ -1 +1 @@
-8c65c1a28330dd001d395ac11f1bb80c3b33f5935f503e74067aef6e9e1d9d9b
\ No newline at end of file
+8c65c1a28330dd001d395ac11f1bb80c3b33f5935f503e74067aef6e9e1d9d9b
diff --git a/tests/testdata/hendrycksTest-high_school_us_history-v0-res.json b/tests/testdata/hendrycksTest-high_school_us_history-v0-res.json
index 5b7a76909c..f6460f4248 100644
--- a/tests/testdata/hendrycksTest-high_school_us_history-v0-res.json
+++ b/tests/testdata/hendrycksTest-high_school_us_history-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-high_school_us_history": {"acc": 0.29901960784313725, "acc_norm": 0.28431372549019607, "acc_norm_stderr": 0.03166009679399814, "acc_stderr": 0.03213325717373618}}, "versions": {"hendrycksTest-high_school_us_history": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-high_school_us_history": {"acc": 0.29901960784313725, "acc_norm": 0.28431372549019607, "acc_norm_stderr": 0.03166009679399814, "acc_stderr": 0.03213325717373618}}, "versions": {"hendrycksTest-high_school_us_history": 0}}
diff --git a/tests/testdata/hendrycksTest-high_school_world_history-v0-loglikelihood b/tests/testdata/hendrycksTest-high_school_world_history-v0-loglikelihood
index 228dfe072c..c938b0a287 100644
--- a/tests/testdata/hendrycksTest-high_school_world_history-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-high_school_world_history-v0-loglikelihood
@@ -1 +1 @@
-1c8b994bd9a63ec874fc8d0e3a27077118b7adc472306b2fd6c55635a78b9d52
\ No newline at end of file
+1c8b994bd9a63ec874fc8d0e3a27077118b7adc472306b2fd6c55635a78b9d52
diff --git a/tests/testdata/hendrycksTest-high_school_world_history-v0-res.json b/tests/testdata/hendrycksTest-high_school_world_history-v0-res.json
index ca1bf95b9d..30ea7361db 100644
--- a/tests/testdata/hendrycksTest-high_school_world_history-v0-res.json
+++ b/tests/testdata/hendrycksTest-high_school_world_history-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-high_school_world_history": {"acc": 0.23628691983122363, "acc_norm": 0.24472573839662448, "acc_norm_stderr": 0.02798569938703642, "acc_stderr": 0.027652153144159263}}, "versions": {"hendrycksTest-high_school_world_history": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-high_school_world_history": {"acc": 0.23628691983122363, "acc_norm": 0.24472573839662448, "acc_norm_stderr": 0.02798569938703642, "acc_stderr": 0.027652153144159263}}, "versions": {"hendrycksTest-high_school_world_history": 0}}
diff --git a/tests/testdata/hendrycksTest-human_aging-v0-loglikelihood b/tests/testdata/hendrycksTest-human_aging-v0-loglikelihood
index d34fa52980..9c6422494a 100644
--- a/tests/testdata/hendrycksTest-human_aging-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-human_aging-v0-loglikelihood
@@ -1 +1 @@
-0880b3a78f8d7b17ffc612031427b9085367cf65dabe2a68c4b64e3171d17e88
\ No newline at end of file
+0880b3a78f8d7b17ffc612031427b9085367cf65dabe2a68c4b64e3171d17e88
diff --git a/tests/testdata/hendrycksTest-human_aging-v0-res.json b/tests/testdata/hendrycksTest-human_aging-v0-res.json
index 061678f2e4..95d8742924 100644
--- a/tests/testdata/hendrycksTest-human_aging-v0-res.json
+++ b/tests/testdata/hendrycksTest-human_aging-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-human_aging": {"acc": 0.21524663677130046, "acc_norm": 0.17937219730941703, "acc_norm_stderr": 0.025749819569192804, "acc_stderr": 0.02758406660220827}}, "versions": {"hendrycksTest-human_aging": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-human_aging": {"acc": 0.21524663677130046, "acc_norm": 0.17937219730941703, "acc_norm_stderr": 0.025749819569192804, "acc_stderr": 0.02758406660220827}}, "versions": {"hendrycksTest-human_aging": 0}}
diff --git a/tests/testdata/hendrycksTest-human_sexuality-v0-loglikelihood b/tests/testdata/hendrycksTest-human_sexuality-v0-loglikelihood
index b3d3ae438c..7626c89962 100644
--- a/tests/testdata/hendrycksTest-human_sexuality-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-human_sexuality-v0-loglikelihood
@@ -1 +1 @@
-4b07922fa1d549b655c21440b13d869263ce7dd9771d8147c450f11c91d26c10
\ No newline at end of file
+4b07922fa1d549b655c21440b13d869263ce7dd9771d8147c450f11c91d26c10
diff --git a/tests/testdata/hendrycksTest-human_sexuality-v0-res.json b/tests/testdata/hendrycksTest-human_sexuality-v0-res.json
index 091d7352ce..960a6e3b52 100644
--- a/tests/testdata/hendrycksTest-human_sexuality-v0-res.json
+++ b/tests/testdata/hendrycksTest-human_sexuality-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-human_sexuality": {"acc": 0.22137404580152673, "acc_norm": 0.22900763358778625, "acc_norm_stderr": 0.036853466317118506, "acc_stderr": 0.0364129708131373}}, "versions": {"hendrycksTest-human_sexuality": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-human_sexuality": {"acc": 0.22137404580152673, "acc_norm": 0.22900763358778625, "acc_norm_stderr": 0.036853466317118506, "acc_stderr": 0.0364129708131373}}, "versions": {"hendrycksTest-human_sexuality": 0}}
diff --git a/tests/testdata/hendrycksTest-international_law-v0-loglikelihood b/tests/testdata/hendrycksTest-international_law-v0-loglikelihood
index 2b6aa8d605..23c6b58e9d 100644
--- a/tests/testdata/hendrycksTest-international_law-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-international_law-v0-loglikelihood
@@ -1 +1 @@
-ea9b2cefd27959db564168f6ad1169a5eaa012fc5a5d5b8faf9e34d94e335dc1
\ No newline at end of file
+ea9b2cefd27959db564168f6ad1169a5eaa012fc5a5d5b8faf9e34d94e335dc1
diff --git a/tests/testdata/hendrycksTest-international_law-v0-res.json b/tests/testdata/hendrycksTest-international_law-v0-res.json
index bd4edd2394..97c70d2f2e 100644
--- a/tests/testdata/hendrycksTest-international_law-v0-res.json
+++ b/tests/testdata/hendrycksTest-international_law-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-international_law": {"acc": 0.2396694214876033, "acc_norm": 0.3140495867768595, "acc_norm_stderr": 0.042369647530410164, "acc_stderr": 0.03896878985070417}}, "versions": {"hendrycksTest-international_law": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-international_law": {"acc": 0.2396694214876033, "acc_norm": 0.3140495867768595, "acc_norm_stderr": 0.042369647530410164, "acc_stderr": 0.03896878985070417}}, "versions": {"hendrycksTest-international_law": 0}}
diff --git a/tests/testdata/hendrycksTest-jurisprudence-v0-loglikelihood b/tests/testdata/hendrycksTest-jurisprudence-v0-loglikelihood
index 3d55d21e02..37aea2c8a2 100644
--- a/tests/testdata/hendrycksTest-jurisprudence-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-jurisprudence-v0-loglikelihood
@@ -1 +1 @@
-cac440189f1ec778e82f4975d88b74689553ecc5116aaa7f76587a50c1a610e0
\ No newline at end of file
+cac440189f1ec778e82f4975d88b74689553ecc5116aaa7f76587a50c1a610e0
diff --git a/tests/testdata/hendrycksTest-jurisprudence-v0-res.json b/tests/testdata/hendrycksTest-jurisprudence-v0-res.json
index 4ef1819749..66203b63bf 100644
--- a/tests/testdata/hendrycksTest-jurisprudence-v0-res.json
+++ b/tests/testdata/hendrycksTest-jurisprudence-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-jurisprudence": {"acc": 0.25, "acc_norm": 0.3148148148148148, "acc_norm_stderr": 0.04489931073591312, "acc_stderr": 0.04186091791394607}}, "versions": {"hendrycksTest-jurisprudence": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-jurisprudence": {"acc": 0.25, "acc_norm": 0.3148148148148148, "acc_norm_stderr": 0.04489931073591312, "acc_stderr": 0.04186091791394607}}, "versions": {"hendrycksTest-jurisprudence": 0}}
diff --git a/tests/testdata/hendrycksTest-logical_fallacies-v0-loglikelihood b/tests/testdata/hendrycksTest-logical_fallacies-v0-loglikelihood
index a5807b5831..56300c43a8 100644
--- a/tests/testdata/hendrycksTest-logical_fallacies-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-logical_fallacies-v0-loglikelihood
@@ -1 +1 @@
-2e9449dd803f9e2334dc562d9f04031fd013ed36b883b44ab500533a5dbbface
\ No newline at end of file
+2e9449dd803f9e2334dc562d9f04031fd013ed36b883b44ab500533a5dbbface
diff --git a/tests/testdata/hendrycksTest-logical_fallacies-v0-res.json b/tests/testdata/hendrycksTest-logical_fallacies-v0-res.json
index c5cf5cb467..e8ce4b58a8 100644
--- a/tests/testdata/hendrycksTest-logical_fallacies-v0-res.json
+++ b/tests/testdata/hendrycksTest-logical_fallacies-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-logical_fallacies": {"acc": 0.20245398773006135, "acc_norm": 0.2147239263803681, "acc_norm_stderr": 0.03226219377286774, "acc_stderr": 0.03157065078911902}}, "versions": {"hendrycksTest-logical_fallacies": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-logical_fallacies": {"acc": 0.20245398773006135, "acc_norm": 0.2147239263803681, "acc_norm_stderr": 0.03226219377286774, "acc_stderr": 0.03157065078911902}}, "versions": {"hendrycksTest-logical_fallacies": 0}}
diff --git a/tests/testdata/hendrycksTest-machine_learning-v0-loglikelihood b/tests/testdata/hendrycksTest-machine_learning-v0-loglikelihood
index 53e498ddd4..681794c7dc 100644
--- a/tests/testdata/hendrycksTest-machine_learning-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-machine_learning-v0-loglikelihood
@@ -1 +1 @@
-7a7138821a66ef946e427b40344cf7f1a916a2926995a85ef731a3bee40cb7ce
\ No newline at end of file
+7a7138821a66ef946e427b40344cf7f1a916a2926995a85ef731a3bee40cb7ce
diff --git a/tests/testdata/hendrycksTest-machine_learning-v0-res.json b/tests/testdata/hendrycksTest-machine_learning-v0-res.json
index 26be724f24..9138d9c40a 100644
--- a/tests/testdata/hendrycksTest-machine_learning-v0-res.json
+++ b/tests/testdata/hendrycksTest-machine_learning-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-machine_learning": {"acc": 0.23214285714285715, "acc_norm": 0.22321428571428573, "acc_norm_stderr": 0.039523019677025116, "acc_stderr": 0.04007341809755806}}, "versions": {"hendrycksTest-machine_learning": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-machine_learning": {"acc": 0.23214285714285715, "acc_norm": 0.22321428571428573, "acc_norm_stderr": 0.039523019677025116, "acc_stderr": 0.04007341809755806}}, "versions": {"hendrycksTest-machine_learning": 0}}
diff --git a/tests/testdata/hendrycksTest-management-v0-loglikelihood b/tests/testdata/hendrycksTest-management-v0-loglikelihood
index 5718739857..02b34a2f8b 100644
--- a/tests/testdata/hendrycksTest-management-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-management-v0-loglikelihood
@@ -1 +1 @@
-355489f4bd176ab84db5ef4c03d56ddeeeb1b0ad69827122b2d800e1cdc7e5f0
\ No newline at end of file
+355489f4bd176ab84db5ef4c03d56ddeeeb1b0ad69827122b2d800e1cdc7e5f0
diff --git a/tests/testdata/hendrycksTest-management-v0-res.json b/tests/testdata/hendrycksTest-management-v0-res.json
index 7a84623fab..7ddab6c17b 100644
--- a/tests/testdata/hendrycksTest-management-v0-res.json
+++ b/tests/testdata/hendrycksTest-management-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-management": {"acc": 0.24271844660194175, "acc_norm": 0.2621359223300971, "acc_norm_stderr": 0.043546310772605956, "acc_stderr": 0.04245022486384495}}, "versions": {"hendrycksTest-management": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-management": {"acc": 0.24271844660194175, "acc_norm": 0.2621359223300971, "acc_norm_stderr": 0.043546310772605956, "acc_stderr": 0.04245022486384495}}, "versions": {"hendrycksTest-management": 0}}
diff --git a/tests/testdata/hendrycksTest-marketing-v0-loglikelihood b/tests/testdata/hendrycksTest-marketing-v0-loglikelihood
index 1d241a9773..809d76d0ab 100644
--- a/tests/testdata/hendrycksTest-marketing-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-marketing-v0-loglikelihood
@@ -1 +1 @@
-b4fa0681fe54671a80509779d4338d744097a7206687f62977df7145dfa74a66
\ No newline at end of file
+b4fa0681fe54671a80509779d4338d744097a7206687f62977df7145dfa74a66
diff --git a/tests/testdata/hendrycksTest-marketing-v0-res.json b/tests/testdata/hendrycksTest-marketing-v0-res.json
index 2cc7a93f1c..3bd328bbad 100644
--- a/tests/testdata/hendrycksTest-marketing-v0-res.json
+++ b/tests/testdata/hendrycksTest-marketing-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-marketing": {"acc": 0.2863247863247863, "acc_norm": 0.2905982905982906, "acc_norm_stderr": 0.029745048572674043, "acc_stderr": 0.029614323690456648}}, "versions": {"hendrycksTest-marketing": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-marketing": {"acc": 0.2863247863247863, "acc_norm": 0.2905982905982906, "acc_norm_stderr": 0.029745048572674043, "acc_stderr": 0.029614323690456648}}, "versions": {"hendrycksTest-marketing": 0}}
diff --git a/tests/testdata/hendrycksTest-medical_genetics-v0-loglikelihood b/tests/testdata/hendrycksTest-medical_genetics-v0-loglikelihood
index 48d49de839..856f74e41c 100644
--- a/tests/testdata/hendrycksTest-medical_genetics-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-medical_genetics-v0-loglikelihood
@@ -1 +1 @@
-db6141246889a19dd3f6b9109f314d49c1a70f7a98795858804378b095c4a2fe
\ No newline at end of file
+db6141246889a19dd3f6b9109f314d49c1a70f7a98795858804378b095c4a2fe
diff --git a/tests/testdata/hendrycksTest-medical_genetics-v0-res.json b/tests/testdata/hendrycksTest-medical_genetics-v0-res.json
index eac53bcf4a..24258f7338 100644
--- a/tests/testdata/hendrycksTest-medical_genetics-v0-res.json
+++ b/tests/testdata/hendrycksTest-medical_genetics-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-medical_genetics": {"acc": 0.27, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720684, "acc_stderr": 0.0446196043338474}}, "versions": {"hendrycksTest-medical_genetics": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-medical_genetics": {"acc": 0.27, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720684, "acc_stderr": 0.0446196043338474}}, "versions": {"hendrycksTest-medical_genetics": 0}}
diff --git a/tests/testdata/hendrycksTest-miscellaneous-v0-loglikelihood b/tests/testdata/hendrycksTest-miscellaneous-v0-loglikelihood
index b09e99721b..2ec63e648a 100644
--- a/tests/testdata/hendrycksTest-miscellaneous-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-miscellaneous-v0-loglikelihood
@@ -1 +1 @@
-972dd88dbbaf09d14766e243cfc233425e7c01a26dbc61bdb9eeefa788822331
\ No newline at end of file
+972dd88dbbaf09d14766e243cfc233425e7c01a26dbc61bdb9eeefa788822331
diff --git a/tests/testdata/hendrycksTest-miscellaneous-v0-res.json b/tests/testdata/hendrycksTest-miscellaneous-v0-res.json
index 5c7859eb3a..8eac1f20ea 100644
--- a/tests/testdata/hendrycksTest-miscellaneous-v0-res.json
+++ b/tests/testdata/hendrycksTest-miscellaneous-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-miscellaneous": {"acc": 0.23499361430395913, "acc_norm": 0.2515964240102171, "acc_norm_stderr": 0.015517322365529622, "acc_stderr": 0.015162024152278445}}, "versions": {"hendrycksTest-miscellaneous": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-miscellaneous": {"acc": 0.23499361430395913, "acc_norm": 0.2515964240102171, "acc_norm_stderr": 0.015517322365529622, "acc_stderr": 0.015162024152278445}}, "versions": {"hendrycksTest-miscellaneous": 0}}
diff --git a/tests/testdata/hendrycksTest-moral_disputes-v0-loglikelihood b/tests/testdata/hendrycksTest-moral_disputes-v0-loglikelihood
index 953fc3be48..b267c94234 100644
--- a/tests/testdata/hendrycksTest-moral_disputes-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-moral_disputes-v0-loglikelihood
@@ -1 +1 @@
-d6ef028022c02b69d1516973e08bebaa14d8debcf2589a2bb124823178202d20
\ No newline at end of file
+d6ef028022c02b69d1516973e08bebaa14d8debcf2589a2bb124823178202d20
diff --git a/tests/testdata/hendrycksTest-moral_disputes-v0-res.json b/tests/testdata/hendrycksTest-moral_disputes-v0-res.json
index 26ea1c2a75..7e852dd1ae 100644
--- a/tests/testdata/hendrycksTest-moral_disputes-v0-res.json
+++ b/tests/testdata/hendrycksTest-moral_disputes-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-moral_disputes": {"acc": 0.24855491329479767, "acc_norm": 0.27167630057803466, "acc_norm_stderr": 0.023948512905468365, "acc_stderr": 0.023267528432100174}}, "versions": {"hendrycksTest-moral_disputes": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-moral_disputes": {"acc": 0.24855491329479767, "acc_norm": 0.27167630057803466, "acc_norm_stderr": 0.023948512905468365, "acc_stderr": 0.023267528432100174}}, "versions": {"hendrycksTest-moral_disputes": 0}}
diff --git a/tests/testdata/hendrycksTest-moral_scenarios-v0-loglikelihood b/tests/testdata/hendrycksTest-moral_scenarios-v0-loglikelihood
index d5ea0d8156..727957ef45 100644
--- a/tests/testdata/hendrycksTest-moral_scenarios-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-moral_scenarios-v0-loglikelihood
@@ -1 +1 @@
-a8e1882e77728b53c8b86312254d08320d8363fb606d746a8dd145b812f62cf5
\ No newline at end of file
+a8e1882e77728b53c8b86312254d08320d8363fb606d746a8dd145b812f62cf5
diff --git a/tests/testdata/hendrycksTest-moral_scenarios-v0-res.json b/tests/testdata/hendrycksTest-moral_scenarios-v0-res.json
index 62ec159712..b66e588bac 100644
--- a/tests/testdata/hendrycksTest-moral_scenarios-v0-res.json
+++ b/tests/testdata/hendrycksTest-moral_scenarios-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-moral_scenarios": {"acc": 0.2547486033519553, "acc_norm": 0.25251396648044694, "acc_norm_stderr": 0.014530330201468654, "acc_stderr": 0.014572650383409158}}, "versions": {"hendrycksTest-moral_scenarios": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-moral_scenarios": {"acc": 0.2547486033519553, "acc_norm": 0.25251396648044694, "acc_norm_stderr": 0.014530330201468654, "acc_stderr": 0.014572650383409158}}, "versions": {"hendrycksTest-moral_scenarios": 0}}
diff --git a/tests/testdata/hendrycksTest-nutrition-v0-loglikelihood b/tests/testdata/hendrycksTest-nutrition-v0-loglikelihood
index 2716bebe69..4993a6ab57 100644
--- a/tests/testdata/hendrycksTest-nutrition-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-nutrition-v0-loglikelihood
@@ -1 +1 @@
-19e49d218f55ed5ec4bd1a6cd3f3388c6f620b81484e7abe8b298e5481c3044d
\ No newline at end of file
+19e49d218f55ed5ec4bd1a6cd3f3388c6f620b81484e7abe8b298e5481c3044d
diff --git a/tests/testdata/hendrycksTest-nutrition-v0-res.json b/tests/testdata/hendrycksTest-nutrition-v0-res.json
index e2838f8805..9004159609 100644
--- a/tests/testdata/hendrycksTest-nutrition-v0-res.json
+++ b/tests/testdata/hendrycksTest-nutrition-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-nutrition": {"acc": 0.24509803921568626, "acc_norm": 0.28104575163398693, "acc_norm_stderr": 0.025738854797818723, "acc_stderr": 0.02463004897982476}}, "versions": {"hendrycksTest-nutrition": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-nutrition": {"acc": 0.24509803921568626, "acc_norm": 0.28104575163398693, "acc_norm_stderr": 0.025738854797818723, "acc_stderr": 0.02463004897982476}}, "versions": {"hendrycksTest-nutrition": 0}}
diff --git a/tests/testdata/hendrycksTest-philosophy-v0-loglikelihood b/tests/testdata/hendrycksTest-philosophy-v0-loglikelihood
index 3ea8ef0a0e..1cd1e1604d 100644
--- a/tests/testdata/hendrycksTest-philosophy-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-philosophy-v0-loglikelihood
@@ -1 +1 @@
-a419204da36c2b7a70fa8909a3a804260cc3283c7e07917534dfb76216c77f46
\ No newline at end of file
+a419204da36c2b7a70fa8909a3a804260cc3283c7e07917534dfb76216c77f46
diff --git a/tests/testdata/hendrycksTest-philosophy-v0-res.json b/tests/testdata/hendrycksTest-philosophy-v0-res.json
index ec9c1e79c1..77f82f7fcc 100644
--- a/tests/testdata/hendrycksTest-philosophy-v0-res.json
+++ b/tests/testdata/hendrycksTest-philosophy-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-philosophy": {"acc": 0.26366559485530544, "acc_norm": 0.2733118971061093, "acc_norm_stderr": 0.02531176597542612, "acc_stderr": 0.02502553850053234}}, "versions": {"hendrycksTest-philosophy": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-philosophy": {"acc": 0.26366559485530544, "acc_norm": 0.2733118971061093, "acc_norm_stderr": 0.02531176597542612, "acc_stderr": 0.02502553850053234}}, "versions": {"hendrycksTest-philosophy": 0}}
diff --git a/tests/testdata/hendrycksTest-prehistory-v0-loglikelihood b/tests/testdata/hendrycksTest-prehistory-v0-loglikelihood
index 4c01847ef5..c92c929cf4 100644
--- a/tests/testdata/hendrycksTest-prehistory-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-prehistory-v0-loglikelihood
@@ -1 +1 @@
-6983c560a562749f4f702249a3a6ae51fa495acc0643a980bf2cf52c6c5d4b95
\ No newline at end of file
+6983c560a562749f4f702249a3a6ae51fa495acc0643a980bf2cf52c6c5d4b95
diff --git a/tests/testdata/hendrycksTest-prehistory-v0-res.json b/tests/testdata/hendrycksTest-prehistory-v0-res.json
index e0163dd555..85a6d79ca6 100644
--- a/tests/testdata/hendrycksTest-prehistory-v0-res.json
+++ b/tests/testdata/hendrycksTest-prehistory-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-prehistory": {"acc": 0.2623456790123457, "acc_norm": 0.26851851851851855, "acc_norm_stderr": 0.024659685185967277, "acc_stderr": 0.02447722285613511}}, "versions": {"hendrycksTest-prehistory": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-prehistory": {"acc": 0.2623456790123457, "acc_norm": 0.26851851851851855, "acc_norm_stderr": 0.024659685185967277, "acc_stderr": 0.02447722285613511}}, "versions": {"hendrycksTest-prehistory": 0}}
diff --git a/tests/testdata/hendrycksTest-professional_accounting-v0-loglikelihood b/tests/testdata/hendrycksTest-professional_accounting-v0-loglikelihood
index fe5997427e..6e86a91554 100644
--- a/tests/testdata/hendrycksTest-professional_accounting-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-professional_accounting-v0-loglikelihood
@@ -1 +1 @@
-847418f7b22cd9b499e95fd73c40a2fbc40076895280cc2c560199c0c4c4f433
\ No newline at end of file
+847418f7b22cd9b499e95fd73c40a2fbc40076895280cc2c560199c0c4c4f433
diff --git a/tests/testdata/hendrycksTest-professional_accounting-v0-res.json b/tests/testdata/hendrycksTest-professional_accounting-v0-res.json
index b665d57e23..45fee739cf 100644
--- a/tests/testdata/hendrycksTest-professional_accounting-v0-res.json
+++ b/tests/testdata/hendrycksTest-professional_accounting-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-professional_accounting": {"acc": 0.2553191489361702, "acc_norm": 0.26595744680851063, "acc_norm_stderr": 0.026358065698880582, "acc_stderr": 0.026011992930902006}}, "versions": {"hendrycksTest-professional_accounting": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-professional_accounting": {"acc": 0.2553191489361702, "acc_norm": 0.26595744680851063, "acc_norm_stderr": 0.026358065698880582, "acc_stderr": 0.026011992930902006}}, "versions": {"hendrycksTest-professional_accounting": 0}}
diff --git a/tests/testdata/hendrycksTest-professional_law-v0-loglikelihood b/tests/testdata/hendrycksTest-professional_law-v0-loglikelihood
index 23fbfcf78e..b37d4bf2a5 100644
--- a/tests/testdata/hendrycksTest-professional_law-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-professional_law-v0-loglikelihood
@@ -1 +1 @@
-c38c9d5d84eeb7a5f3c4a34d6e70d7e15847b3c38f26e4b119c982bb935e118f
\ No newline at end of file
+c38c9d5d84eeb7a5f3c4a34d6e70d7e15847b3c38f26e4b119c982bb935e118f
diff --git a/tests/testdata/hendrycksTest-professional_law-v0-res.json b/tests/testdata/hendrycksTest-professional_law-v0-res.json
index f15a9b34ff..231e6b76a9 100644
--- a/tests/testdata/hendrycksTest-professional_law-v0-res.json
+++ b/tests/testdata/hendrycksTest-professional_law-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-professional_law": {"acc": 0.2561929595827901, "acc_norm": 0.2470664928292047, "acc_norm_stderr": 0.011015752255279352, "acc_stderr": 0.011149173153110582}}, "versions": {"hendrycksTest-professional_law": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-professional_law": {"acc": 0.2561929595827901, "acc_norm": 0.2470664928292047, "acc_norm_stderr": 0.011015752255279352, "acc_stderr": 0.011149173153110582}}, "versions": {"hendrycksTest-professional_law": 0}}
diff --git a/tests/testdata/hendrycksTest-professional_medicine-v0-loglikelihood b/tests/testdata/hendrycksTest-professional_medicine-v0-loglikelihood
index cc3c3be8c6..c1c71f612b 100644
--- a/tests/testdata/hendrycksTest-professional_medicine-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-professional_medicine-v0-loglikelihood
@@ -1 +1 @@
-7a30599858398169cde61430c18efdd7fb4dcd09c34aa9baba70f0f8cf17a9f1
\ No newline at end of file
+7a30599858398169cde61430c18efdd7fb4dcd09c34aa9baba70f0f8cf17a9f1
diff --git a/tests/testdata/hendrycksTest-professional_medicine-v0-res.json b/tests/testdata/hendrycksTest-professional_medicine-v0-res.json
index 801ea2d224..07daf13a24 100644
--- a/tests/testdata/hendrycksTest-professional_medicine-v0-res.json
+++ b/tests/testdata/hendrycksTest-professional_medicine-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-professional_medicine": {"acc": 0.23161764705882354, "acc_norm": 0.2536764705882353, "acc_norm_stderr": 0.02643132987078953, "acc_stderr": 0.025626533803777562}}, "versions": {"hendrycksTest-professional_medicine": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-professional_medicine": {"acc": 0.23161764705882354, "acc_norm": 0.2536764705882353, "acc_norm_stderr": 0.02643132987078953, "acc_stderr": 0.025626533803777562}}, "versions": {"hendrycksTest-professional_medicine": 0}}
diff --git a/tests/testdata/hendrycksTest-professional_psychology-v0-loglikelihood b/tests/testdata/hendrycksTest-professional_psychology-v0-loglikelihood
index 9865854da3..70673e36ee 100644
--- a/tests/testdata/hendrycksTest-professional_psychology-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-professional_psychology-v0-loglikelihood
@@ -1 +1 @@
-92a5fad6e9ec700f84946faeccd399dda3569fb71837c9fb0c5c87f5ec29c43e
\ No newline at end of file
+92a5fad6e9ec700f84946faeccd399dda3569fb71837c9fb0c5c87f5ec29c43e
diff --git a/tests/testdata/hendrycksTest-professional_psychology-v0-res.json b/tests/testdata/hendrycksTest-professional_psychology-v0-res.json
index c6b33f4be1..90890c6230 100644
--- a/tests/testdata/hendrycksTest-professional_psychology-v0-res.json
+++ b/tests/testdata/hendrycksTest-professional_psychology-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-professional_psychology": {"acc": 0.27124183006535946, "acc_norm": 0.2826797385620915, "acc_norm_stderr": 0.01821726955205344, "acc_stderr": 0.01798661530403031}}, "versions": {"hendrycksTest-professional_psychology": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-professional_psychology": {"acc": 0.27124183006535946, "acc_norm": 0.2826797385620915, "acc_norm_stderr": 0.01821726955205344, "acc_stderr": 0.01798661530403031}}, "versions": {"hendrycksTest-professional_psychology": 0}}
diff --git a/tests/testdata/hendrycksTest-public_relations-v0-loglikelihood b/tests/testdata/hendrycksTest-public_relations-v0-loglikelihood
index 8f7b30ba88..51b3d974eb 100644
--- a/tests/testdata/hendrycksTest-public_relations-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-public_relations-v0-loglikelihood
@@ -1 +1 @@
-ab70f500cf24e876f6ae6bdc27525a1d6074fa9b6ea97770255d9fc2559b36ff
\ No newline at end of file
+ab70f500cf24e876f6ae6bdc27525a1d6074fa9b6ea97770255d9fc2559b36ff
diff --git a/tests/testdata/hendrycksTest-public_relations-v0-res.json b/tests/testdata/hendrycksTest-public_relations-v0-res.json
index 9ba711cca7..c9922317a8 100644
--- a/tests/testdata/hendrycksTest-public_relations-v0-res.json
+++ b/tests/testdata/hendrycksTest-public_relations-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-public_relations": {"acc": 0.3090909090909091, "acc_norm": 0.2636363636363636, "acc_norm_stderr": 0.04220224692971987, "acc_stderr": 0.044262946482000985}}, "versions": {"hendrycksTest-public_relations": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-public_relations": {"acc": 0.3090909090909091, "acc_norm": 0.2636363636363636, "acc_norm_stderr": 0.04220224692971987, "acc_stderr": 0.044262946482000985}}, "versions": {"hendrycksTest-public_relations": 0}}
diff --git a/tests/testdata/hendrycksTest-security_studies-v0-loglikelihood b/tests/testdata/hendrycksTest-security_studies-v0-loglikelihood
index 6aa9b5ec00..7bb47e4a55 100644
--- a/tests/testdata/hendrycksTest-security_studies-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-security_studies-v0-loglikelihood
@@ -1 +1 @@
-92dfffe2acf3278256486d3e1cf1edb5a739ad0a54c0f9c67695f7a411ed5f76
\ No newline at end of file
+92dfffe2acf3278256486d3e1cf1edb5a739ad0a54c0f9c67695f7a411ed5f76
diff --git a/tests/testdata/hendrycksTest-security_studies-v0-res.json b/tests/testdata/hendrycksTest-security_studies-v0-res.json
index 2c9de8886a..109196000e 100644
--- a/tests/testdata/hendrycksTest-security_studies-v0-res.json
+++ b/tests/testdata/hendrycksTest-security_studies-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-security_studies": {"acc": 0.2979591836734694, "acc_norm": 0.2693877551020408, "acc_norm_stderr": 0.02840125202902294, "acc_stderr": 0.029279567411065674}}, "versions": {"hendrycksTest-security_studies": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-security_studies": {"acc": 0.2979591836734694, "acc_norm": 0.2693877551020408, "acc_norm_stderr": 0.02840125202902294, "acc_stderr": 0.029279567411065674}}, "versions": {"hendrycksTest-security_studies": 0}}
diff --git a/tests/testdata/hendrycksTest-sociology-v0-loglikelihood b/tests/testdata/hendrycksTest-sociology-v0-loglikelihood
index d3f581c9f2..534d7c09b3 100644
--- a/tests/testdata/hendrycksTest-sociology-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-sociology-v0-loglikelihood
@@ -1 +1 @@
-f99a3caece11169f2a5cc951001f92027104afd25d29b2a399883bd4bf118605
\ No newline at end of file
+f99a3caece11169f2a5cc951001f92027104afd25d29b2a399883bd4bf118605
diff --git a/tests/testdata/hendrycksTest-sociology-v0-res.json b/tests/testdata/hendrycksTest-sociology-v0-res.json
index 8711cf195e..0974f7e22f 100644
--- a/tests/testdata/hendrycksTest-sociology-v0-res.json
+++ b/tests/testdata/hendrycksTest-sociology-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-sociology": {"acc": 0.23383084577114427, "acc_norm": 0.24875621890547264, "acc_norm_stderr": 0.030567675938916707, "acc_stderr": 0.02992941540834838}}, "versions": {"hendrycksTest-sociology": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-sociology": {"acc": 0.23383084577114427, "acc_norm": 0.24875621890547264, "acc_norm_stderr": 0.030567675938916707, "acc_stderr": 0.02992941540834838}}, "versions": {"hendrycksTest-sociology": 0}}
diff --git a/tests/testdata/hendrycksTest-us_foreign_policy-v0-loglikelihood b/tests/testdata/hendrycksTest-us_foreign_policy-v0-loglikelihood
index eed85dbaf9..c315cc8b77 100644
--- a/tests/testdata/hendrycksTest-us_foreign_policy-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-us_foreign_policy-v0-loglikelihood
@@ -1 +1 @@
-a1a338d0083a21054f74d36a296d6bd8e2e457327c0fd630bebcc61ed758044d
\ No newline at end of file
+a1a338d0083a21054f74d36a296d6bd8e2e457327c0fd630bebcc61ed758044d
diff --git a/tests/testdata/hendrycksTest-us_foreign_policy-v0-res.json b/tests/testdata/hendrycksTest-us_foreign_policy-v0-res.json
index 1077380de8..f594f9bb49 100644
--- a/tests/testdata/hendrycksTest-us_foreign_policy-v0-res.json
+++ b/tests/testdata/hendrycksTest-us_foreign_policy-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-us_foreign_policy": {"acc": 0.2, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909283, "acc_stderr": 0.040201512610368445}}, "versions": {"hendrycksTest-us_foreign_policy": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-us_foreign_policy": {"acc": 0.2, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909283, "acc_stderr": 0.040201512610368445}}, "versions": {"hendrycksTest-us_foreign_policy": 0}}
diff --git a/tests/testdata/hendrycksTest-virology-v0-loglikelihood b/tests/testdata/hendrycksTest-virology-v0-loglikelihood
index 3555c2c535..0af2342855 100644
--- a/tests/testdata/hendrycksTest-virology-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-virology-v0-loglikelihood
@@ -1 +1 @@
-0ffa491f7bad2abbb64ecd752a295729167599b3815238cab0ecf4cb08bba9b6
\ No newline at end of file
+0ffa491f7bad2abbb64ecd752a295729167599b3815238cab0ecf4cb08bba9b6
diff --git a/tests/testdata/hendrycksTest-virology-v0-res.json b/tests/testdata/hendrycksTest-virology-v0-res.json
index 0004b19404..eb2639a6b8 100644
--- a/tests/testdata/hendrycksTest-virology-v0-res.json
+++ b/tests/testdata/hendrycksTest-virology-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-virology": {"acc": 0.27710843373493976, "acc_norm": 0.2710843373493976, "acc_norm_stderr": 0.03460579907553027, "acc_stderr": 0.034843315926805875}}, "versions": {"hendrycksTest-virology": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-virology": {"acc": 0.27710843373493976, "acc_norm": 0.2710843373493976, "acc_norm_stderr": 0.03460579907553027, "acc_stderr": 0.034843315926805875}}, "versions": {"hendrycksTest-virology": 0}}
diff --git a/tests/testdata/hendrycksTest-world_religions-v0-loglikelihood b/tests/testdata/hendrycksTest-world_religions-v0-loglikelihood
index 118c9b7435..d7c0bd73b0 100644
--- a/tests/testdata/hendrycksTest-world_religions-v0-loglikelihood
+++ b/tests/testdata/hendrycksTest-world_religions-v0-loglikelihood
@@ -1 +1 @@
-97a0f68ba30ea3a6ef1db1a2925c964b09ecc54455a0a930da083e52677815bd
\ No newline at end of file
+97a0f68ba30ea3a6ef1db1a2925c964b09ecc54455a0a930da083e52677815bd
diff --git a/tests/testdata/hendrycksTest-world_religions-v0-res.json b/tests/testdata/hendrycksTest-world_religions-v0-res.json
index 0fff75a7ea..6c3ce29f0d 100644
--- a/tests/testdata/hendrycksTest-world_religions-v0-res.json
+++ b/tests/testdata/hendrycksTest-world_religions-v0-res.json
@@ -1 +1 @@
-{"results": {"hendrycksTest-world_religions": {"acc": 0.21637426900584794, "acc_norm": 0.22807017543859648, "acc_norm_stderr": 0.03218093795602357, "acc_stderr": 0.03158149539338734}}, "versions": {"hendrycksTest-world_religions": 0}}
\ No newline at end of file
+{"results": {"hendrycksTest-world_religions": {"acc": 0.21637426900584794, "acc_norm": 0.22807017543859648, "acc_norm_stderr": 0.03218093795602357, "acc_stderr": 0.03158149539338734}}, "versions": {"hendrycksTest-world_religions": 0}}
diff --git a/tests/testdata/iwslt17-ar-en-v0-greedy_until b/tests/testdata/iwslt17-ar-en-v0-greedy_until
index 82921d1db0..12c5803946 100644
--- a/tests/testdata/iwslt17-ar-en-v0-greedy_until
+++ 
b/tests/testdata/iwslt17-ar-en-v0-greedy_until @@ -1 +1 @@ -e94d310de91fad7ce36f4cf3305552020221482c5588f2efcefaa019893504f1 \ No newline at end of file +e94d310de91fad7ce36f4cf3305552020221482c5588f2efcefaa019893504f1 diff --git a/tests/testdata/iwslt17-ar-en-v0-res.json b/tests/testdata/iwslt17-ar-en-v0-res.json index 0f414a928b..541ff17888 100644 --- a/tests/testdata/iwslt17-ar-en-v0-res.json +++ b/tests/testdata/iwslt17-ar-en-v0-res.json @@ -1 +1 @@ -{"results": {"iwslt17-ar-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.015049895477752772, "chrf_stderr": 0.0002940315671893584, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"iwslt17-ar-en": 0}} \ No newline at end of file +{"results": {"iwslt17-ar-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.015049895477752772, "chrf_stderr": 0.0002940315671893584, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"iwslt17-ar-en": 0}} diff --git a/tests/testdata/iwslt17-en-ar-v0-greedy_until b/tests/testdata/iwslt17-en-ar-v0-greedy_until index fc59546576..31e16e1f56 100644 --- a/tests/testdata/iwslt17-en-ar-v0-greedy_until +++ b/tests/testdata/iwslt17-en-ar-v0-greedy_until @@ -1 +1 @@ -b20adbcd2c6d135e28600b427113532c5df624cb3a90e8c5e48715c09a3a38fa \ No newline at end of file +b20adbcd2c6d135e28600b427113532c5df624cb3a90e8c5e48715c09a3a38fa diff --git a/tests/testdata/iwslt17-en-ar-v0-res.json b/tests/testdata/iwslt17-en-ar-v0-res.json index a22fa9036c..27184cdd64 100644 --- a/tests/testdata/iwslt17-en-ar-v0-res.json +++ b/tests/testdata/iwslt17-en-ar-v0-res.json @@ -1 +1 @@ -{"results": {"iwslt17-en-ar": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.0, "chrf_stderr": 0.0, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"iwslt17-en-ar": 0}} \ No newline at end of file +{"results": {"iwslt17-en-ar": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.0, "chrf_stderr": 0.0, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"iwslt17-en-ar": 0}} diff --git a/tests/testdata/lambada-v0-loglikelihood b/tests/testdata/lambada-v0-loglikelihood index efd450a8f2..60dc7f7338 100644 --- a/tests/testdata/lambada-v0-loglikelihood +++ b/tests/testdata/lambada-v0-loglikelihood @@ -1 +1 @@ -6829e6a8aa5922e6c92dd31403cc060f242dc0ede4a775e085a70da095ab2e20 \ No newline at end of file +6829e6a8aa5922e6c92dd31403cc060f242dc0ede4a775e085a70da095ab2e20 diff --git a/tests/testdata/lambada-v0-res.json b/tests/testdata/lambada-v0-res.json index ead0e9ce5d..cf02bafc3b 100644 --- a/tests/testdata/lambada-v0-res.json +++ b/tests/testdata/lambada-v0-res.json @@ -1 +1 @@ -{"results": {"lambada": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada": 0}} \ No newline at end of file +{"results": {"lambada": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada": 0}} diff --git a/tests/testdata/lambada_cloze-v0-loglikelihood b/tests/testdata/lambada_cloze-v0-loglikelihood index b599a89f7a..3657eb6e71 100644 --- a/tests/testdata/lambada_cloze-v0-loglikelihood +++ b/tests/testdata/lambada_cloze-v0-loglikelihood @@ -1 +1 @@ -7655e748b63ae7e9911411d2d2a2577221d6c861ca4448509992541294d689f3 \ No newline at end of file +7655e748b63ae7e9911411d2d2a2577221d6c861ca4448509992541294d689f3 diff --git a/tests/testdata/lambada_cloze-v0-res.json b/tests/testdata/lambada_cloze-v0-res.json index f3f3f931ac..7cd36e9763 100644 --- a/tests/testdata/lambada_cloze-v0-res.json +++ b/tests/testdata/lambada_cloze-v0-res.json @@ -1 +1 @@ -{"results": {"lambada_cloze": {"acc": 0.0, "acc_stderr": 
0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_cloze": 0}} \ No newline at end of file +{"results": {"lambada_cloze": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_cloze": 0}} diff --git a/tests/testdata/lambada_mt_de-v0-loglikelihood b/tests/testdata/lambada_mt_de-v0-loglikelihood index ae19de0e69..cae8d9bc38 100644 --- a/tests/testdata/lambada_mt_de-v0-loglikelihood +++ b/tests/testdata/lambada_mt_de-v0-loglikelihood @@ -1 +1 @@ -5ad125e1708499832b2cee8c3388f89f9c0277010fd96fbd3359039ce8105984 \ No newline at end of file +5ad125e1708499832b2cee8c3388f89f9c0277010fd96fbd3359039ce8105984 diff --git a/tests/testdata/lambada_mt_de-v0-res.json b/tests/testdata/lambada_mt_de-v0-res.json index 7267ea739a..a7a0a44989 100644 --- a/tests/testdata/lambada_mt_de-v0-res.json +++ b/tests/testdata/lambada_mt_de-v0-res.json @@ -1 +1 @@ -{"results": {"lambada_mt_de": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_mt_de": 0}} \ No newline at end of file +{"results": {"lambada_mt_de": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_mt_de": 0}} diff --git a/tests/testdata/lambada_mt_en-v0-loglikelihood b/tests/testdata/lambada_mt_en-v0-loglikelihood index efd450a8f2..60dc7f7338 100644 --- a/tests/testdata/lambada_mt_en-v0-loglikelihood +++ b/tests/testdata/lambada_mt_en-v0-loglikelihood @@ -1 +1 @@ -6829e6a8aa5922e6c92dd31403cc060f242dc0ede4a775e085a70da095ab2e20 \ No newline at end of file +6829e6a8aa5922e6c92dd31403cc060f242dc0ede4a775e085a70da095ab2e20 diff --git a/tests/testdata/lambada_mt_en-v0-res.json b/tests/testdata/lambada_mt_en-v0-res.json index 561b88ffe1..eb948a0fb0 100644 --- a/tests/testdata/lambada_mt_en-v0-res.json +++ b/tests/testdata/lambada_mt_en-v0-res.json @@ -1 +1 @@ -{"results": {"lambada_mt_en": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_mt_en": 0}} \ No newline at end of file +{"results": {"lambada_mt_en": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_mt_en": 0}} diff --git a/tests/testdata/lambada_mt_es-v0-loglikelihood b/tests/testdata/lambada_mt_es-v0-loglikelihood index df895fe6d6..ce6a044012 100644 --- a/tests/testdata/lambada_mt_es-v0-loglikelihood +++ b/tests/testdata/lambada_mt_es-v0-loglikelihood @@ -1 +1 @@ -4a88f4b316c72fe0396c382d6cbb33568ac4d0ad225150d3536635c085359fc9 \ No newline at end of file +4a88f4b316c72fe0396c382d6cbb33568ac4d0ad225150d3536635c085359fc9 diff --git a/tests/testdata/lambada_mt_es-v0-res.json b/tests/testdata/lambada_mt_es-v0-res.json index 5f95957324..107b63fa92 100644 --- a/tests/testdata/lambada_mt_es-v0-res.json +++ b/tests/testdata/lambada_mt_es-v0-res.json @@ -1 +1 @@ -{"results": {"lambada_mt_es": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_mt_es": 0}} \ No newline at end of file +{"results": {"lambada_mt_es": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_mt_es": 0}} diff --git a/tests/testdata/lambada_mt_fr-v0-loglikelihood b/tests/testdata/lambada_mt_fr-v0-loglikelihood index 3c444f6661..b1495180e9 100644 --- a/tests/testdata/lambada_mt_fr-v0-loglikelihood +++ 
b/tests/testdata/lambada_mt_fr-v0-loglikelihood @@ -1 +1 @@ -5d16f4a0c51dc6d7b6df2ebeba2bbfa51e700b843779b559b3d90183d7b02a11 \ No newline at end of file +5d16f4a0c51dc6d7b6df2ebeba2bbfa51e700b843779b559b3d90183d7b02a11 diff --git a/tests/testdata/lambada_mt_fr-v0-res.json b/tests/testdata/lambada_mt_fr-v0-res.json index 637c23500b..ec0b038c81 100644 --- a/tests/testdata/lambada_mt_fr-v0-res.json +++ b/tests/testdata/lambada_mt_fr-v0-res.json @@ -1 +1 @@ -{"results": {"lambada_mt_fr": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_mt_fr": 0}} \ No newline at end of file +{"results": {"lambada_mt_fr": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_mt_fr": 0}} diff --git a/tests/testdata/lambada_mt_it-v0-loglikelihood b/tests/testdata/lambada_mt_it-v0-loglikelihood index ca3fd80298..3885f316af 100644 --- a/tests/testdata/lambada_mt_it-v0-loglikelihood +++ b/tests/testdata/lambada_mt_it-v0-loglikelihood @@ -1 +1 @@ -fd87c6c5cf4e0499c5f9f80e5bd7ee6a4f3d2991902a0cc3ec9e6eaf22d6760a \ No newline at end of file +fd87c6c5cf4e0499c5f9f80e5bd7ee6a4f3d2991902a0cc3ec9e6eaf22d6760a diff --git a/tests/testdata/lambada_mt_it-v0-res.json b/tests/testdata/lambada_mt_it-v0-res.json index b652210ae3..79efb8675a 100644 --- a/tests/testdata/lambada_mt_it-v0-res.json +++ b/tests/testdata/lambada_mt_it-v0-res.json @@ -1 +1 @@ -{"results": {"lambada_mt_it": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_mt_it": 0}} \ No newline at end of file +{"results": {"lambada_mt_it": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_mt_it": 0}} diff --git a/tests/testdata/logiqa-v0-loglikelihood b/tests/testdata/logiqa-v0-loglikelihood index 9cd40fce0a..91fe00756a 100644 --- a/tests/testdata/logiqa-v0-loglikelihood +++ b/tests/testdata/logiqa-v0-loglikelihood @@ -1 +1 @@ -12495c50454ba5e1ce0753bd18c09aaca516bebd27648d815e37b15229dbf198 \ No newline at end of file +12495c50454ba5e1ce0753bd18c09aaca516bebd27648d815e37b15229dbf198 diff --git a/tests/testdata/logiqa-v0-res.json b/tests/testdata/logiqa-v0-res.json index 7a80c24d1b..d76464cd63 100644 --- a/tests/testdata/logiqa-v0-res.json +++ b/tests/testdata/logiqa-v0-res.json @@ -1 +1 @@ -{"results": {"logiqa": {"acc": 0.25806451612903225, "acc_norm": 0.2764976958525346, "acc_norm_stderr": 0.017543209075825194, "acc_stderr": 0.017162894755127077}}, "versions": {"logiqa": 0}} \ No newline at end of file +{"results": {"logiqa": {"acc": 0.25806451612903225, "acc_norm": 0.2764976958525346, "acc_norm_stderr": 0.017543209075825194, "acc_stderr": 0.017162894755127077}}, "versions": {"logiqa": 0}} diff --git a/tests/testdata/math_algebra-v0-greedy_until b/tests/testdata/math_algebra-v0-greedy_until index ce881a0232..48090ee978 100644 --- a/tests/testdata/math_algebra-v0-greedy_until +++ b/tests/testdata/math_algebra-v0-greedy_until @@ -1 +1 @@ -f19182ce697a2c095d9e5b56ee6659dc38c93994b69ca75d7c3d3f5fd87572b4 \ No newline at end of file +f19182ce697a2c095d9e5b56ee6659dc38c93994b69ca75d7c3d3f5fd87572b4 diff --git a/tests/testdata/math_algebra-v0-res.json b/tests/testdata/math_algebra-v0-res.json index 192cb9d852..8e2552d4cc 100644 --- a/tests/testdata/math_algebra-v0-res.json +++ b/tests/testdata/math_algebra-v0-res.json @@ -1 +1 @@ -{"results": {"math_algebra": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": 
{"math_algebra": 0}} \ No newline at end of file +{"results": {"math_algebra": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_algebra": 0}} diff --git a/tests/testdata/math_algebra-v1-greedy_until b/tests/testdata/math_algebra-v1-greedy_until index ce881a0232..48090ee978 100644 --- a/tests/testdata/math_algebra-v1-greedy_until +++ b/tests/testdata/math_algebra-v1-greedy_until @@ -1 +1 @@ -f19182ce697a2c095d9e5b56ee6659dc38c93994b69ca75d7c3d3f5fd87572b4 \ No newline at end of file +f19182ce697a2c095d9e5b56ee6659dc38c93994b69ca75d7c3d3f5fd87572b4 diff --git a/tests/testdata/math_algebra-v1-res.json b/tests/testdata/math_algebra-v1-res.json index 10d18c2f86..00b237d6c5 100644 --- a/tests/testdata/math_algebra-v1-res.json +++ b/tests/testdata/math_algebra-v1-res.json @@ -1 +1 @@ -{"results": {"math_algebra": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_algebra": 1}} \ No newline at end of file +{"results": {"math_algebra": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_algebra": 1}} diff --git a/tests/testdata/math_counting_and_prob-v0-greedy_until b/tests/testdata/math_counting_and_prob-v0-greedy_until index 6f49557ecf..a81e7cc085 100644 --- a/tests/testdata/math_counting_and_prob-v0-greedy_until +++ b/tests/testdata/math_counting_and_prob-v0-greedy_until @@ -1 +1 @@ -2aa9ae43ee9dbb2457525247d7b65358632c5eaa9cbfc40cf95a4f17f5d942ad \ No newline at end of file +2aa9ae43ee9dbb2457525247d7b65358632c5eaa9cbfc40cf95a4f17f5d942ad diff --git a/tests/testdata/math_counting_and_prob-v0-res.json b/tests/testdata/math_counting_and_prob-v0-res.json index 8ee1d031de..df7f4e47a6 100644 --- a/tests/testdata/math_counting_and_prob-v0-res.json +++ b/tests/testdata/math_counting_and_prob-v0-res.json @@ -1 +1 @@ -{"results": {"math_counting_and_prob": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_counting_and_prob": 0}} \ No newline at end of file +{"results": {"math_counting_and_prob": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_counting_and_prob": 0}} diff --git a/tests/testdata/math_counting_and_prob-v1-greedy_until b/tests/testdata/math_counting_and_prob-v1-greedy_until index 6f49557ecf..a81e7cc085 100644 --- a/tests/testdata/math_counting_and_prob-v1-greedy_until +++ b/tests/testdata/math_counting_and_prob-v1-greedy_until @@ -1 +1 @@ -2aa9ae43ee9dbb2457525247d7b65358632c5eaa9cbfc40cf95a4f17f5d942ad \ No newline at end of file +2aa9ae43ee9dbb2457525247d7b65358632c5eaa9cbfc40cf95a4f17f5d942ad diff --git a/tests/testdata/math_counting_and_prob-v1-res.json b/tests/testdata/math_counting_and_prob-v1-res.json index 240f7b6b42..1eae75c7cd 100644 --- a/tests/testdata/math_counting_and_prob-v1-res.json +++ b/tests/testdata/math_counting_and_prob-v1-res.json @@ -1 +1 @@ -{"results": {"math_counting_and_prob": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_counting_and_prob": 1}} \ No newline at end of file +{"results": {"math_counting_and_prob": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_counting_and_prob": 1}} diff --git a/tests/testdata/math_geometry-v0-greedy_until b/tests/testdata/math_geometry-v0-greedy_until index 1c7362fe44..3ed1fe9f97 100644 --- a/tests/testdata/math_geometry-v0-greedy_until +++ b/tests/testdata/math_geometry-v0-greedy_until @@ -1 +1 @@ -46bc4cb219b6903397da782699a684bdbb982c0c954ff82e6beeed5c84878f42 \ No newline at end of file +46bc4cb219b6903397da782699a684bdbb982c0c954ff82e6beeed5c84878f42 diff --git a/tests/testdata/math_geometry-v0-res.json b/tests/testdata/math_geometry-v0-res.json index 1b25dc283c..f50c311ab4 100644 --- 
a/tests/testdata/math_geometry-v0-res.json +++ b/tests/testdata/math_geometry-v0-res.json @@ -1 +1 @@ -{"results": {"math_geometry": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_geometry": 0}} \ No newline at end of file +{"results": {"math_geometry": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_geometry": 0}} diff --git a/tests/testdata/math_geometry-v1-greedy_until b/tests/testdata/math_geometry-v1-greedy_until index 1c7362fe44..3ed1fe9f97 100644 --- a/tests/testdata/math_geometry-v1-greedy_until +++ b/tests/testdata/math_geometry-v1-greedy_until @@ -1 +1 @@ -46bc4cb219b6903397da782699a684bdbb982c0c954ff82e6beeed5c84878f42 \ No newline at end of file +46bc4cb219b6903397da782699a684bdbb982c0c954ff82e6beeed5c84878f42 diff --git a/tests/testdata/math_geometry-v1-res.json b/tests/testdata/math_geometry-v1-res.json index eb6851fc63..8a915069b5 100644 --- a/tests/testdata/math_geometry-v1-res.json +++ b/tests/testdata/math_geometry-v1-res.json @@ -1 +1 @@ -{"results": {"math_geometry": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_geometry": 1}} \ No newline at end of file +{"results": {"math_geometry": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_geometry": 1}} diff --git a/tests/testdata/math_intermediate_algebra-v0-greedy_until b/tests/testdata/math_intermediate_algebra-v0-greedy_until index 3ab10de26a..ed4f5dc12a 100644 --- a/tests/testdata/math_intermediate_algebra-v0-greedy_until +++ b/tests/testdata/math_intermediate_algebra-v0-greedy_until @@ -1 +1 @@ -d53c699de272d517ed7ad783b4e692302be9f9f97a8d4ac7a6541e538a7cabe0 \ No newline at end of file +d53c699de272d517ed7ad783b4e692302be9f9f97a8d4ac7a6541e538a7cabe0 diff --git a/tests/testdata/math_intermediate_algebra-v0-res.json b/tests/testdata/math_intermediate_algebra-v0-res.json index 7a195d9ac4..e047aba684 100644 --- a/tests/testdata/math_intermediate_algebra-v0-res.json +++ b/tests/testdata/math_intermediate_algebra-v0-res.json @@ -1 +1 @@ -{"results": {"math_intermediate_algebra": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_intermediate_algebra": 0}} \ No newline at end of file +{"results": {"math_intermediate_algebra": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_intermediate_algebra": 0}} diff --git a/tests/testdata/math_intermediate_algebra-v1-greedy_until b/tests/testdata/math_intermediate_algebra-v1-greedy_until index 3ab10de26a..ed4f5dc12a 100644 --- a/tests/testdata/math_intermediate_algebra-v1-greedy_until +++ b/tests/testdata/math_intermediate_algebra-v1-greedy_until @@ -1 +1 @@ -d53c699de272d517ed7ad783b4e692302be9f9f97a8d4ac7a6541e538a7cabe0 \ No newline at end of file +d53c699de272d517ed7ad783b4e692302be9f9f97a8d4ac7a6541e538a7cabe0 diff --git a/tests/testdata/math_intermediate_algebra-v1-res.json b/tests/testdata/math_intermediate_algebra-v1-res.json index 63ab45b9ff..c6f1c39e18 100644 --- a/tests/testdata/math_intermediate_algebra-v1-res.json +++ b/tests/testdata/math_intermediate_algebra-v1-res.json @@ -1 +1 @@ -{"results": {"math_intermediate_algebra": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_intermediate_algebra": 1}} \ No newline at end of file +{"results": {"math_intermediate_algebra": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_intermediate_algebra": 1}} diff --git a/tests/testdata/math_num_theory-v0-greedy_until b/tests/testdata/math_num_theory-v0-greedy_until index 82febb9f5d..8b9fae1314 100644 --- a/tests/testdata/math_num_theory-v0-greedy_until +++ b/tests/testdata/math_num_theory-v0-greedy_until @@ -1 +1 @@ 
-b920ccb507afdcf3ef6f4c04891913731e9f32ec914801791c6d9f8abf6e1897 \ No newline at end of file +b920ccb507afdcf3ef6f4c04891913731e9f32ec914801791c6d9f8abf6e1897 diff --git a/tests/testdata/math_num_theory-v0-res.json b/tests/testdata/math_num_theory-v0-res.json index a27a38fa9d..f39ace0db5 100644 --- a/tests/testdata/math_num_theory-v0-res.json +++ b/tests/testdata/math_num_theory-v0-res.json @@ -1 +1 @@ -{"results": {"math_num_theory": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_num_theory": 0}} \ No newline at end of file +{"results": {"math_num_theory": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_num_theory": 0}} diff --git a/tests/testdata/math_num_theory-v1-greedy_until b/tests/testdata/math_num_theory-v1-greedy_until index 82febb9f5d..8b9fae1314 100644 --- a/tests/testdata/math_num_theory-v1-greedy_until +++ b/tests/testdata/math_num_theory-v1-greedy_until @@ -1 +1 @@ -b920ccb507afdcf3ef6f4c04891913731e9f32ec914801791c6d9f8abf6e1897 \ No newline at end of file +b920ccb507afdcf3ef6f4c04891913731e9f32ec914801791c6d9f8abf6e1897 diff --git a/tests/testdata/math_num_theory-v1-res.json b/tests/testdata/math_num_theory-v1-res.json index 00917b90dd..67799305de 100644 --- a/tests/testdata/math_num_theory-v1-res.json +++ b/tests/testdata/math_num_theory-v1-res.json @@ -1 +1 @@ -{"results": {"math_num_theory": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_num_theory": 1}} \ No newline at end of file +{"results": {"math_num_theory": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_num_theory": 1}} diff --git a/tests/testdata/math_prealgebra-v0-greedy_until b/tests/testdata/math_prealgebra-v0-greedy_until index 5200f4cfa9..570cf27876 100644 --- a/tests/testdata/math_prealgebra-v0-greedy_until +++ b/tests/testdata/math_prealgebra-v0-greedy_until @@ -1 +1 @@ -752cdf343d7152e476b0273065024f6ea0e0f47ea385c6bdf9067736cb39724a \ No newline at end of file +752cdf343d7152e476b0273065024f6ea0e0f47ea385c6bdf9067736cb39724a diff --git a/tests/testdata/math_prealgebra-v0-res.json b/tests/testdata/math_prealgebra-v0-res.json index b3ada8a6be..8bbaef99a9 100644 --- a/tests/testdata/math_prealgebra-v0-res.json +++ b/tests/testdata/math_prealgebra-v0-res.json @@ -1 +1 @@ -{"results": {"math_prealgebra": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_prealgebra": 0}} \ No newline at end of file +{"results": {"math_prealgebra": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_prealgebra": 0}} diff --git a/tests/testdata/math_prealgebra-v1-greedy_until b/tests/testdata/math_prealgebra-v1-greedy_until index 5200f4cfa9..570cf27876 100644 --- a/tests/testdata/math_prealgebra-v1-greedy_until +++ b/tests/testdata/math_prealgebra-v1-greedy_until @@ -1 +1 @@ -752cdf343d7152e476b0273065024f6ea0e0f47ea385c6bdf9067736cb39724a \ No newline at end of file +752cdf343d7152e476b0273065024f6ea0e0f47ea385c6bdf9067736cb39724a diff --git a/tests/testdata/math_prealgebra-v1-res.json b/tests/testdata/math_prealgebra-v1-res.json index e3869faa80..18b665567a 100644 --- a/tests/testdata/math_prealgebra-v1-res.json +++ b/tests/testdata/math_prealgebra-v1-res.json @@ -1 +1 @@ -{"results": {"math_prealgebra": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_prealgebra": 1}} \ No newline at end of file +{"results": {"math_prealgebra": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_prealgebra": 1}} diff --git a/tests/testdata/math_precalc-v0-greedy_until b/tests/testdata/math_precalc-v0-greedy_until index 71bbd8d9c2..816534b355 100644 --- a/tests/testdata/math_precalc-v0-greedy_until +++ 
b/tests/testdata/math_precalc-v0-greedy_until @@ -1 +1 @@ -bc834b06fd79473ca6fe38a51b714aad0bf0478c1b0eec787eca34dbdf69cb71 \ No newline at end of file +bc834b06fd79473ca6fe38a51b714aad0bf0478c1b0eec787eca34dbdf69cb71 diff --git a/tests/testdata/math_precalc-v0-res.json b/tests/testdata/math_precalc-v0-res.json index 699dc5fe38..f1f806d39e 100644 --- a/tests/testdata/math_precalc-v0-res.json +++ b/tests/testdata/math_precalc-v0-res.json @@ -1 +1 @@ -{"results": {"math_precalc": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_precalc": 0}} \ No newline at end of file +{"results": {"math_precalc": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_precalc": 0}} diff --git a/tests/testdata/math_precalc-v1-greedy_until b/tests/testdata/math_precalc-v1-greedy_until index 71bbd8d9c2..816534b355 100644 --- a/tests/testdata/math_precalc-v1-greedy_until +++ b/tests/testdata/math_precalc-v1-greedy_until @@ -1 +1 @@ -bc834b06fd79473ca6fe38a51b714aad0bf0478c1b0eec787eca34dbdf69cb71 \ No newline at end of file +bc834b06fd79473ca6fe38a51b714aad0bf0478c1b0eec787eca34dbdf69cb71 diff --git a/tests/testdata/math_precalc-v1-res.json b/tests/testdata/math_precalc-v1-res.json index a5846590a3..c635e939b0 100644 --- a/tests/testdata/math_precalc-v1-res.json +++ b/tests/testdata/math_precalc-v1-res.json @@ -1 +1 @@ -{"results": {"math_precalc": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_precalc": 1}} \ No newline at end of file +{"results": {"math_precalc": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_precalc": 1}} diff --git a/tests/testdata/mathqa-v0-loglikelihood b/tests/testdata/mathqa-v0-loglikelihood index 9f33d79035..b1481a24d0 100644 --- a/tests/testdata/mathqa-v0-loglikelihood +++ b/tests/testdata/mathqa-v0-loglikelihood @@ -1 +1 @@ -a45260e49f02c7cb8886b3746db4d388890860b202dd8a9f0267e3c324e0af13 \ No newline at end of file +a45260e49f02c7cb8886b3746db4d388890860b202dd8a9f0267e3c324e0af13 diff --git a/tests/testdata/mathqa-v0-res.json b/tests/testdata/mathqa-v0-res.json index dabd07c07c..07bfe685cc 100644 --- a/tests/testdata/mathqa-v0-res.json +++ b/tests/testdata/mathqa-v0-res.json @@ -1 +1 @@ -{"results": {"mathqa": {"acc": 0.20770519262981574, "acc_norm": 0.2050251256281407, "acc_norm_stderr": 0.007390619359738901, "acc_stderr": 0.007426217631188539}}, "versions": {"mathqa": 0}} \ No newline at end of file +{"results": {"mathqa": {"acc": 0.20770519262981574, "acc_norm": 0.2050251256281407, "acc_norm_stderr": 0.007390619359738901, "acc_stderr": 0.007426217631188539}}, "versions": {"mathqa": 0}} diff --git a/tests/testdata/mc_taco-v0-loglikelihood b/tests/testdata/mc_taco-v0-loglikelihood index f0ce5c6458..57258313b2 100644 --- a/tests/testdata/mc_taco-v0-loglikelihood +++ b/tests/testdata/mc_taco-v0-loglikelihood @@ -1 +1 @@ -1811808ef05afd5f30ffc3471622a3dd7a1b681b17a2f7616695ad6b2a45943c \ No newline at end of file +1811808ef05afd5f30ffc3471622a3dd7a1b681b17a2f7616695ad6b2a45943c diff --git a/tests/testdata/mc_taco-v0-res.json b/tests/testdata/mc_taco-v0-res.json index fc36d1ed3f..2fe07d4a3a 100644 --- a/tests/testdata/mc_taco-v0-res.json +++ b/tests/testdata/mc_taco-v0-res.json @@ -1 +1 @@ -{"results": {"mc_taco": {"em": 0.07732732732732733, "f1": 0.41600515965511614}}, "versions": {"mc_taco": 0}} \ No newline at end of file +{"results": {"mc_taco": {"em": 0.07732732732732733, "f1": 0.41600515965511614}}, "versions": {"mc_taco": 0}} diff --git a/tests/testdata/mnli-v0-loglikelihood b/tests/testdata/mnli-v0-loglikelihood index 433b76d010..cb5e932cb5 100644 --- 
a/tests/testdata/mnli-v0-loglikelihood +++ b/tests/testdata/mnli-v0-loglikelihood @@ -1 +1 @@ -4fc7b56b8f1e37e38f4a052b227baec2df914c898c3405d3e994726ba4fba976 \ No newline at end of file +4fc7b56b8f1e37e38f4a052b227baec2df914c898c3405d3e994726ba4fba976 diff --git a/tests/testdata/mnli-v0-res.json b/tests/testdata/mnli-v0-res.json index d9dada7a02..d631581a60 100644 --- a/tests/testdata/mnli-v0-res.json +++ b/tests/testdata/mnli-v0-res.json @@ -1 +1 @@ -{"results": {"mnli": {"acc": 0.32868059093224655, "acc_stderr": 0.004741640290753859}}, "versions": {"mnli": 0}} \ No newline at end of file +{"results": {"mnli": {"acc": 0.32868059093224655, "acc_stderr": 0.004741640290753859}}, "versions": {"mnli": 0}} diff --git a/tests/testdata/mnli_mismatched-v0-loglikelihood b/tests/testdata/mnli_mismatched-v0-loglikelihood index 3fb242da3a..fcd0e5b260 100644 --- a/tests/testdata/mnli_mismatched-v0-loglikelihood +++ b/tests/testdata/mnli_mismatched-v0-loglikelihood @@ -1 +1 @@ -3784acf322e79f31702a7a0612030e4ba5c4fc466ad976a34ee3f3d7278c01f0 \ No newline at end of file +3784acf322e79f31702a7a0612030e4ba5c4fc466ad976a34ee3f3d7278c01f0 diff --git a/tests/testdata/mnli_mismatched-v0-res.json b/tests/testdata/mnli_mismatched-v0-res.json index 261deed962..5392bc600f 100644 --- a/tests/testdata/mnli_mismatched-v0-res.json +++ b/tests/testdata/mnli_mismatched-v0-res.json @@ -1 +1 @@ -{"results": {"mnli_mismatched": {"acc": 0.3360455655004068, "acc_stderr": 0.004763973908606819}}, "versions": {"mnli_mismatched": 0}} \ No newline at end of file +{"results": {"mnli_mismatched": {"acc": 0.3360455655004068, "acc_stderr": 0.004763973908606819}}, "versions": {"mnli_mismatched": 0}} diff --git a/tests/testdata/mrpc-v0-loglikelihood b/tests/testdata/mrpc-v0-loglikelihood index 95c849a153..284b450fb0 100644 --- a/tests/testdata/mrpc-v0-loglikelihood +++ b/tests/testdata/mrpc-v0-loglikelihood @@ -1 +1 @@ -9f54cbff8d6accba99cfa2c4c4b359563313941018173d7dcf9e32dc28c06583 \ No newline at end of file +9f54cbff8d6accba99cfa2c4c4b359563313941018173d7dcf9e32dc28c06583 diff --git a/tests/testdata/mrpc-v0-res.json b/tests/testdata/mrpc-v0-res.json index f141eaa0a4..54d2dac47a 100644 --- a/tests/testdata/mrpc-v0-res.json +++ b/tests/testdata/mrpc-v0-res.json @@ -1 +1 @@ -{"results": {"mrpc": {"acc": 0.5392156862745098, "acc_stderr": 0.024707732873723128, "f1": 0.5982905982905982, "f1_stderr": 0.028928325246283727}}, "versions": {"mrpc": 0}} \ No newline at end of file +{"results": {"mrpc": {"acc": 0.5392156862745098, "acc_stderr": 0.024707732873723128, "f1": 0.5982905982905982, "f1_stderr": 0.028928325246283727}}, "versions": {"mrpc": 0}} diff --git a/tests/testdata/multirc-v0-loglikelihood b/tests/testdata/multirc-v0-loglikelihood index b3681ec175..6391fe9464 100644 --- a/tests/testdata/multirc-v0-loglikelihood +++ b/tests/testdata/multirc-v0-loglikelihood @@ -1 +1 @@ -cdb026c027437a8b4653212d0944d36fc16f49921dcb8e4bef899d15a55e9f80 \ No newline at end of file +cdb026c027437a8b4653212d0944d36fc16f49921dcb8e4bef899d15a55e9f80 diff --git a/tests/testdata/multirc-v0-res.json b/tests/testdata/multirc-v0-res.json index 87e9c532eb..864c987f55 100644 --- a/tests/testdata/multirc-v0-res.json +++ b/tests/testdata/multirc-v0-res.json @@ -1 +1 @@ -{"results": {"multirc": {"acc": 0.07450157397691501, "acc_stderr": 0.008510441526175931}}, "versions": {"multirc": 0}} \ No newline at end of file +{"results": {"multirc": {"acc": 0.07450157397691501, "acc_stderr": 0.008510441526175931}}, "versions": {"multirc": 0}} diff --git 
a/tests/testdata/multirc-v1-loglikelihood b/tests/testdata/multirc-v1-loglikelihood index 52a89c6f9e..7a1d5b828f 100644 --- a/tests/testdata/multirc-v1-loglikelihood +++ b/tests/testdata/multirc-v1-loglikelihood @@ -1 +1 @@ -0e793bd6f637a70a04c6f2cda080188fc037961b2f909095fe63f7bdbc4a90c6 \ No newline at end of file +0e793bd6f637a70a04c6f2cda080188fc037961b2f909095fe63f7bdbc4a90c6 diff --git a/tests/testdata/multirc-v1-res.json b/tests/testdata/multirc-v1-res.json index 938141bbb8..2b782974d2 100644 --- a/tests/testdata/multirc-v1-res.json +++ b/tests/testdata/multirc-v1-res.json @@ -1 +1 @@ -{"results": {"multirc": {"acc": 0.046169989506820566, "acc_stderr": 0.006801377886208738}}, "versions": {"multirc": 1}} \ No newline at end of file +{"results": {"multirc": {"acc": 0.046169989506820566, "acc_stderr": 0.006801377886208738}}, "versions": {"multirc": 1}} diff --git a/tests/testdata/mutual-v0-loglikelihood b/tests/testdata/mutual-v0-loglikelihood index 0022f466d2..8d93380a2a 100644 --- a/tests/testdata/mutual-v0-loglikelihood +++ b/tests/testdata/mutual-v0-loglikelihood @@ -1 +1 @@ -f759213a28f0412510bf1a24c9cab0dae64bdee902d42a26225295445e7779db \ No newline at end of file +f759213a28f0412510bf1a24c9cab0dae64bdee902d42a26225295445e7779db diff --git a/tests/testdata/mutual-v0-res.json b/tests/testdata/mutual-v0-res.json index 2d240576b3..aac1d3f4e2 100644 --- a/tests/testdata/mutual-v0-res.json +++ b/tests/testdata/mutual-v0-res.json @@ -1 +1 @@ -{"results": {"mutual": {"mrr": 0.5023513920240772, "mrr_stderr": 0.009501864812936679, "r@1": 0.22573363431151242, "r@1_stderr": 0.014053085820407457, "r@2": 0.4221218961625282, "r@2_stderr": 0.016602191705517556}}, "versions": {"mutual": 0}} \ No newline at end of file +{"results": {"mutual": {"mrr": 0.5023513920240772, "mrr_stderr": 0.009501864812936679, "r@1": 0.22573363431151242, "r@1_stderr": 0.014053085820407457, "r@2": 0.4221218961625282, "r@2_stderr": 0.016602191705517556}}, "versions": {"mutual": 0}} diff --git a/tests/testdata/mutual-v1-loglikelihood b/tests/testdata/mutual-v1-loglikelihood index 0022f466d2..8d93380a2a 100644 --- a/tests/testdata/mutual-v1-loglikelihood +++ b/tests/testdata/mutual-v1-loglikelihood @@ -1 +1 @@ -f759213a28f0412510bf1a24c9cab0dae64bdee902d42a26225295445e7779db \ No newline at end of file +f759213a28f0412510bf1a24c9cab0dae64bdee902d42a26225295445e7779db diff --git a/tests/testdata/mutual-v1-res.json b/tests/testdata/mutual-v1-res.json index 42e97c6f1a..4d680e5934 100644 --- a/tests/testdata/mutual-v1-res.json +++ b/tests/testdata/mutual-v1-res.json @@ -1 +1 @@ -{"results": {"mutual": {"mrr": 0.5023513920240772, "mrr_stderr": 0.009501864812936679, "r@1": 0.22460496613995484, "r@1_stderr": 0.014028122493992806, "r@2": 0.4706546275395034, "r@2_stderr": 0.016778343895001414}}, "versions": {"mutual": 1}} \ No newline at end of file +{"results": {"mutual": {"mrr": 0.5023513920240772, "mrr_stderr": 0.009501864812936679, "r@1": 0.22460496613995484, "r@1_stderr": 0.014028122493992806, "r@2": 0.4706546275395034, "r@2_stderr": 0.016778343895001414}}, "versions": {"mutual": 1}} diff --git a/tests/testdata/mutual_plus-v0-loglikelihood b/tests/testdata/mutual_plus-v0-loglikelihood index f4ba9d3731..93c1f711fd 100644 --- a/tests/testdata/mutual_plus-v0-loglikelihood +++ b/tests/testdata/mutual_plus-v0-loglikelihood @@ -1 +1 @@ -b846bb9db109535f59a93d1ce340cf09f68bdf4fed5b8decd168784220fe07fa \ No newline at end of file +b846bb9db109535f59a93d1ce340cf09f68bdf4fed5b8decd168784220fe07fa diff --git 
a/tests/testdata/mutual_plus-v0-res.json b/tests/testdata/mutual_plus-v0-res.json index 9c03488263..4de1605d42 100644 --- a/tests/testdata/mutual_plus-v0-res.json +++ b/tests/testdata/mutual_plus-v0-res.json @@ -1 +1 @@ -{"results": {"mutual_plus": {"mrr": 0.5275583145221953, "mrr_stderr": 0.009940894824430708, "r@1": 0.2595936794582393, "r@1_stderr": 0.014737047402750955, "r@2": 0.45372460496614, "r@2_stderr": 0.01673517854461967}}, "versions": {"mutual_plus": 0}} \ No newline at end of file +{"results": {"mutual_plus": {"mrr": 0.5275583145221953, "mrr_stderr": 0.009940894824430708, "r@1": 0.2595936794582393, "r@1_stderr": 0.014737047402750955, "r@2": 0.45372460496614, "r@2_stderr": 0.01673517854461967}}, "versions": {"mutual_plus": 0}} diff --git a/tests/testdata/mutual_plus-v1-loglikelihood b/tests/testdata/mutual_plus-v1-loglikelihood index f4ba9d3731..93c1f711fd 100644 --- a/tests/testdata/mutual_plus-v1-loglikelihood +++ b/tests/testdata/mutual_plus-v1-loglikelihood @@ -1 +1 @@ -b846bb9db109535f59a93d1ce340cf09f68bdf4fed5b8decd168784220fe07fa \ No newline at end of file +b846bb9db109535f59a93d1ce340cf09f68bdf4fed5b8decd168784220fe07fa diff --git a/tests/testdata/mutual_plus-v1-res.json b/tests/testdata/mutual_plus-v1-res.json index cdb6c85b65..c5261b8167 100644 --- a/tests/testdata/mutual_plus-v1-res.json +++ b/tests/testdata/mutual_plus-v1-res.json @@ -1 +1 @@ -{"results": {"mutual_plus": {"mrr": 0.5275583145221953, "mrr_stderr": 0.009940894824430708, "r@1": 0.26297968397291194, "r@1_stderr": 0.01479889176605113, "r@2": 0.5, "r@2_stderr": 0.01680731613632036}}, "versions": {"mutual_plus": 1}} \ No newline at end of file +{"results": {"mutual_plus": {"mrr": 0.5275583145221953, "mrr_stderr": 0.009940894824430708, "r@1": 0.26297968397291194, "r@1_stderr": 0.01479889176605113, "r@2": 0.5, "r@2_stderr": 0.01680731613632036}}, "versions": {"mutual_plus": 1}} diff --git a/tests/testdata/openbookqa-v0-loglikelihood b/tests/testdata/openbookqa-v0-loglikelihood index b2cc5e9795..bc70ae8322 100644 --- a/tests/testdata/openbookqa-v0-loglikelihood +++ b/tests/testdata/openbookqa-v0-loglikelihood @@ -1 +1 @@ -78a49a0ca1a47373adb33463b1d092e6bc0d8f4b01bcb380ada48065037849d7 \ No newline at end of file +78a49a0ca1a47373adb33463b1d092e6bc0d8f4b01bcb380ada48065037849d7 diff --git a/tests/testdata/openbookqa-v0-res.json b/tests/testdata/openbookqa-v0-res.json index 04f4c25442..c7195c2448 100644 --- a/tests/testdata/openbookqa-v0-res.json +++ b/tests/testdata/openbookqa-v0-res.json @@ -1 +1 @@ -{"results": {"openbookqa": {"acc": 0.214, "acc_norm": 0.276, "acc_norm_stderr": 0.020011219298073517, "acc_stderr": 0.018359797502387046}}, "versions": {"openbookqa": 0}} \ No newline at end of file +{"results": {"openbookqa": {"acc": 0.214, "acc_norm": 0.276, "acc_norm_stderr": 0.020011219298073517, "acc_stderr": 0.018359797502387046}}, "versions": {"openbookqa": 0}} diff --git a/tests/testdata/pile_arxiv-v0-loglikelihood_rolling b/tests/testdata/pile_arxiv-v0-loglikelihood_rolling index 3aa1d8c734..7f2cf1c523 100644 --- a/tests/testdata/pile_arxiv-v0-loglikelihood_rolling +++ b/tests/testdata/pile_arxiv-v0-loglikelihood_rolling @@ -1 +1 @@ -814f9954e44368559602c00f7e85fa3971acdfd0315f508ec7df6318a79c55ec \ No newline at end of file +814f9954e44368559602c00f7e85fa3971acdfd0315f508ec7df6318a79c55ec diff --git a/tests/testdata/pile_arxiv-v0-res.json b/tests/testdata/pile_arxiv-v0-res.json index d19d0c6fee..5de44de446 100644 --- a/tests/testdata/pile_arxiv-v0-res.json +++ b/tests/testdata/pile_arxiv-v0-res.json @@ 
-1 +1 @@ -{"results": {"pile_arxiv": {"bits_per_byte": 1.0750412350569374e-05, "byte_perplexity": 1.0000107504701365, "word_perplexity": 1.0000819333090385}}, "versions": {"pile_arxiv": 0}} \ No newline at end of file +{"results": {"pile_arxiv": {"bits_per_byte": 1.0750412350569374e-05, "byte_perplexity": 1.0000107504701365, "word_perplexity": 1.0000819333090385}}, "versions": {"pile_arxiv": 0}} diff --git a/tests/testdata/pile_arxiv-v1-loglikelihood_rolling b/tests/testdata/pile_arxiv-v1-loglikelihood_rolling index 3aa1d8c734..7f2cf1c523 100644 --- a/tests/testdata/pile_arxiv-v1-loglikelihood_rolling +++ b/tests/testdata/pile_arxiv-v1-loglikelihood_rolling @@ -1 +1 @@ -814f9954e44368559602c00f7e85fa3971acdfd0315f508ec7df6318a79c55ec \ No newline at end of file +814f9954e44368559602c00f7e85fa3971acdfd0315f508ec7df6318a79c55ec diff --git a/tests/testdata/pile_arxiv-v1-res.json b/tests/testdata/pile_arxiv-v1-res.json index 05cbab3873..649a9692b6 100644 --- a/tests/testdata/pile_arxiv-v1-res.json +++ b/tests/testdata/pile_arxiv-v1-res.json @@ -1 +1 @@ -{"results": {"pile_arxiv": {"bits_per_byte": 1.55095665856779e-05, "byte_perplexity": 1.0000107504701365, "word_perplexity": 1.0000819333090385}}, "versions": {"pile_arxiv": 1}} \ No newline at end of file +{"results": {"pile_arxiv": {"bits_per_byte": 1.55095665856779e-05, "byte_perplexity": 1.0000107504701365, "word_perplexity": 1.0000819333090385}}, "versions": {"pile_arxiv": 1}} diff --git a/tests/testdata/pile_bookcorpus2-v0-loglikelihood_rolling b/tests/testdata/pile_bookcorpus2-v0-loglikelihood_rolling index b37a91cc2d..5c93b16fdc 100644 --- a/tests/testdata/pile_bookcorpus2-v0-loglikelihood_rolling +++ b/tests/testdata/pile_bookcorpus2-v0-loglikelihood_rolling @@ -1 +1 @@ -5c17ddfebeab8c41dabadb6fc216ceda91e3fe5dc95aaf1b2c843d7f11828b03 \ No newline at end of file +5c17ddfebeab8c41dabadb6fc216ceda91e3fe5dc95aaf1b2c843d7f11828b03 diff --git a/tests/testdata/pile_bookcorpus2-v0-res.json b/tests/testdata/pile_bookcorpus2-v0-res.json index 698b03e8b3..f9791d82dc 100644 --- a/tests/testdata/pile_bookcorpus2-v0-res.json +++ b/tests/testdata/pile_bookcorpus2-v0-res.json @@ -1 +1 @@ -{"results": {"pile_bookcorpus2": {"bits_per_byte": 1.1631037706429144e-06, "byte_perplexity": 1.000001163104447, "word_perplexity": 1.0000066499426599}}, "versions": {"pile_bookcorpus2": 0}} \ No newline at end of file +{"results": {"pile_bookcorpus2": {"bits_per_byte": 1.1631037706429144e-06, "byte_perplexity": 1.000001163104447, "word_perplexity": 1.0000066499426599}}, "versions": {"pile_bookcorpus2": 0}} diff --git a/tests/testdata/pile_bookcorpus2-v1-loglikelihood_rolling b/tests/testdata/pile_bookcorpus2-v1-loglikelihood_rolling index b37a91cc2d..5c93b16fdc 100644 --- a/tests/testdata/pile_bookcorpus2-v1-loglikelihood_rolling +++ b/tests/testdata/pile_bookcorpus2-v1-loglikelihood_rolling @@ -1 +1 @@ -5c17ddfebeab8c41dabadb6fc216ceda91e3fe5dc95aaf1b2c843d7f11828b03 \ No newline at end of file +5c17ddfebeab8c41dabadb6fc216ceda91e3fe5dc95aaf1b2c843d7f11828b03 diff --git a/tests/testdata/pile_bookcorpus2-v1-res.json b/tests/testdata/pile_bookcorpus2-v1-res.json index 967c14934b..58038eee33 100644 --- a/tests/testdata/pile_bookcorpus2-v1-res.json +++ b/tests/testdata/pile_bookcorpus2-v1-res.json @@ -1 +1 @@ -{"results": {"pile_bookcorpus2": {"bits_per_byte": 1.6780040419457868e-06, "byte_perplexity": 1.000001163104447, "word_perplexity": 1.0000066499426599}}, "versions": {"pile_bookcorpus2": 1}} \ No newline at end of file +{"results": {"pile_bookcorpus2": 
{"bits_per_byte": 1.6780040419457868e-06, "byte_perplexity": 1.000001163104447, "word_perplexity": 1.0000066499426599}}, "versions": {"pile_bookcorpus2": 1}} diff --git a/tests/testdata/pile_books3-v0-loglikelihood_rolling b/tests/testdata/pile_books3-v0-loglikelihood_rolling index b483d3b45b..91d876fce6 100644 --- a/tests/testdata/pile_books3-v0-loglikelihood_rolling +++ b/tests/testdata/pile_books3-v0-loglikelihood_rolling @@ -1 +1 @@ -0f8f36f705b999b6d55fa72ff89a82793dd1cb568ab1f8727a6a2086a12b9410 \ No newline at end of file +0f8f36f705b999b6d55fa72ff89a82793dd1cb568ab1f8727a6a2086a12b9410 diff --git a/tests/testdata/pile_books3-v0-res.json b/tests/testdata/pile_books3-v0-res.json index df19cd0a18..ea7459d072 100644 --- a/tests/testdata/pile_books3-v0-res.json +++ b/tests/testdata/pile_books3-v0-res.json @@ -1 +1 @@ -{"results": {"pile_books3": {"bits_per_byte": 8.942486206275221e-07, "byte_perplexity": 1.0000008942490204, "word_perplexity": 1.0000052870063607}}, "versions": {"pile_books3": 0}} \ No newline at end of file +{"results": {"pile_books3": {"bits_per_byte": 8.942486206275221e-07, "byte_perplexity": 1.0000008942490204, "word_perplexity": 1.0000052870063607}}, "versions": {"pile_books3": 0}} diff --git a/tests/testdata/pile_books3-v1-loglikelihood_rolling b/tests/testdata/pile_books3-v1-loglikelihood_rolling index b483d3b45b..91d876fce6 100644 --- a/tests/testdata/pile_books3-v1-loglikelihood_rolling +++ b/tests/testdata/pile_books3-v1-loglikelihood_rolling @@ -1 +1 @@ -0f8f36f705b999b6d55fa72ff89a82793dd1cb568ab1f8727a6a2086a12b9410 \ No newline at end of file +0f8f36f705b999b6d55fa72ff89a82793dd1cb568ab1f8727a6a2086a12b9410 diff --git a/tests/testdata/pile_books3-v1-res.json b/tests/testdata/pile_books3-v1-res.json index 6ff7a51711..75e90bf524 100644 --- a/tests/testdata/pile_books3-v1-res.json +++ b/tests/testdata/pile_books3-v1-res.json @@ -1 +1 @@ -{"results": {"pile_books3": {"bits_per_byte": 1.2901280503011222e-06, "byte_perplexity": 1.0000008942490204, "word_perplexity": 1.0000052870063607}}, "versions": {"pile_books3": 1}} \ No newline at end of file +{"results": {"pile_books3": {"bits_per_byte": 1.2901280503011222e-06, "byte_perplexity": 1.0000008942490204, "word_perplexity": 1.0000052870063607}}, "versions": {"pile_books3": 1}} diff --git a/tests/testdata/pile_dm-mathematics-v0-loglikelihood_rolling b/tests/testdata/pile_dm-mathematics-v0-loglikelihood_rolling index 2fb27786c5..728aed27d3 100644 --- a/tests/testdata/pile_dm-mathematics-v0-loglikelihood_rolling +++ b/tests/testdata/pile_dm-mathematics-v0-loglikelihood_rolling @@ -1 +1 @@ -d5b7967c0ece8b816f3921a8bd0fad23365349e935b491595e2ad1135af42da6 \ No newline at end of file +d5b7967c0ece8b816f3921a8bd0fad23365349e935b491595e2ad1135af42da6 diff --git a/tests/testdata/pile_dm-mathematics-v0-res.json b/tests/testdata/pile_dm-mathematics-v0-res.json index 860aa06c97..86fc412583 100644 --- a/tests/testdata/pile_dm-mathematics-v0-res.json +++ b/tests/testdata/pile_dm-mathematics-v0-res.json @@ -1 +1 @@ -{"results": {"pile_dm-mathematics": {"bits_per_byte": 6.176600873627999e-05, "byte_perplexity": 1.0000617679162955, "word_perplexity": 1.0002875035042451}}, "versions": {"pile_dm-mathematics": 0}} \ No newline at end of file +{"results": {"pile_dm-mathematics": {"bits_per_byte": 6.176600873627999e-05, "byte_perplexity": 1.0000617679162955, "word_perplexity": 1.0002875035042451}}, "versions": {"pile_dm-mathematics": 0}} diff --git a/tests/testdata/pile_dm-mathematics-v1-loglikelihood_rolling 
b/tests/testdata/pile_dm-mathematics-v1-loglikelihood_rolling index 2fb27786c5..728aed27d3 100644 --- a/tests/testdata/pile_dm-mathematics-v1-loglikelihood_rolling +++ b/tests/testdata/pile_dm-mathematics-v1-loglikelihood_rolling @@ -1 +1 @@ -d5b7967c0ece8b816f3921a8bd0fad23365349e935b491595e2ad1135af42da6 \ No newline at end of file +d5b7967c0ece8b816f3921a8bd0fad23365349e935b491595e2ad1135af42da6 diff --git a/tests/testdata/pile_dm-mathematics-v1-res.json b/tests/testdata/pile_dm-mathematics-v1-res.json index 192e9066a4..e4c44507cf 100644 --- a/tests/testdata/pile_dm-mathematics-v1-res.json +++ b/tests/testdata/pile_dm-mathematics-v1-res.json @@ -1 +1 @@ -{"results": {"pile_dm-mathematics": {"bits_per_byte": 8.910951449933553e-05, "byte_perplexity": 1.0000617679162955, "word_perplexity": 1.0002875035042451}}, "versions": {"pile_dm-mathematics": 1}} \ No newline at end of file +{"results": {"pile_dm-mathematics": {"bits_per_byte": 8.910951449933553e-05, "byte_perplexity": 1.0000617679162955, "word_perplexity": 1.0002875035042451}}, "versions": {"pile_dm-mathematics": 1}} diff --git a/tests/testdata/pile_enron-v0-loglikelihood_rolling b/tests/testdata/pile_enron-v0-loglikelihood_rolling index 57dbe76460..374580107d 100644 --- a/tests/testdata/pile_enron-v0-loglikelihood_rolling +++ b/tests/testdata/pile_enron-v0-loglikelihood_rolling @@ -1 +1 @@ -4baa6ccdc9e3aa9921675ab4400d5e89d7b546b844a8ea28f6461d649066418a \ No newline at end of file +4baa6ccdc9e3aa9921675ab4400d5e89d7b546b844a8ea28f6461d649066418a diff --git a/tests/testdata/pile_enron-v0-res.json b/tests/testdata/pile_enron-v0-res.json index a4a49493d5..c26a9322a5 100644 --- a/tests/testdata/pile_enron-v0-res.json +++ b/tests/testdata/pile_enron-v0-res.json @@ -1 +1 @@ -{"results": {"pile_enron": {"bits_per_byte": 0.0003163902828673244, "byte_perplexity": 1.000316440339552, "word_perplexity": 1.00224668051869}}, "versions": {"pile_enron": 0}} \ No newline at end of file +{"results": {"pile_enron": {"bits_per_byte": 0.0003163902828673244, "byte_perplexity": 1.000316440339552, "word_perplexity": 1.00224668051869}}, "versions": {"pile_enron": 0}} diff --git a/tests/testdata/pile_enron-v1-loglikelihood_rolling b/tests/testdata/pile_enron-v1-loglikelihood_rolling index 57dbe76460..374580107d 100644 --- a/tests/testdata/pile_enron-v1-loglikelihood_rolling +++ b/tests/testdata/pile_enron-v1-loglikelihood_rolling @@ -1 +1 @@ -4baa6ccdc9e3aa9921675ab4400d5e89d7b546b844a8ea28f6461d649066418a \ No newline at end of file +4baa6ccdc9e3aa9921675ab4400d5e89d7b546b844a8ea28f6461d649066418a diff --git a/tests/testdata/pile_enron-v1-res.json b/tests/testdata/pile_enron-v1-res.json index abe7b45f9a..257e3d0b06 100644 --- a/tests/testdata/pile_enron-v1-res.json +++ b/tests/testdata/pile_enron-v1-res.json @@ -1 +1 @@ -{"results": {"pile_enron": {"bits_per_byte": 0.0004564546920781453, "byte_perplexity": 1.000316440339552, "word_perplexity": 1.00224668051869}}, "versions": {"pile_enron": 1}} \ No newline at end of file +{"results": {"pile_enron": {"bits_per_byte": 0.0004564546920781453, "byte_perplexity": 1.000316440339552, "word_perplexity": 1.00224668051869}}, "versions": {"pile_enron": 1}} diff --git a/tests/testdata/pile_europarl-v0-loglikelihood_rolling b/tests/testdata/pile_europarl-v0-loglikelihood_rolling index 8027260755..beb1f6fbed 100644 --- a/tests/testdata/pile_europarl-v0-loglikelihood_rolling +++ b/tests/testdata/pile_europarl-v0-loglikelihood_rolling @@ -1 +1 @@ -e67d3dbccd47d308bfc5b0e66b76d0dfc5e386ebfa94e056562c2281c395543f \ No newline 
at end of file +e67d3dbccd47d308bfc5b0e66b76d0dfc5e386ebfa94e056562c2281c395543f diff --git a/tests/testdata/pile_europarl-v0-res.json b/tests/testdata/pile_europarl-v0-res.json index 4c53edd2ce..d6fd7d406e 100644 --- a/tests/testdata/pile_europarl-v0-res.json +++ b/tests/testdata/pile_europarl-v0-res.json @@ -1 +1 @@ -{"results": {"pile_europarl": {"bits_per_byte": 8.648858203555344e-06, "byte_perplexity": 1.000008648895605, "word_perplexity": 1.000063506523818}}, "versions": {"pile_europarl": 0}} \ No newline at end of file +{"results": {"pile_europarl": {"bits_per_byte": 8.648858203555344e-06, "byte_perplexity": 1.000008648895605, "word_perplexity": 1.000063506523818}}, "versions": {"pile_europarl": 0}} diff --git a/tests/testdata/pile_europarl-v1-loglikelihood_rolling b/tests/testdata/pile_europarl-v1-loglikelihood_rolling index 8027260755..beb1f6fbed 100644 --- a/tests/testdata/pile_europarl-v1-loglikelihood_rolling +++ b/tests/testdata/pile_europarl-v1-loglikelihood_rolling @@ -1 +1 @@ -e67d3dbccd47d308bfc5b0e66b76d0dfc5e386ebfa94e056562c2281c395543f \ No newline at end of file +e67d3dbccd47d308bfc5b0e66b76d0dfc5e386ebfa94e056562c2281c395543f diff --git a/tests/testdata/pile_europarl-v1-res.json b/tests/testdata/pile_europarl-v1-res.json index b948f0d369..d9c45675a3 100644 --- a/tests/testdata/pile_europarl-v1-res.json +++ b/tests/testdata/pile_europarl-v1-res.json @@ -1 +1 @@ -{"results": {"pile_europarl": {"bits_per_byte": 1.2477664839621123e-05, "byte_perplexity": 1.000008648895605, "word_perplexity": 1.000063506523818}}, "versions": {"pile_europarl": 1}} \ No newline at end of file +{"results": {"pile_europarl": {"bits_per_byte": 1.2477664839621123e-05, "byte_perplexity": 1.000008648895605, "word_perplexity": 1.000063506523818}}, "versions": {"pile_europarl": 1}} diff --git a/tests/testdata/pile_freelaw-v0-loglikelihood_rolling b/tests/testdata/pile_freelaw-v0-loglikelihood_rolling index 7b5771f491..c8e1cd2e38 100644 --- a/tests/testdata/pile_freelaw-v0-loglikelihood_rolling +++ b/tests/testdata/pile_freelaw-v0-loglikelihood_rolling @@ -1 +1 @@ -d77f3f68aadd6cbf1290c2f6737b2ed5d5c2a60e4c81a65c280f207783caabe1 \ No newline at end of file +d77f3f68aadd6cbf1290c2f6737b2ed5d5c2a60e4c81a65c280f207783caabe1 diff --git a/tests/testdata/pile_freelaw-v0-res.json b/tests/testdata/pile_freelaw-v0-res.json index 0bda41ffb3..cff04889f9 100644 --- a/tests/testdata/pile_freelaw-v0-res.json +++ b/tests/testdata/pile_freelaw-v0-res.json @@ -1 +1 @@ -{"results": {"pile_freelaw": {"bits_per_byte": 3.16238943008513e-05, "byte_perplexity": 1.0000316243943415, "word_perplexity": 1.000203169094218}}, "versions": {"pile_freelaw": 0}} \ No newline at end of file +{"results": {"pile_freelaw": {"bits_per_byte": 3.16238943008513e-05, "byte_perplexity": 1.0000316243943415, "word_perplexity": 1.000203169094218}}, "versions": {"pile_freelaw": 0}} diff --git a/tests/testdata/pile_freelaw-v1-loglikelihood_rolling b/tests/testdata/pile_freelaw-v1-loglikelihood_rolling index 7b5771f491..c8e1cd2e38 100644 --- a/tests/testdata/pile_freelaw-v1-loglikelihood_rolling +++ b/tests/testdata/pile_freelaw-v1-loglikelihood_rolling @@ -1 +1 @@ -d77f3f68aadd6cbf1290c2f6737b2ed5d5c2a60e4c81a65c280f207783caabe1 \ No newline at end of file +d77f3f68aadd6cbf1290c2f6737b2ed5d5c2a60e4c81a65c280f207783caabe1 diff --git a/tests/testdata/pile_freelaw-v1-res.json b/tests/testdata/pile_freelaw-v1-res.json index dd0e0bac36..e4831db8ad 100644 --- a/tests/testdata/pile_freelaw-v1-res.json +++ b/tests/testdata/pile_freelaw-v1-res.json @@ -1 
+1 @@ -{"results": {"pile_freelaw": {"bits_per_byte": 4.5623635481434923e-05, "byte_perplexity": 1.0000316243943415, "word_perplexity": 1.000203169094218}}, "versions": {"pile_freelaw": 1}} \ No newline at end of file +{"results": {"pile_freelaw": {"bits_per_byte": 4.5623635481434923e-05, "byte_perplexity": 1.0000316243943415, "word_perplexity": 1.000203169094218}}, "versions": {"pile_freelaw": 1}} diff --git a/tests/testdata/pile_github-v0-loglikelihood_rolling b/tests/testdata/pile_github-v0-loglikelihood_rolling index cf8251e4f6..98e197e1e6 100644 --- a/tests/testdata/pile_github-v0-loglikelihood_rolling +++ b/tests/testdata/pile_github-v0-loglikelihood_rolling @@ -1 +1 @@ -df384c3df3d8f53273e97127c5bb84c17e638acad7d6bc9c91f6dee96d43b639 \ No newline at end of file +df384c3df3d8f53273e97127c5bb84c17e638acad7d6bc9c91f6dee96d43b639 diff --git a/tests/testdata/pile_github-v0-res.json b/tests/testdata/pile_github-v0-res.json index bdabf39969..8f999e5d28 100644 --- a/tests/testdata/pile_github-v0-res.json +++ b/tests/testdata/pile_github-v0-res.json @@ -1 +1 @@ -{"results": {"pile_github": {"bits_per_byte": 9.540627613754646e-05, "byte_perplexity": 1.0000954108274611, "word_perplexity": 1.0009643183931227}}, "versions": {"pile_github": 0}} \ No newline at end of file +{"results": {"pile_github": {"bits_per_byte": 9.540627613754646e-05, "byte_perplexity": 1.0000954108274611, "word_perplexity": 1.0009643183931227}}, "versions": {"pile_github": 0}} diff --git a/tests/testdata/pile_github-v1-loglikelihood_rolling b/tests/testdata/pile_github-v1-loglikelihood_rolling index cf8251e4f6..98e197e1e6 100644 --- a/tests/testdata/pile_github-v1-loglikelihood_rolling +++ b/tests/testdata/pile_github-v1-loglikelihood_rolling @@ -1 +1 @@ -df384c3df3d8f53273e97127c5bb84c17e638acad7d6bc9c91f6dee96d43b639 \ No newline at end of file +df384c3df3d8f53273e97127c5bb84c17e638acad7d6bc9c91f6dee96d43b639 diff --git a/tests/testdata/pile_github-v1-res.json b/tests/testdata/pile_github-v1-res.json index cc06a45501..4835ab2df7 100644 --- a/tests/testdata/pile_github-v1-res.json +++ b/tests/testdata/pile_github-v1-res.json @@ -1 +1 @@ -{"results": {"pile_github": {"bits_per_byte": 0.00013764216145332133, "byte_perplexity": 1.0000954108274611, "word_perplexity": 1.0009643183931227}}, "versions": {"pile_github": 1}} \ No newline at end of file +{"results": {"pile_github": {"bits_per_byte": 0.00013764216145332133, "byte_perplexity": 1.0000954108274611, "word_perplexity": 1.0009643183931227}}, "versions": {"pile_github": 1}} diff --git a/tests/testdata/pile_gutenberg-v0-loglikelihood_rolling b/tests/testdata/pile_gutenberg-v0-loglikelihood_rolling index bd7b15927f..73017a7f73 100644 --- a/tests/testdata/pile_gutenberg-v0-loglikelihood_rolling +++ b/tests/testdata/pile_gutenberg-v0-loglikelihood_rolling @@ -1 +1 @@ -02a559f74a9105145e7d4d9c5ddea372b5b4938f5368dc8ffafc39cbe3b4c7ef \ No newline at end of file +02a559f74a9105145e7d4d9c5ddea372b5b4938f5368dc8ffafc39cbe3b4c7ef diff --git a/tests/testdata/pile_gutenberg-v0-res.json b/tests/testdata/pile_gutenberg-v0-res.json index 757ef06f79..f5b866ba76 100644 --- a/tests/testdata/pile_gutenberg-v0-res.json +++ b/tests/testdata/pile_gutenberg-v0-res.json @@ -1 +1 @@ -{"results": {"pile_gutenberg": {"bits_per_byte": 1.2443606332351536e-06, "byte_perplexity": 1.0000012443614075, "word_perplexity": 1.0000072174665404}}, "versions": {"pile_gutenberg": 0}} \ No newline at end of file +{"results": {"pile_gutenberg": {"bits_per_byte": 1.2443606332351536e-06, "byte_perplexity": 
diff --git a/tests/testdata/pile_gutenberg-v1-loglikelihood_rolling b/tests/testdata/pile_gutenberg-v1-loglikelihood_rolling
index bd7b15927f..73017a7f73 100644
--- a/tests/testdata/pile_gutenberg-v1-loglikelihood_rolling
+++ b/tests/testdata/pile_gutenberg-v1-loglikelihood_rolling
@@ -1 +1 @@
-02a559f74a9105145e7d4d9c5ddea372b5b4938f5368dc8ffafc39cbe3b4c7ef
\ No newline at end of file
+02a559f74a9105145e7d4d9c5ddea372b5b4938f5368dc8ffafc39cbe3b4c7ef
diff --git a/tests/testdata/pile_gutenberg-v1-res.json b/tests/testdata/pile_gutenberg-v1-res.json
index 6d22ed3ff5..92add0a29b 100644
--- a/tests/testdata/pile_gutenberg-v1-res.json
+++ b/tests/testdata/pile_gutenberg-v1-res.json
@@ -1 +1 @@
-{"results": {"pile_gutenberg": {"bits_per_byte": 1.7952329146458065e-06, "byte_perplexity": 1.0000012443614075, "word_perplexity": 1.0000072174665404}}, "versions": {"pile_gutenberg": 1}}
\ No newline at end of file
+{"results": {"pile_gutenberg": {"bits_per_byte": 1.7952329146458065e-06, "byte_perplexity": 1.0000012443614075, "word_perplexity": 1.0000072174665404}}, "versions": {"pile_gutenberg": 1}}
diff --git a/tests/testdata/pile_hackernews-v0-loglikelihood_rolling b/tests/testdata/pile_hackernews-v0-loglikelihood_rolling
index 48b767bfe7..9298a46425 100644
--- a/tests/testdata/pile_hackernews-v0-loglikelihood_rolling
+++ b/tests/testdata/pile_hackernews-v0-loglikelihood_rolling
@@ -1 +1 @@
-ec1082ee5a5326e0d57aa4e73b634937140c1de9af95f154e8ab57b05d9b422b
\ No newline at end of file
+ec1082ee5a5326e0d57aa4e73b634937140c1de9af95f154e8ab57b05d9b422b
diff --git a/tests/testdata/pile_hackernews-v0-res.json b/tests/testdata/pile_hackernews-v0-res.json
index 68578fe4c9..e2deadbd78 100644
--- a/tests/testdata/pile_hackernews-v0-res.json
+++ b/tests/testdata/pile_hackernews-v0-res.json
@@ -1 +1 @@
-{"results": {"pile_hackernews": {"bits_per_byte": 0.00010170276359193358, "byte_perplexity": 1.0001017079354932, "word_perplexity": 1.0006273924348839}}, "versions": {"pile_hackernews": 0}}
\ No newline at end of file
+{"results": {"pile_hackernews": {"bits_per_byte": 0.00010170276359193358, "byte_perplexity": 1.0001017079354932, "word_perplexity": 1.0006273924348839}}, "versions": {"pile_hackernews": 0}}
diff --git a/tests/testdata/pile_hackernews-v1-loglikelihood_rolling b/tests/testdata/pile_hackernews-v1-loglikelihood_rolling
index 48b767bfe7..9298a46425 100644
--- a/tests/testdata/pile_hackernews-v1-loglikelihood_rolling
+++ b/tests/testdata/pile_hackernews-v1-loglikelihood_rolling
@@ -1 +1 @@
-ec1082ee5a5326e0d57aa4e73b634937140c1de9af95f154e8ab57b05d9b422b
\ No newline at end of file
+ec1082ee5a5326e0d57aa4e73b634937140c1de9af95f154e8ab57b05d9b422b
diff --git a/tests/testdata/pile_hackernews-v1-res.json b/tests/testdata/pile_hackernews-v1-res.json
index ea135278b7..46aeb3a266 100644
--- a/tests/testdata/pile_hackernews-v1-res.json
+++ b/tests/testdata/pile_hackernews-v1-res.json
@@ -1 +1 @@
-{"results": {"pile_hackernews": {"bits_per_byte": 0.00014672607267878518, "byte_perplexity": 1.0001017079354932, "word_perplexity": 1.0006273924348839}}, "versions": {"pile_hackernews": 1}}
\ No newline at end of file
+{"results": {"pile_hackernews": {"bits_per_byte": 0.00014672607267878518, "byte_perplexity": 1.0001017079354932, "word_perplexity": 1.0006273924348839}}, "versions": {"pile_hackernews": 1}}
diff --git a/tests/testdata/pile_nih-exporter-v0-loglikelihood_rolling b/tests/testdata/pile_nih-exporter-v0-loglikelihood_rolling
index 5f76588a81..ffc7508b50 100644
--- a/tests/testdata/pile_nih-exporter-v0-loglikelihood_rolling
+++ b/tests/testdata/pile_nih-exporter-v0-loglikelihood_rolling
@@ -1 +1 @@
-520ea6e04e8a39dc0b5f63a837429a78a40e63d39d109096101feb8c5b2cf8d8
\ No newline at end of file
+520ea6e04e8a39dc0b5f63a837429a78a40e63d39d109096101feb8c5b2cf8d8
diff --git a/tests/testdata/pile_nih-exporter-v0-res.json b/tests/testdata/pile_nih-exporter-v0-res.json
index 1c7bb56c6d..66312fe60b 100644
--- a/tests/testdata/pile_nih-exporter-v0-res.json
+++ b/tests/testdata/pile_nih-exporter-v0-res.json
@@ -1 +1 @@
-{"results": {"pile_nih-exporter": {"bits_per_byte": 0.00024394433346975716, "byte_perplexity": 1.0002439740903082, "word_perplexity": 1.0016712202288802}}, "versions": {"pile_nih-exporter": 0}}
\ No newline at end of file
+{"results": {"pile_nih-exporter": {"bits_per_byte": 0.00024394433346975716, "byte_perplexity": 1.0002439740903082, "word_perplexity": 1.0016712202288802}}, "versions": {"pile_nih-exporter": 0}}
diff --git a/tests/testdata/pile_nih-exporter-v1-loglikelihood_rolling b/tests/testdata/pile_nih-exporter-v1-loglikelihood_rolling
index 5f76588a81..ffc7508b50 100644
--- a/tests/testdata/pile_nih-exporter-v1-loglikelihood_rolling
+++ b/tests/testdata/pile_nih-exporter-v1-loglikelihood_rolling
@@ -1 +1 @@
-520ea6e04e8a39dc0b5f63a837429a78a40e63d39d109096101feb8c5b2cf8d8
\ No newline at end of file
+520ea6e04e8a39dc0b5f63a837429a78a40e63d39d109096101feb8c5b2cf8d8
diff --git a/tests/testdata/pile_nih-exporter-v1-res.json b/tests/testdata/pile_nih-exporter-v1-res.json
index 0e40fc8268..f7135da8d7 100644
--- a/tests/testdata/pile_nih-exporter-v1-res.json
+++ b/tests/testdata/pile_nih-exporter-v1-res.json
@@ -1 +1 @@
-{"results": {"pile_nih-exporter": {"bits_per_byte": 0.00035193728014978225, "byte_perplexity": 1.0002439740903082, "word_perplexity": 1.0016712202288802}}, "versions": {"pile_nih-exporter": 1}}
\ No newline at end of file
+{"results": {"pile_nih-exporter": {"bits_per_byte": 0.00035193728014978225, "byte_perplexity": 1.0002439740903082, "word_perplexity": 1.0016712202288802}}, "versions": {"pile_nih-exporter": 1}}
diff --git a/tests/testdata/pile_opensubtitles-v0-loglikelihood_rolling b/tests/testdata/pile_opensubtitles-v0-loglikelihood_rolling
index 47805d3b5f..059c1ee54a 100644
--- a/tests/testdata/pile_opensubtitles-v0-loglikelihood_rolling
+++ b/tests/testdata/pile_opensubtitles-v0-loglikelihood_rolling
@@ -1 +1 @@
-0f1c23a1f4ddec0c2b1ff34de8d1505b0eb9e2868d8edbcc1b6de13d02f32036
\ No newline at end of file
+0f1c23a1f4ddec0c2b1ff34de8d1505b0eb9e2868d8edbcc1b6de13d02f32036
diff --git a/tests/testdata/pile_opensubtitles-v0-res.json b/tests/testdata/pile_opensubtitles-v0-res.json
index f718e515ba..27d85cd66c 100644
--- a/tests/testdata/pile_opensubtitles-v0-res.json
+++ b/tests/testdata/pile_opensubtitles-v0-res.json
@@ -1 +1 @@
-{"results": {"pile_opensubtitles": {"bits_per_byte": 1.5213441136639177e-05, "byte_perplexity": 1.0000152135568616, "word_perplexity": 1.0000856162053249}}, "versions": {"pile_opensubtitles": 0}}
\ No newline at end of file
+{"results": {"pile_opensubtitles": {"bits_per_byte": 1.5213441136639177e-05, "byte_perplexity": 1.0000152135568616, "word_perplexity": 1.0000856162053249}}, "versions": {"pile_opensubtitles": 0}}
diff --git a/tests/testdata/pile_opensubtitles-v1-loglikelihood_rolling b/tests/testdata/pile_opensubtitles-v1-loglikelihood_rolling
index 47805d3b5f..059c1ee54a 100644
--- a/tests/testdata/pile_opensubtitles-v1-loglikelihood_rolling
+++ b/tests/testdata/pile_opensubtitles-v1-loglikelihood_rolling
@@ -1 +1 @@
-0f1c23a1f4ddec0c2b1ff34de8d1505b0eb9e2868d8edbcc1b6de13d02f32036
\ No newline at end of file
+0f1c23a1f4ddec0c2b1ff34de8d1505b0eb9e2868d8edbcc1b6de13d02f32036
diff --git a/tests/testdata/pile_opensubtitles-v1-res.json b/tests/testdata/pile_opensubtitles-v1-res.json
index 1468294732..16f0881b8e 100644
--- a/tests/testdata/pile_opensubtitles-v1-res.json
+++ b/tests/testdata/pile_opensubtitles-v1-res.json
@@ -1 +1 @@
-{"results": {"pile_opensubtitles": {"bits_per_byte": 2.1948356082685497e-05, "byte_perplexity": 1.0000152135568616, "word_perplexity": 1.0000856162053249}}, "versions": {"pile_opensubtitles": 1}}
\ No newline at end of file
+{"results": {"pile_opensubtitles": {"bits_per_byte": 2.1948356082685497e-05, "byte_perplexity": 1.0000152135568616, "word_perplexity": 1.0000856162053249}}, "versions": {"pile_opensubtitles": 1}}
diff --git a/tests/testdata/pile_openwebtext2-v0-loglikelihood_rolling b/tests/testdata/pile_openwebtext2-v0-loglikelihood_rolling
index 22046e4405..8b66d7a70f 100644
--- a/tests/testdata/pile_openwebtext2-v0-loglikelihood_rolling
+++ b/tests/testdata/pile_openwebtext2-v0-loglikelihood_rolling
@@ -1 +1 @@
-5d6c19665f429ab1ccbe027da67f42bdaf219f819ab093673976eee55e015ff4
\ No newline at end of file
+5d6c19665f429ab1ccbe027da67f42bdaf219f819ab093673976eee55e015ff4
diff --git a/tests/testdata/pile_openwebtext2-v0-res.json b/tests/testdata/pile_openwebtext2-v0-res.json
index ead8d0b0bf..187af3c076 100644
--- a/tests/testdata/pile_openwebtext2-v0-res.json
+++ b/tests/testdata/pile_openwebtext2-v0-res.json
@@ -1 +1 @@
-{"results": {"pile_openwebtext2": {"bits_per_byte": 0.00012809520662477846, "byte_perplexity": 1.000128103411166, "word_perplexity": 1.0007951516532847}}, "versions": {"pile_openwebtext2": 0}}
\ No newline at end of file
+{"results": {"pile_openwebtext2": {"bits_per_byte": 0.00012809520662477846, "byte_perplexity": 1.000128103411166, "word_perplexity": 1.0007951516532847}}, "versions": {"pile_openwebtext2": 0}}
diff --git a/tests/testdata/pile_openwebtext2-v1-loglikelihood_rolling b/tests/testdata/pile_openwebtext2-v1-loglikelihood_rolling
index 22046e4405..8b66d7a70f 100644
--- a/tests/testdata/pile_openwebtext2-v1-loglikelihood_rolling
+++ b/tests/testdata/pile_openwebtext2-v1-loglikelihood_rolling
@@ -1 +1 @@
-5d6c19665f429ab1ccbe027da67f42bdaf219f819ab093673976eee55e015ff4
\ No newline at end of file
+5d6c19665f429ab1ccbe027da67f42bdaf219f819ab093673976eee55e015ff4
diff --git a/tests/testdata/pile_openwebtext2-v1-res.json b/tests/testdata/pile_openwebtext2-v1-res.json
index ca433e3c85..5718273e0a 100644
--- a/tests/testdata/pile_openwebtext2-v1-res.json
+++ b/tests/testdata/pile_openwebtext2-v1-res.json
@@ -1 +1 @@
-{"results": {"pile_openwebtext2": {"bits_per_byte": 0.000184802319359215, "byte_perplexity": 1.000128103411166, "word_perplexity": 1.0007951516532847}}, "versions": {"pile_openwebtext2": 1}}
\ No newline at end of file
+{"results": {"pile_openwebtext2": {"bits_per_byte": 0.000184802319359215, "byte_perplexity": 1.000128103411166, "word_perplexity": 1.0007951516532847}}, "versions": {"pile_openwebtext2": 1}}
diff --git a/tests/testdata/pile_philpapers-v0-loglikelihood_rolling b/tests/testdata/pile_philpapers-v0-loglikelihood_rolling
index 4fbbc241ba..5719d858e2 100644
--- a/tests/testdata/pile_philpapers-v0-loglikelihood_rolling
+++ b/tests/testdata/pile_philpapers-v0-loglikelihood_rolling
@@ -1 +1 @@
-339ba5d8c044c4a3ff9b9a8eaa24da1d6c01b72972074eb671a7da049eeb7047
\ No newline at end of file
+339ba5d8c044c4a3ff9b9a8eaa24da1d6c01b72972074eb671a7da049eeb7047
diff --git a/tests/testdata/pile_philpapers-v0-res.json b/tests/testdata/pile_philpapers-v0-res.json
index be561fe2f8..31c6775002 100644
--- a/tests/testdata/pile_philpapers-v0-res.json
+++ b/tests/testdata/pile_philpapers-v0-res.json
@@ -1 +1 @@
-{"results": {"pile_philpapers": {"bits_per_byte": 6.241575895982095e-06, "byte_perplexity": 1.0000062415953748, "word_perplexity": 1.0000409888564146}}, "versions": {"pile_philpapers": 0}}
\ No newline at end of file
+{"results": {"pile_philpapers": {"bits_per_byte": 6.241575895982095e-06, "byte_perplexity": 1.0000062415953748, "word_perplexity": 1.0000409888564146}}, "versions": {"pile_philpapers": 0}}
diff --git a/tests/testdata/pile_philpapers-v1-loglikelihood_rolling b/tests/testdata/pile_philpapers-v1-loglikelihood_rolling
index 4fbbc241ba..5719d858e2 100644
--- a/tests/testdata/pile_philpapers-v1-loglikelihood_rolling
+++ b/tests/testdata/pile_philpapers-v1-loglikelihood_rolling
@@ -1 +1 @@
-339ba5d8c044c4a3ff9b9a8eaa24da1d6c01b72972074eb671a7da049eeb7047
\ No newline at end of file
+339ba5d8c044c4a3ff9b9a8eaa24da1d6c01b72972074eb671a7da049eeb7047
diff --git a/tests/testdata/pile_philpapers-v1-res.json b/tests/testdata/pile_philpapers-v1-res.json
index 5a2f77678a..c4ae2664ff 100644
--- a/tests/testdata/pile_philpapers-v1-res.json
+++ b/tests/testdata/pile_philpapers-v1-res.json
@@ -1 +1 @@
-{"results": {"pile_philpapers": {"bits_per_byte": 9.004690592465457e-06, "byte_perplexity": 1.0000062415953748, "word_perplexity": 1.0000409888564146}}, "versions": {"pile_philpapers": 1}}
\ No newline at end of file
+{"results": {"pile_philpapers": {"bits_per_byte": 9.004690592465457e-06, "byte_perplexity": 1.0000062415953748, "word_perplexity": 1.0000409888564146}}, "versions": {"pile_philpapers": 1}}
diff --git a/tests/testdata/pile_pile-cc-v0-loglikelihood_rolling b/tests/testdata/pile_pile-cc-v0-loglikelihood_rolling
index d5369ed3c9..13ed12a480 100644
--- a/tests/testdata/pile_pile-cc-v0-loglikelihood_rolling
+++ b/tests/testdata/pile_pile-cc-v0-loglikelihood_rolling
@@ -1 +1 @@
-731fdef4a43949b179ba0c540148ebc2fa41583dd583ef580dd812076c66a451
\ No newline at end of file
+731fdef4a43949b179ba0c540148ebc2fa41583dd583ef580dd812076c66a451
diff --git a/tests/testdata/pile_pile-cc-v0-res.json b/tests/testdata/pile_pile-cc-v0-res.json
index 383233f259..b115ee6e40 100644
--- a/tests/testdata/pile_pile-cc-v0-res.json
+++ b/tests/testdata/pile_pile-cc-v0-res.json
@@ -1 +1 @@
-{"results": {"pile_pile-cc": {"bits_per_byte": 0.00011234131907228174, "byte_perplexity": 1.0001123476295946, "word_perplexity": 1.0006738958554477}}, "versions": {"pile_pile-cc": 0}}
\ No newline at end of file
+{"results": {"pile_pile-cc": {"bits_per_byte": 0.00011234131907228174, "byte_perplexity": 1.0001123476295946, "word_perplexity": 1.0006738958554477}}, "versions": {"pile_pile-cc": 0}}
diff --git a/tests/testdata/pile_pile-cc-v1-loglikelihood_rolling b/tests/testdata/pile_pile-cc-v1-loglikelihood_rolling
index d5369ed3c9..13ed12a480 100644
--- a/tests/testdata/pile_pile-cc-v1-loglikelihood_rolling
+++ b/tests/testdata/pile_pile-cc-v1-loglikelihood_rolling
@@ -1 +1 @@
-731fdef4a43949b179ba0c540148ebc2fa41583dd583ef580dd812076c66a451
\ No newline at end of file
+731fdef4a43949b179ba0c540148ebc2fa41583dd583ef580dd812076c66a451
diff --git a/tests/testdata/pile_pile-cc-v1-res.json b/tests/testdata/pile_pile-cc-v1-res.json
index bd2772e32a..c14dcba1c2 100644
--- a/tests/testdata/pile_pile-cc-v1-res.json
+++ b/tests/testdata/pile_pile-cc-v1-res.json
@@ -1 +1 @@
-{"results": {"pile_pile-cc": {"bits_per_byte": 0.0001620742639125056, "byte_perplexity": 1.0001123476295946, "word_perplexity": 1.0006738958554477}}, "versions": {"pile_pile-cc": 1}}
\ No newline at end of file
+{"results": {"pile_pile-cc": {"bits_per_byte": 0.0001620742639125056, "byte_perplexity": 1.0001123476295946, "word_perplexity": 1.0006738958554477}}, "versions": {"pile_pile-cc": 1}}
diff --git a/tests/testdata/pile_pubmed-abstracts-v0-loglikelihood_rolling b/tests/testdata/pile_pubmed-abstracts-v0-loglikelihood_rolling
index de5660d60a..61e6ef3e2e 100644
--- a/tests/testdata/pile_pubmed-abstracts-v0-loglikelihood_rolling
+++ b/tests/testdata/pile_pubmed-abstracts-v0-loglikelihood_rolling
@@ -1 +1 @@
-66436569a43163afb2caf422d32c5f329899e74c49865d4d13881fd465fd9976
\ No newline at end of file
+66436569a43163afb2caf422d32c5f329899e74c49865d4d13881fd465fd9976
diff --git a/tests/testdata/pile_pubmed-abstracts-v0-res.json b/tests/testdata/pile_pubmed-abstracts-v0-res.json
index 333c2970fa..9b13a860a4 100644
--- a/tests/testdata/pile_pubmed-abstracts-v0-res.json
+++ b/tests/testdata/pile_pubmed-abstracts-v0-res.json
@@ -1 +1 @@
-{"results": {"pile_pubmed-abstracts": {"bits_per_byte": 0.00037553733051528816, "byte_perplexity": 1.0003756078534862, "word_perplexity": 1.0025884332779}}, "versions": {"pile_pubmed-abstracts": 0}}
\ No newline at end of file
+{"results": {"pile_pubmed-abstracts": {"bits_per_byte": 0.00037553733051528816, "byte_perplexity": 1.0003756078534862, "word_perplexity": 1.0025884332779}}, "versions": {"pile_pubmed-abstracts": 0}}
diff --git a/tests/testdata/pile_pubmed-abstracts-v1-loglikelihood_rolling b/tests/testdata/pile_pubmed-abstracts-v1-loglikelihood_rolling
index de5660d60a..61e6ef3e2e 100644
--- a/tests/testdata/pile_pubmed-abstracts-v1-loglikelihood_rolling
+++ b/tests/testdata/pile_pubmed-abstracts-v1-loglikelihood_rolling
@@ -1 +1 @@
-66436569a43163afb2caf422d32c5f329899e74c49865d4d13881fd465fd9976
\ No newline at end of file
+66436569a43163afb2caf422d32c5f329899e74c49865d4d13881fd465fd9976
diff --git a/tests/testdata/pile_pubmed-abstracts-v1-res.json b/tests/testdata/pile_pubmed-abstracts-v1-res.json
index 21b6bb451f..9a3736f685 100644
--- a/tests/testdata/pile_pubmed-abstracts-v1-res.json
+++ b/tests/testdata/pile_pubmed-abstracts-v1-res.json
@@ -1 +1 @@
-{"results": {"pile_pubmed-abstracts": {"bits_per_byte": 0.0005417858444030858, "byte_perplexity": 1.0003756078534862, "word_perplexity": 1.0025884332779}}, "versions": {"pile_pubmed-abstracts": 1}}
\ No newline at end of file
+{"results": {"pile_pubmed-abstracts": {"bits_per_byte": 0.0005417858444030858, "byte_perplexity": 1.0003756078534862, "word_perplexity": 1.0025884332779}}, "versions": {"pile_pubmed-abstracts": 1}}
diff --git a/tests/testdata/pile_pubmed-central-v0-loglikelihood_rolling b/tests/testdata/pile_pubmed-central-v0-loglikelihood_rolling
index 283109f32e..d2bb7746ec 100644
--- a/tests/testdata/pile_pubmed-central-v0-loglikelihood_rolling
+++ b/tests/testdata/pile_pubmed-central-v0-loglikelihood_rolling
@@ -1 +1 @@
-40b39d120d99a145690444e86acc3e3e24d41e6e0538a75e26929ad84926e5e0
\ No newline at end of file
+40b39d120d99a145690444e86acc3e3e24d41e6e0538a75e26929ad84926e5e0
diff --git a/tests/testdata/pile_pubmed-central-v0-res.json b/tests/testdata/pile_pubmed-central-v0-res.json
index 6e5f1efe49..0e4cd3b916 100644
--- a/tests/testdata/pile_pubmed-central-v0-res.json
+++ b/tests/testdata/pile_pubmed-central-v0-res.json
@@ -1 +1 @@
-{"results": {"pile_pubmed-central": {"bits_per_byte": 1.5812411832795375e-05, "byte_perplexity": 1.0000158125368497, "word_perplexity": 1.000123107107861}}, "versions": {"pile_pubmed-central": 0}}
\ No newline at end of file
+{"results": {"pile_pubmed-central": {"bits_per_byte": 1.5812411832795375e-05, "byte_perplexity": 1.0000158125368497, "word_perplexity": 1.000123107107861}}, "versions": {"pile_pubmed-central": 0}}
diff --git a/tests/testdata/pile_pubmed-central-v1-loglikelihood_rolling b/tests/testdata/pile_pubmed-central-v1-loglikelihood_rolling
index 283109f32e..d2bb7746ec 100644
--- a/tests/testdata/pile_pubmed-central-v1-loglikelihood_rolling
+++ b/tests/testdata/pile_pubmed-central-v1-loglikelihood_rolling
@@ -1 +1 @@
-40b39d120d99a145690444e86acc3e3e24d41e6e0538a75e26929ad84926e5e0
\ No newline at end of file
+40b39d120d99a145690444e86acc3e3e24d41e6e0538a75e26929ad84926e5e0
diff --git a/tests/testdata/pile_pubmed-central-v1-res.json b/tests/testdata/pile_pubmed-central-v1-res.json
index 4d4a241ace..be7fb6a056 100644
--- a/tests/testdata/pile_pubmed-central-v1-res.json
+++ b/tests/testdata/pile_pubmed-central-v1-res.json
@@ -1 +1 @@
-{"results": {"pile_pubmed-central": {"bits_per_byte": 2.2812488135667854e-05, "byte_perplexity": 1.0000158125368497, "word_perplexity": 1.000123107107861}}, "versions": {"pile_pubmed-central": 1}}
\ No newline at end of file
+{"results": {"pile_pubmed-central": {"bits_per_byte": 2.2812488135667854e-05, "byte_perplexity": 1.0000158125368497, "word_perplexity": 1.000123107107861}}, "versions": {"pile_pubmed-central": 1}}
diff --git a/tests/testdata/pile_stackexchange-v0-loglikelihood_rolling b/tests/testdata/pile_stackexchange-v0-loglikelihood_rolling
index dcf0e64cf0..fba76fc9fb 100644
--- a/tests/testdata/pile_stackexchange-v0-loglikelihood_rolling
+++ b/tests/testdata/pile_stackexchange-v0-loglikelihood_rolling
@@ -1 +1 @@
-e524bfb3e21cbdaddc117403a50df598520c7bf5b2c60ad8f2372cfa564e79be
\ No newline at end of file
+e524bfb3e21cbdaddc117403a50df598520c7bf5b2c60ad8f2372cfa564e79be
diff --git a/tests/testdata/pile_stackexchange-v0-res.json b/tests/testdata/pile_stackexchange-v0-res.json
index 76fdd0a6dd..8c64f66569 100644
--- a/tests/testdata/pile_stackexchange-v0-res.json
+++ b/tests/testdata/pile_stackexchange-v0-res.json
@@ -1 +1 @@
-{"results": {"pile_stackexchange": {"bits_per_byte": 0.0002288815898835956, "byte_perplexity": 1.0002289077852733, "word_perplexity": 1.0016993562258851}}, "versions": {"pile_stackexchange": 0}}
\ No newline at end of file
+{"results": {"pile_stackexchange": {"bits_per_byte": 0.0002288815898835956, "byte_perplexity": 1.0002289077852733, "word_perplexity": 1.0016993562258851}}, "versions": {"pile_stackexchange": 0}}
diff --git a/tests/testdata/pile_stackexchange-v1-loglikelihood_rolling b/tests/testdata/pile_stackexchange-v1-loglikelihood_rolling
index dcf0e64cf0..fba76fc9fb 100644
--- a/tests/testdata/pile_stackexchange-v1-loglikelihood_rolling
+++ b/tests/testdata/pile_stackexchange-v1-loglikelihood_rolling
@@ -1 +1 @@
-e524bfb3e21cbdaddc117403a50df598520c7bf5b2c60ad8f2372cfa564e79be
\ No newline at end of file
+e524bfb3e21cbdaddc117403a50df598520c7bf5b2c60ad8f2372cfa564e79be
diff --git a/tests/testdata/pile_stackexchange-v1-res.json b/tests/testdata/pile_stackexchange-v1-res.json
index 2773302990..aa550a119d 100644
--- a/tests/testdata/pile_stackexchange-v1-res.json
+++ b/tests/testdata/pile_stackexchange-v1-res.json
@@ -1 +1 @@
-{"results": {"pile_stackexchange": {"bits_per_byte": 0.0003302063346758449, "byte_perplexity": 1.0002289077852733, "word_perplexity": 1.0016993562258851}}, "versions": {"pile_stackexchange": 1}}
\ No newline at end of file
+{"results": {"pile_stackexchange": {"bits_per_byte": 0.0003302063346758449, "byte_perplexity": 1.0002289077852733, "word_perplexity": 1.0016993562258851}}, "versions": {"pile_stackexchange": 1}}
diff --git a/tests/testdata/pile_ubuntu-irc-v0-loglikelihood_rolling b/tests/testdata/pile_ubuntu-irc-v0-loglikelihood_rolling
index ce04199863..cdbcfd3fd4 100644
--- a/tests/testdata/pile_ubuntu-irc-v0-loglikelihood_rolling
+++ b/tests/testdata/pile_ubuntu-irc-v0-loglikelihood_rolling
@@ -1 +1 @@
-4eb69e314f0864ec8890e2323d7e76f8a8309692c4f090e2b41bf4be681a811d
\ No newline at end of file
+4eb69e314f0864ec8890e2323d7e76f8a8309692c4f090e2b41bf4be681a811d
diff --git a/tests/testdata/pile_ubuntu-irc-v0-res.json b/tests/testdata/pile_ubuntu-irc-v0-res.json
index dff51cba76..74cd951ae7 100644
--- a/tests/testdata/pile_ubuntu-irc-v0-res.json
+++ b/tests/testdata/pile_ubuntu-irc-v0-res.json
@@ -1 +1 @@
-{"results": {"pile_ubuntu-irc": {"bits_per_byte": 1.6298315496830533e-06, "byte_perplexity": 1.0000016298328778, "word_perplexity": 1.0000108866656874}}, "versions": {"pile_ubuntu-irc": 0}}
\ No newline at end of file
+{"results": {"pile_ubuntu-irc": {"bits_per_byte": 1.6298315496830533e-06, "byte_perplexity": 1.0000016298328778, "word_perplexity": 1.0000108866656874}}, "versions": {"pile_ubuntu-irc": 0}}
diff --git a/tests/testdata/pile_ubuntu-irc-v1-loglikelihood_rolling b/tests/testdata/pile_ubuntu-irc-v1-loglikelihood_rolling
index ce04199863..cdbcfd3fd4 100644
--- a/tests/testdata/pile_ubuntu-irc-v1-loglikelihood_rolling
+++ b/tests/testdata/pile_ubuntu-irc-v1-loglikelihood_rolling
@@ -1 +1 @@
-4eb69e314f0864ec8890e2323d7e76f8a8309692c4f090e2b41bf4be681a811d
\ No newline at end of file
+4eb69e314f0864ec8890e2323d7e76f8a8309692c4f090e2b41bf4be681a811d
diff --git a/tests/testdata/pile_ubuntu-irc-v1-res.json b/tests/testdata/pile_ubuntu-irc-v1-res.json
index 0e3b1b2597..d5b6788d25 100644
--- a/tests/testdata/pile_ubuntu-irc-v1-res.json
+++ b/tests/testdata/pile_ubuntu-irc-v1-res.json
@@ -1 +1 @@
-{"results": {"pile_ubuntu-irc": {"bits_per_byte": 2.3513498942121155e-06, "byte_perplexity": 1.0000016298328778, "word_perplexity": 1.0000108866656874}}, "versions": {"pile_ubuntu-irc": 1}}
\ No newline at end of file
+{"results": {"pile_ubuntu-irc": {"bits_per_byte": 2.3513498942121155e-06, "byte_perplexity": 1.0000016298328778, "word_perplexity": 1.0000108866656874}}, "versions": {"pile_ubuntu-irc": 1}}
diff --git a/tests/testdata/pile_uspto-v0-loglikelihood_rolling b/tests/testdata/pile_uspto-v0-loglikelihood_rolling
index 4649d3b9b7..64e43b699b 100644
--- a/tests/testdata/pile_uspto-v0-loglikelihood_rolling
+++ b/tests/testdata/pile_uspto-v0-loglikelihood_rolling
@@ -1 +1 @@
-789b2bdb31564d512b70f801316f49320a26c83ba361226bac0afb255341d477
\ No newline at end of file
+789b2bdb31564d512b70f801316f49320a26c83ba361226bac0afb255341d477
diff --git a/tests/testdata/pile_uspto-v0-res.json b/tests/testdata/pile_uspto-v0-res.json
index c13dfc73f5..aefc74637e 100644
--- a/tests/testdata/pile_uspto-v0-res.json
+++ b/tests/testdata/pile_uspto-v0-res.json
@@ -1 +1 @@
-{"results": {"pile_uspto": {"bits_per_byte": 0.00012062434384130924, "byte_perplexity": 1.00012063161925, "word_perplexity": 1.0007716198916954}}, "versions": {"pile_uspto": 0}}
\ No newline at end of file
+{"results": {"pile_uspto": {"bits_per_byte": 0.00012062434384130924, "byte_perplexity": 1.00012063161925, "word_perplexity": 1.0007716198916954}}, "versions": {"pile_uspto": 0}}
diff --git a/tests/testdata/pile_uspto-v1-loglikelihood_rolling b/tests/testdata/pile_uspto-v1-loglikelihood_rolling
index 4649d3b9b7..64e43b699b 100644
--- a/tests/testdata/pile_uspto-v1-loglikelihood_rolling
+++ b/tests/testdata/pile_uspto-v1-loglikelihood_rolling
@@ -1 +1 @@
-789b2bdb31564d512b70f801316f49320a26c83ba361226bac0afb255341d477
\ No newline at end of file
+789b2bdb31564d512b70f801316f49320a26c83ba361226bac0afb255341d477
diff --git a/tests/testdata/pile_uspto-v1-res.json b/tests/testdata/pile_uspto-v1-res.json
index 599ae44ef4..48a5b98029 100644
--- a/tests/testdata/pile_uspto-v1-res.json
+++ b/tests/testdata/pile_uspto-v1-res.json
@@ -1 +1 @@
-{"results": {"pile_uspto": {"bits_per_byte": 0.000174024142670342, "byte_perplexity": 1.00012063161925, "word_perplexity": 1.0007716198916954}}, "versions": {"pile_uspto": 1}}
\ No newline at end of file
+{"results": {"pile_uspto": {"bits_per_byte": 0.000174024142670342, "byte_perplexity": 1.00012063161925, "word_perplexity": 1.0007716198916954}}, "versions": {"pile_uspto": 1}}
diff --git a/tests/testdata/pile_wikipedia-v0-loglikelihood_rolling b/tests/testdata/pile_wikipedia-v0-loglikelihood_rolling
index e44bd27628..1174a961e7 100644
--- a/tests/testdata/pile_wikipedia-v0-loglikelihood_rolling
+++ b/tests/testdata/pile_wikipedia-v0-loglikelihood_rolling
@@ -1 +1 @@
-ef9ec0dd408316ca6537228a6812e839f14b30608973081d41efc47c138338da
\ No newline at end of file
+ef9ec0dd408316ca6537228a6812e839f14b30608973081d41efc47c138338da
diff --git a/tests/testdata/pile_wikipedia-v0-res.json b/tests/testdata/pile_wikipedia-v0-res.json
index bfffde9938..d04bd589a3 100644
--- a/tests/testdata/pile_wikipedia-v0-res.json
+++ b/tests/testdata/pile_wikipedia-v0-res.json
@@ -1 +1 @@
-{"results": {"pile_wikipedia": {"bits_per_byte": 0.00016834722287561703, "byte_perplexity": 1.0001683613940646, "word_perplexity": 1.001084677949439}}, "versions": {"pile_wikipedia": 0}}
\ No newline at end of file
+{"results": {"pile_wikipedia": {"bits_per_byte": 0.00016834722287561703, "byte_perplexity": 1.0001683613940646, "word_perplexity": 1.001084677949439}}, "versions": {"pile_wikipedia": 0}}
diff --git a/tests/testdata/pile_wikipedia-v1-loglikelihood_rolling b/tests/testdata/pile_wikipedia-v1-loglikelihood_rolling
index e44bd27628..1174a961e7 100644
--- a/tests/testdata/pile_wikipedia-v1-loglikelihood_rolling
+++ b/tests/testdata/pile_wikipedia-v1-loglikelihood_rolling
@@ -1 +1 @@
-ef9ec0dd408316ca6537228a6812e839f14b30608973081d41efc47c138338da
\ No newline at end of file
+ef9ec0dd408316ca6537228a6812e839f14b30608973081d41efc47c138338da
diff --git a/tests/testdata/pile_wikipedia-v1-res.json b/tests/testdata/pile_wikipedia-v1-res.json
index 4f2314e66b..9985d55eb3 100644
--- a/tests/testdata/pile_wikipedia-v1-res.json
+++ b/tests/testdata/pile_wikipedia-v1-res.json
@@ -1 +1 @@
-{"results": {"pile_wikipedia": {"bits_per_byte": 0.00024287370359008176, "byte_perplexity": 1.0001683613940646, "word_perplexity": 1.001084677949439}}, "versions": {"pile_wikipedia": 1}}
\ No newline at end of file
+{"results": {"pile_wikipedia": {"bits_per_byte": 0.00024287370359008176, "byte_perplexity": 1.0001683613940646, "word_perplexity": 1.001084677949439}}, "versions": {"pile_wikipedia": 1}}
diff --git a/tests/testdata/pile_youtubesubtitles-v0-loglikelihood_rolling b/tests/testdata/pile_youtubesubtitles-v0-loglikelihood_rolling
index 81c2e5ed06..b5db202c2d 100644
--- a/tests/testdata/pile_youtubesubtitles-v0-loglikelihood_rolling
+++ b/tests/testdata/pile_youtubesubtitles-v0-loglikelihood_rolling
@@ -1 +1 @@
-68263c52adc0086011e2220b619983935cabb1cc1f5f9f8ee1a74ab2a7457967
\ No newline at end of file
+68263c52adc0086011e2220b619983935cabb1cc1f5f9f8ee1a74ab2a7457967
diff --git a/tests/testdata/pile_youtubesubtitles-v0-res.json b/tests/testdata/pile_youtubesubtitles-v0-res.json
index b58ce148f0..4a8a5fdf40 100644
--- a/tests/testdata/pile_youtubesubtitles-v0-res.json
+++ b/tests/testdata/pile_youtubesubtitles-v0-res.json
@@ -1 +1 @@
-{"results": {"pile_youtubesubtitles": {"bits_per_byte": 2.3447170928931888e-05, "byte_perplexity": 1.000023447445816, "word_perplexity": 1.0001529192262875}}, "versions": {"pile_youtubesubtitles": 0}}
\ No newline at end of file
+{"results": {"pile_youtubesubtitles": {"bits_per_byte": 2.3447170928931888e-05, "byte_perplexity": 1.000023447445816, "word_perplexity": 1.0001529192262875}}, "versions": {"pile_youtubesubtitles": 0}}
diff --git a/tests/testdata/pile_youtubesubtitles-v1-loglikelihood_rolling b/tests/testdata/pile_youtubesubtitles-v1-loglikelihood_rolling
index 81c2e5ed06..b5db202c2d 100644
--- a/tests/testdata/pile_youtubesubtitles-v1-loglikelihood_rolling
+++ b/tests/testdata/pile_youtubesubtitles-v1-loglikelihood_rolling
@@ -1 +1 @@
-68263c52adc0086011e2220b619983935cabb1cc1f5f9f8ee1a74ab2a7457967
\ No newline at end of file
+68263c52adc0086011e2220b619983935cabb1cc1f5f9f8ee1a74ab2a7457967
diff --git a/tests/testdata/pile_youtubesubtitles-v1-res.json b/tests/testdata/pile_youtubesubtitles-v1-res.json
index fcf2faa8bc..f4c82da451 100644
--- a/tests/testdata/pile_youtubesubtitles-v1-res.json
+++ b/tests/testdata/pile_youtubesubtitles-v1-res.json
@@ -1 +1 @@
-{"results": {"pile_youtubesubtitles": {"bits_per_byte": 3.3827117222045906e-05, "byte_perplexity": 1.000023447445816, "word_perplexity": 1.0001529192262875}}, "versions": {"pile_youtubesubtitles": 1}}
\ No newline at end of file
+{"results": {"pile_youtubesubtitles": {"bits_per_byte": 3.3827117222045906e-05, "byte_perplexity": 1.000023447445816, "word_perplexity": 1.0001529192262875}}, "versions": {"pile_youtubesubtitles": 1}}
diff --git a/tests/testdata/piqa-v0-loglikelihood b/tests/testdata/piqa-v0-loglikelihood
index b01b1fe5d8..27950464e4 100644
--- a/tests/testdata/piqa-v0-loglikelihood
+++ b/tests/testdata/piqa-v0-loglikelihood
@@ -1 +1 @@
-6048a3a2bb3ad1e6a3d98139618e06b4d7de766edd685bd38837596199c3f69f
\ No newline at end of file
+6048a3a2bb3ad1e6a3d98139618e06b4d7de766edd685bd38837596199c3f69f
diff --git a/tests/testdata/piqa-v0-res.json b/tests/testdata/piqa-v0-res.json
index bb6ebfb9a2..9c06db138d 100644
--- a/tests/testdata/piqa-v0-res.json
+++ b/tests/testdata/piqa-v0-res.json
@@ -1 +1 @@
-{"results": {"piqa": {"acc": 0.514145810663765, "acc_norm": 0.5114254624591947, "acc_norm_stderr": 0.01166277802645167, "acc_stderr": 0.011661154475524836}}, "versions": {"piqa": 0}}
\ No newline at end of file
+{"results": {"piqa": {"acc": 0.514145810663765, "acc_norm": 0.5114254624591947, "acc_norm_stderr": 0.01166277802645167, "acc_stderr": 0.011661154475524836}}, "versions": {"piqa": 0}}
diff --git a/tests/testdata/prost-v0-loglikelihood b/tests/testdata/prost-v0-loglikelihood
index a94b8cdec9..57c5931f49 100644
--- a/tests/testdata/prost-v0-loglikelihood
+++ b/tests/testdata/prost-v0-loglikelihood
@@ -1 +1 @@
-7c475f5b36a8b79f94c2be035441e7fd59dac021b0713b1fc72d256424c70b0b
\ No newline at end of file
+7c475f5b36a8b79f94c2be035441e7fd59dac021b0713b1fc72d256424c70b0b
diff --git a/tests/testdata/prost-v0-res.json b/tests/testdata/prost-v0-res.json
index ff99d83f40..5c2ef8a3a2 100644
--- a/tests/testdata/prost-v0-res.json
+++ b/tests/testdata/prost-v0-res.json
@@ -1 +1 @@
-{"results": {"prost": {"acc": 0.24631725021349274, "acc_norm": 0.2581127241673783, "acc_norm_stderr": 0.00319703079646546, "acc_stderr": 0.003147855968061357}}, "versions": {"prost": 0}}
\ No newline at end of file
+{"results": {"prost": {"acc": 0.24631725021349274, "acc_norm": 0.2581127241673783, "acc_norm_stderr": 0.00319703079646546, "acc_stderr": 0.003147855968061357}}, "versions": {"prost": 0}}
diff --git a/tests/testdata/pubmedqa-v0-loglikelihood b/tests/testdata/pubmedqa-v0-loglikelihood
index 97db87ce2b..2c839c632b 100644
--- a/tests/testdata/pubmedqa-v0-loglikelihood
+++ b/tests/testdata/pubmedqa-v0-loglikelihood
@@ -1 +1 @@
-7a04a1fb1d2b19db84fd15c224015d6c0306a41195a4e71fe6abd48fb4d53b9f
\ No newline at end of file
+7a04a1fb1d2b19db84fd15c224015d6c0306a41195a4e71fe6abd48fb4d53b9f
diff --git a/tests/testdata/pubmedqa-v0-res.json b/tests/testdata/pubmedqa-v0-res.json
index bb39463a4a..75acb8f095 100644
--- a/tests/testdata/pubmedqa-v0-res.json
+++ b/tests/testdata/pubmedqa-v0-res.json
@@ -1 +1 @@
-{"results": {"pubmedqa": {"acc": 0.324, "acc_stderr": 0.01480686473373886}}, "versions": {"pubmedqa": 0}}
\ No newline at end of file
+{"results": {"pubmedqa": {"acc": 0.324, "acc_stderr": 0.01480686473373886}}, "versions": {"pubmedqa": 0}}
diff --git a/tests/testdata/qa4mre_2011-v0-loglikelihood b/tests/testdata/qa4mre_2011-v0-loglikelihood
index 049134c7a1..d030906e56 100644
--- a/tests/testdata/qa4mre_2011-v0-loglikelihood
+++ b/tests/testdata/qa4mre_2011-v0-loglikelihood
@@ -1 +1 @@
-0d09f17c65768e797633494d2d218e4e46a26f718cab8b0bf3d156b073a8c437
\ No newline at end of file
+0d09f17c65768e797633494d2d218e4e46a26f718cab8b0bf3d156b073a8c437
diff --git a/tests/testdata/qa4mre_2011-v0-res.json b/tests/testdata/qa4mre_2011-v0-res.json
index 601c4eb763..44be8afc49 100644
--- a/tests/testdata/qa4mre_2011-v0-res.json
+++ b/tests/testdata/qa4mre_2011-v0-res.json
@@ -1 +1 @@
-{"results": {"qa4mre_2011": {"acc": 0.225, "acc_norm": 0.23333333333333334, "acc_norm_stderr": 0.03877199986918664, "acc_stderr": 0.0382797091741014}}, "versions": {"qa4mre_2011": 0}}
\ No newline at end of file
+{"results": {"qa4mre_2011": {"acc": 0.225, "acc_norm": 0.23333333333333334, "acc_norm_stderr": 0.03877199986918664, "acc_stderr": 0.0382797091741014}}, "versions": {"qa4mre_2011": 0}}
diff --git a/tests/testdata/qa4mre_2012-v0-loglikelihood b/tests/testdata/qa4mre_2012-v0-loglikelihood
index 0e67fac5f7..dab729c1bb 100644
--- a/tests/testdata/qa4mre_2012-v0-loglikelihood
+++ b/tests/testdata/qa4mre_2012-v0-loglikelihood
@@ -1 +1 @@
-7e17261820acb365966cb9431d93aec983b14393eaeefbc96e30a11cf58bc6df
\ No newline at end of file
+7e17261820acb365966cb9431d93aec983b14393eaeefbc96e30a11cf58bc6df
diff --git a/tests/testdata/qa4mre_2012-v0-res.json b/tests/testdata/qa4mre_2012-v0-res.json
index 91d8f36604..8b6dc364ee 100644
--- a/tests/testdata/qa4mre_2012-v0-res.json
+++ b/tests/testdata/qa4mre_2012-v0-res.json
@@ -1 +1 @@
-{"results": {"qa4mre_2012": {"acc": 0.15625, "acc_norm": 0.16875, "acc_norm_stderr": 0.029702236908328808, "acc_stderr": 0.02879508360159146}}, "versions": {"qa4mre_2012": 0}}
\ No newline at end of file
+{"results": {"qa4mre_2012": {"acc": 0.15625, "acc_norm": 0.16875, "acc_norm_stderr": 0.029702236908328808, "acc_stderr": 0.02879508360159146}}, "versions": {"qa4mre_2012": 0}}
0.15625, "acc_norm": 0.16875, "acc_norm_stderr": 0.029702236908328808, "acc_stderr": 0.02879508360159146}}, "versions": {"qa4mre_2012": 0}} diff --git a/tests/testdata/qa4mre_2013-v0-loglikelihood b/tests/testdata/qa4mre_2013-v0-loglikelihood index 43243706d9..59c31926de 100644 --- a/tests/testdata/qa4mre_2013-v0-loglikelihood +++ b/tests/testdata/qa4mre_2013-v0-loglikelihood @@ -1 +1 @@ -52fc431e94c67f983e28ebc70cf45e6c14116b0ae77dc1bf22347c705a65d054 \ No newline at end of file +52fc431e94c67f983e28ebc70cf45e6c14116b0ae77dc1bf22347c705a65d054 diff --git a/tests/testdata/qa4mre_2013-v0-res.json b/tests/testdata/qa4mre_2013-v0-res.json index c87e487e9a..e0f65ff125 100644 --- a/tests/testdata/qa4mre_2013-v0-res.json +++ b/tests/testdata/qa4mre_2013-v0-res.json @@ -1 +1 @@ -{"results": {"qa4mre_2013": {"acc": 0.18309859154929578, "acc_norm": 0.22183098591549297, "acc_norm_stderr": 0.02469760575535269, "acc_stderr": 0.022989742475464973}}, "versions": {"qa4mre_2013": 0}} \ No newline at end of file +{"results": {"qa4mre_2013": {"acc": 0.18309859154929578, "acc_norm": 0.22183098591549297, "acc_norm_stderr": 0.02469760575535269, "acc_stderr": 0.022989742475464973}}, "versions": {"qa4mre_2013": 0}} diff --git a/tests/testdata/qnli-v0-loglikelihood b/tests/testdata/qnli-v0-loglikelihood index 883202c385..bb6554ad5c 100644 --- a/tests/testdata/qnli-v0-loglikelihood +++ b/tests/testdata/qnli-v0-loglikelihood @@ -1 +1 @@ -4281d4ff5cf1244358b0ea0220c67863c69fbade850696b43e8ff05138e01e12 \ No newline at end of file +4281d4ff5cf1244358b0ea0220c67863c69fbade850696b43e8ff05138e01e12 diff --git a/tests/testdata/qnli-v0-res.json b/tests/testdata/qnli-v0-res.json index 31c3097605..64d731ccfe 100644 --- a/tests/testdata/qnli-v0-res.json +++ b/tests/testdata/qnli-v0-res.json @@ -1 +1 @@ -{"results": {"qnli": {"acc": 0.5108914515833791, "acc_stderr": 0.00676380528502966}}, "versions": {"qnli": 0}} \ No newline at end of file +{"results": {"qnli": {"acc": 0.5108914515833791, "acc_stderr": 0.00676380528502966}}, "versions": {"qnli": 0}} diff --git a/tests/testdata/qqp-v0-loglikelihood b/tests/testdata/qqp-v0-loglikelihood index ecc86dc396..30e142b2a0 100644 --- a/tests/testdata/qqp-v0-loglikelihood +++ b/tests/testdata/qqp-v0-loglikelihood @@ -1 +1 @@ -97b551b0fc3d239aad4929a2e8e79c986891aefd9fcd19441fea0382d507889e \ No newline at end of file +97b551b0fc3d239aad4929a2e8e79c986891aefd9fcd19441fea0382d507889e diff --git a/tests/testdata/qqp-v0-res.json b/tests/testdata/qqp-v0-res.json index b7b31355e6..ebf8ada06f 100644 --- a/tests/testdata/qqp-v0-res.json +++ b/tests/testdata/qqp-v0-res.json @@ -1 +1 @@ -{"results": {"qqp": {"acc": 0.49782339846648527, "acc_stderr": 0.0024866770696239894, "f1": 0.42322661288031593, "f1_stderr": 0.002695903831328166}}, "versions": {"qqp": 0}} \ No newline at end of file +{"results": {"qqp": {"acc": 0.49782339846648527, "acc_stderr": 0.0024866770696239894, "f1": 0.42322661288031593, "f1_stderr": 0.002695903831328166}}, "versions": {"qqp": 0}} diff --git a/tests/testdata/race-v0-loglikelihood b/tests/testdata/race-v0-loglikelihood index 5fe1ce356b..ddc6e6e9e7 100644 --- a/tests/testdata/race-v0-loglikelihood +++ b/tests/testdata/race-v0-loglikelihood @@ -1 +1 @@ -bdfdfab7fa1c7af0c1e161785e347b1b8071a15cbf971f6f2a9ae8c8e845199f \ No newline at end of file +bdfdfab7fa1c7af0c1e161785e347b1b8071a15cbf971f6f2a9ae8c8e845199f diff --git a/tests/testdata/race-v0-res.json b/tests/testdata/race-v0-res.json index 017b00669b..e70be2cd10 100644 --- a/tests/testdata/race-v0-res.json +++ 
@@ -1 +1 @@
-{"results": {"race": {"acc": 0.23253588516746412, "acc_stderr": 0.013074460615265295}}, "versions": {"race": 0}}
\ No newline at end of file
+{"results": {"race": {"acc": 0.23253588516746412, "acc_stderr": 0.013074460615265295}}, "versions": {"race": 0}}
diff --git a/tests/testdata/random_insertion-v0-greedy_until b/tests/testdata/random_insertion-v0-greedy_until
index 4844e5393b..11a07276b4 100644
--- a/tests/testdata/random_insertion-v0-greedy_until
+++ b/tests/testdata/random_insertion-v0-greedy_until
@@ -1 +1 @@
-6c48baa6924f3635120f33062251c4b571b3d4e9fe46b14d91f54ddd1c857997
\ No newline at end of file
+6c48baa6924f3635120f33062251c4b571b3d4e9fe46b14d91f54ddd1c857997
diff --git a/tests/testdata/random_insertion-v0-res.json b/tests/testdata/random_insertion-v0-res.json
index 9b5f507f67..be1ac2fb3a 100644
--- a/tests/testdata/random_insertion-v0-res.json
+++ b/tests/testdata/random_insertion-v0-res.json
@@ -1 +1 @@
-{"results": {"random_insertion": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"random_insertion": 0}}
\ No newline at end of file
+{"results": {"random_insertion": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"random_insertion": 0}}
diff --git a/tests/testdata/record-v0-loglikelihood b/tests/testdata/record-v0-loglikelihood
index a54fa05cd1..dbd898d2a8 100644
--- a/tests/testdata/record-v0-loglikelihood
+++ b/tests/testdata/record-v0-loglikelihood
@@ -1 +1 @@
-a3e378fbde4e28f375cac1561bbfc7d7673c2af193628a774ad012d5192393aa
\ No newline at end of file
+a3e378fbde4e28f375cac1561bbfc7d7673c2af193628a774ad012d5192393aa
diff --git a/tests/testdata/record-v0-res.json b/tests/testdata/record-v0-res.json
index 006c381372..0712b6a69d 100644
--- a/tests/testdata/record-v0-res.json
+++ b/tests/testdata/record-v0-res.json
@@ -1 +1 @@
-{"results": {"record": {"em": 0.1521, "em_stderr": 0.0035913575128186616, "f1": 0.1581870634920636, "f1_stderr": 0.0036146895141474576}}, "versions": {"record": 0}}
\ No newline at end of file
+{"results": {"record": {"em": 0.1521, "em_stderr": 0.0035913575128186616, "f1": 0.1581870634920636, "f1_stderr": 0.0036146895141474576}}, "versions": {"record": 0}}
diff --git a/tests/testdata/reversed_words-v0-greedy_until b/tests/testdata/reversed_words-v0-greedy_until
index 3f28488a90..633391b66b 100644
--- a/tests/testdata/reversed_words-v0-greedy_until
+++ b/tests/testdata/reversed_words-v0-greedy_until
@@ -1 +1 @@
-1d79fc4f0177f9624a487b9973f4e0e1d3f8404993b419a7b807a690ebbbb290
\ No newline at end of file
+1d79fc4f0177f9624a487b9973f4e0e1d3f8404993b419a7b807a690ebbbb290
diff --git a/tests/testdata/reversed_words-v0-res.json b/tests/testdata/reversed_words-v0-res.json
index 9285ff2694..1349728bdd 100644
--- a/tests/testdata/reversed_words-v0-res.json
+++ b/tests/testdata/reversed_words-v0-res.json
@@ -1 +1 @@
-{"results": {"reversed_words": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"reversed_words": 0}}
\ No newline at end of file
+{"results": {"reversed_words": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"reversed_words": 0}}
diff --git a/tests/testdata/rte-v0-loglikelihood b/tests/testdata/rte-v0-loglikelihood
index c239923e4f..8bed1472c9 100644
--- a/tests/testdata/rte-v0-loglikelihood
+++ b/tests/testdata/rte-v0-loglikelihood
@@ -1 +1 @@
-c80ce13c8c736087f1557f8736d5d318b540ff01e4bb7f55e568890dc8b0393e
\ No newline at end of file
+c80ce13c8c736087f1557f8736d5d318b540ff01e4bb7f55e568890dc8b0393e
diff --git a/tests/testdata/rte-v0-res.json b/tests/testdata/rte-v0-res.json
index 10314dd047..be36edbcad 100644
--- a/tests/testdata/rte-v0-res.json
+++ b/tests/testdata/rte-v0-res.json
@@ -1 +1 @@
-{"results": {"rte": {"acc": 0.5379061371841155, "acc_stderr": 0.030009848912529117}}, "versions": {"rte": 0}}
\ No newline at end of file
+{"results": {"rte": {"acc": 0.5379061371841155, "acc_stderr": 0.030009848912529117}}, "versions": {"rte": 0}}
diff --git a/tests/testdata/sciq-v0-loglikelihood b/tests/testdata/sciq-v0-loglikelihood
index 25ce988773..577a753162 100644
--- a/tests/testdata/sciq-v0-loglikelihood
+++ b/tests/testdata/sciq-v0-loglikelihood
@@ -1 +1 @@
-71cbb6e2a7ac4512c3761ea801d420eb3fac49d158c7e4deaa3ab8727bea923c
\ No newline at end of file
+71cbb6e2a7ac4512c3761ea801d420eb3fac49d158c7e4deaa3ab8727bea923c
diff --git a/tests/testdata/sciq-v0-res.json b/tests/testdata/sciq-v0-res.json
index 7071515827..334111fe02 100644
--- a/tests/testdata/sciq-v0-res.json
+++ b/tests/testdata/sciq-v0-res.json
@@ -1 +1 @@
-{"results": {"sciq": {"acc": 0.234, "acc_norm": 0.239, "acc_norm_stderr": 0.01349300044693758, "acc_stderr": 0.01339490288966001}}, "versions": {"sciq": 0}}
\ No newline at end of file
+{"results": {"sciq": {"acc": 0.234, "acc_norm": 0.239, "acc_norm_stderr": 0.01349300044693758, "acc_stderr": 0.01339490288966001}}, "versions": {"sciq": 0}}
diff --git a/tests/testdata/squad2-v0-greedy_until b/tests/testdata/squad2-v0-greedy_until
index 024652e0a3..8160d9e4f2 100644
--- a/tests/testdata/squad2-v0-greedy_until
+++ b/tests/testdata/squad2-v0-greedy_until
@@ -1 +1 @@
-b261e8885c11750ce6911bb11e8693de03d53758297c26fb14cfc1ef508862cb
\ No newline at end of file
+b261e8885c11750ce6911bb11e8693de03d53758297c26fb14cfc1ef508862cb
diff --git a/tests/testdata/squad2-v0-loglikelihood b/tests/testdata/squad2-v0-loglikelihood
index 41300bc19f..6b95d6c0c1 100644
--- a/tests/testdata/squad2-v0-loglikelihood
+++ b/tests/testdata/squad2-v0-loglikelihood
@@ -1 +1 @@
-287e87cc6878debcc80d9b6df4e2d0a74ed29068e0e0a80906c8441843a17cee
\ No newline at end of file
+287e87cc6878debcc80d9b6df4e2d0a74ed29068e0e0a80906c8441843a17cee
diff --git a/tests/testdata/squad2-v0-res.json b/tests/testdata/squad2-v0-res.json
index 2b370553ac..a813f14bda 100644
--- a/tests/testdata/squad2-v0-res.json
+++ b/tests/testdata/squad2-v0-res.json
@@ -1 +1 @@
-{"results": {"squad2": {"HasAns_exact": 0.0, "HasAns_f1": 0.0, "NoAns_exact": 0.0, "NoAns_f1": 0.0, "best_exact": 50.07159100480081, "best_f1": 50.07159100480081, "exact": 0.0, "f1": 0.0}}, "versions": {"squad2": 0}}
\ No newline at end of file
+{"results": {"squad2": {"HasAns_exact": 0.0, "HasAns_f1": 0.0, "NoAns_exact": 0.0, "NoAns_f1": 0.0, "best_exact": 50.07159100480081, "best_f1": 50.07159100480081, "exact": 0.0, "f1": 0.0}}, "versions": {"squad2": 0}}
diff --git a/tests/testdata/squad2-v1-greedy_until b/tests/testdata/squad2-v1-greedy_until
index 70df2fd6ae..caf8511cf1 100644
--- a/tests/testdata/squad2-v1-greedy_until
+++ b/tests/testdata/squad2-v1-greedy_until
@@ -1 +1 @@
-e17e3d85c1d5adaf2d6b4b752c4babc2e0b3a6e144e6de70cb3b2287e85109b8
\ No newline at end of file
+e17e3d85c1d5adaf2d6b4b752c4babc2e0b3a6e144e6de70cb3b2287e85109b8
diff --git a/tests/testdata/squad2-v1-loglikelihood b/tests/testdata/squad2-v1-loglikelihood
index 2c970f7583..fa1d967cf8 100644
--- a/tests/testdata/squad2-v1-loglikelihood
+++ b/tests/testdata/squad2-v1-loglikelihood
@@ -1 +1 @@
-f5da6173402b274dc89130755c222c6ca6b2a3bacaaa4e4ab07be9322b7bad65
\ No newline at end of file
+f5da6173402b274dc89130755c222c6ca6b2a3bacaaa4e4ab07be9322b7bad65
diff --git a/tests/testdata/squad2-v1-res.json b/tests/testdata/squad2-v1-res.json
index dd69f00abb..e095ea76bf 100644
--- a/tests/testdata/squad2-v1-res.json
+++ b/tests/testdata/squad2-v1-res.json
@@ -1 +1 @@
-{"results": {"squad2": {"HasAns_exact": 0.0, "HasAns_f1": 0.0, "NoAns_exact": 0.0, "NoAns_f1": 0.0, "best_exact": 50.07159100480081, "best_f1": 50.07159100480081, "exact": 0.0, "f1": 0.0}}, "versions": {"squad2": 1}}
\ No newline at end of file
+{"results": {"squad2": {"HasAns_exact": 0.0, "HasAns_f1": 0.0, "NoAns_exact": 0.0, "NoAns_f1": 0.0, "best_exact": 50.07159100480081, "best_f1": 50.07159100480081, "exact": 0.0, "f1": 0.0}}, "versions": {"squad2": 1}}
diff --git a/tests/testdata/sst-v0-loglikelihood b/tests/testdata/sst-v0-loglikelihood
index 52050de16b..f45014645d 100644
--- a/tests/testdata/sst-v0-loglikelihood
+++ b/tests/testdata/sst-v0-loglikelihood
@@ -1 +1 @@
-d2ebe3a63517d1d481aa1513bebe124c57a0904554a1e95f566979cfe67b1a7f
\ No newline at end of file
+d2ebe3a63517d1d481aa1513bebe124c57a0904554a1e95f566979cfe67b1a7f
diff --git a/tests/testdata/sst-v0-res.json b/tests/testdata/sst-v0-res.json
index 5fe3c62a20..a21393bcde 100644
--- a/tests/testdata/sst-v0-res.json
+++ b/tests/testdata/sst-v0-res.json
@@ -1 +1 @@
-{"results": {"sst": {"acc": 0.5172018348623854, "acc_stderr": 0.016931824425903734}}, "versions": {"sst": 0}}
\ No newline at end of file
+{"results": {"sst": {"acc": 0.5172018348623854, "acc_stderr": 0.016931824425903734}}, "versions": {"sst": 0}}
diff --git a/tests/testdata/swag-v0-loglikelihood b/tests/testdata/swag-v0-loglikelihood
index c8152027dc..861cb0d72a 100644
--- a/tests/testdata/swag-v0-loglikelihood
+++ b/tests/testdata/swag-v0-loglikelihood
@@ -1 +1 @@
-be4fcbad876124c4ba3c71970538a97fec0e36a9cc677c70b6c9243a7bcee0ec
\ No newline at end of file
+be4fcbad876124c4ba3c71970538a97fec0e36a9cc677c70b6c9243a7bcee0ec
diff --git a/tests/testdata/swag-v0-res.json b/tests/testdata/swag-v0-res.json
index a1aeee972e..f31caf72ef 100644
--- a/tests/testdata/swag-v0-res.json
+++ b/tests/testdata/swag-v0-res.json
@@ -1 +1 @@
-{"results": {"swag": {"acc": 0.2482255323402979, "acc_norm": 0.24882535239428172, "acc_norm_stderr": 0.00305666959496067, "acc_stderr": 0.003054201832644171}}, "versions": {"swag": 0}}
\ No newline at end of file
+{"results": {"swag": {"acc": 0.2482255323402979, "acc_norm": 0.24882535239428172, "acc_norm_stderr": 0.00305666959496067, "acc_stderr": 0.003054201832644171}}, "versions": {"swag": 0}}
diff --git a/tests/testdata/triviaqa-v0-loglikelihood b/tests/testdata/triviaqa-v0-loglikelihood
index d576c4977f..0a28fcfa69 100644
--- a/tests/testdata/triviaqa-v0-loglikelihood
+++ b/tests/testdata/triviaqa-v0-loglikelihood
@@ -1 +1 @@
-f8ec05b306b9f6187c0f8117cae441fb85a7a2e4670f4f9a1a3b632b1978421a
\ No newline at end of file
+f8ec05b306b9f6187c0f8117cae441fb85a7a2e4670f4f9a1a3b632b1978421a
diff --git a/tests/testdata/triviaqa-v0-res.json b/tests/testdata/triviaqa-v0-res.json
index ab98847da6..dba83d2c40 100644
--- a/tests/testdata/triviaqa-v0-res.json
+++ b/tests/testdata/triviaqa-v0-res.json
@@ -1 +1 @@
-{"results": {"triviaqa": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"triviaqa": 0}}
\ No newline at end of file
+{"results": {"triviaqa": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"triviaqa": 0}}
diff --git a/tests/testdata/truthfulqa_gen-v0-greedy_until b/tests/testdata/truthfulqa_gen-v0-greedy_until
index 52156c8507..08a6b49eee 100644
--- a/tests/testdata/truthfulqa_gen-v0-greedy_until
+++ b/tests/testdata/truthfulqa_gen-v0-greedy_until
@@ -1 +1 @@
-0d7c56e1aa71ffd8f94bde28f6e8dfdd35f7aaadffa0620bd2a27704253d6c14
\ No newline at end of file
+0d7c56e1aa71ffd8f94bde28f6e8dfdd35f7aaadffa0620bd2a27704253d6c14
diff --git a/tests/testdata/truthfulqa_gen-v0-res.json b/tests/testdata/truthfulqa_gen-v0-res.json
index 5e68fa8dc6..24cdb8de49 100644
--- a/tests/testdata/truthfulqa_gen-v0-res.json
+++ b/tests/testdata/truthfulqa_gen-v0-res.json
@@ -1 +1 @@
-{"results": {"truthfulqa_gen": {"bleu_acc": 0.0, "bleu_acc_stderr": 0.0, "bleu_diff": 0.0, "bleu_diff_stderr": 0.0, "bleu_max": 0.0, "bleu_max_stderr": 0.0, "bleurt_acc": 0.8372093023255814, "bleurt_acc_stderr": 0.012923696051772253, "bleurt_diff": 0.13967358205134603, "bleurt_diff_stderr": 0.00532907098769571, "bleurt_max": -1.4402793981454072, "bleurt_max_stderr": 0.0021884846359458963, "rouge1_acc": 0.0, "rouge1_acc_stderr": 0.0, "rouge1_diff": 0.0, "rouge1_diff_stderr": 0.0, "rouge1_max": 0.0, "rouge1_max_stderr": 0.0, "rouge2_acc": 0.0, "rouge2_acc_stderr": 0.0, "rouge2_diff": 0.0, "rouge2_diff_stderr": 0.0, "rouge2_max": 0.0, "rouge2_max_stderr": 0.0, "rougeL_acc": 0.0, "rougeL_acc_stderr": 0.0, "rougeL_diff": 0.0, "rougeL_diff_stderr": 0.0, "rougeL_max": 0.0, "rougeL_max_stderr": 0.0}}, "versions": {"truthfulqa_gen": 0}}
\ No newline at end of file
+{"results": {"truthfulqa_gen": {"bleu_acc": 0.0, "bleu_acc_stderr": 0.0, "bleu_diff": 0.0, "bleu_diff_stderr": 0.0, "bleu_max": 0.0, "bleu_max_stderr": 0.0, "bleurt_acc": 0.8372093023255814, "bleurt_acc_stderr": 0.012923696051772253, "bleurt_diff": 0.13967358205134603, "bleurt_diff_stderr": 0.00532907098769571, "bleurt_max": -1.4402793981454072, "bleurt_max_stderr": 0.0021884846359458963, "rouge1_acc": 0.0, "rouge1_acc_stderr": 0.0, "rouge1_diff": 0.0, "rouge1_diff_stderr": 0.0, "rouge1_max": 0.0, "rouge1_max_stderr": 0.0, "rouge2_acc": 0.0, "rouge2_acc_stderr": 0.0, "rouge2_diff": 0.0, "rouge2_diff_stderr": 0.0, "rouge2_max": 0.0, "rouge2_max_stderr": 0.0, "rougeL_acc": 0.0, "rougeL_acc_stderr": 0.0, "rougeL_diff": 0.0, "rougeL_diff_stderr": 0.0, "rougeL_max": 0.0, "rougeL_max_stderr": 0.0}}, "versions": {"truthfulqa_gen": 0}}
diff --git a/tests/testdata/truthfulqa_gen-v1-greedy_until b/tests/testdata/truthfulqa_gen-v1-greedy_until
index d5261f2213..08cb423cac 100644
--- a/tests/testdata/truthfulqa_gen-v1-greedy_until
+++ b/tests/testdata/truthfulqa_gen-v1-greedy_until
@@ -1 +1 @@
-1a280973bbac2b7ac29dd64dddac474fb4749585f7de893483b4034814466c67
\ No newline at end of file
+1a280973bbac2b7ac29dd64dddac474fb4749585f7de893483b4034814466c67
diff --git a/tests/testdata/truthfulqa_gen-v1-res.json b/tests/testdata/truthfulqa_gen-v1-res.json
index 30aa72f2ba..b932ddc30b 100644
--- a/tests/testdata/truthfulqa_gen-v1-res.json
+++ b/tests/testdata/truthfulqa_gen-v1-res.json
@@ -1 +1 @@
-{"results": {"truthfulqa_gen": {"bleu_acc": 0.0, "bleu_acc_stderr": 0.0, "bleu_diff": 0.0, "bleu_diff_stderr": 0.0, "bleu_max": 0.0, "bleu_max_stderr": 0.0, "bleurt_acc": 0.835985312117503, "bleurt_acc_stderr": 0.012962704327492454, "bleurt_diff": 0.14077322143090107, "bleurt_diff_stderr": 0.005459888909582694, "bleurt_max": -1.4399358725752065, "bleurt_max_stderr": 0.0022126992369197133, "rouge1_acc": 0.0, "rouge1_acc_stderr": 0.0, "rouge1_diff": 0.0, "rouge1_diff_stderr": 0.0, "rouge1_max": 0.0, "rouge1_max_stderr": 0.0, "rouge2_acc": 0.0, "rouge2_acc_stderr": 0.0, "rouge2_diff": 0.0, "rouge2_diff_stderr": 0.0, "rouge2_max": 0.0, "rouge2_max_stderr": 0.0, "rougeL_acc": 0.0, "rougeL_acc_stderr": 0.0, "rougeL_diff": 0.0, "rougeL_diff_stderr": 0.0, "rougeL_max": 0.0, "rougeL_max_stderr": 0.0}}, "versions": {"truthfulqa_gen": 1}}
0.0, "rougeL_max_stderr": 0.0}}, "versions": {"truthfulqa_gen": 1}} \ No newline at end of file +{"results": {"truthfulqa_gen": {"bleu_acc": 0.0, "bleu_acc_stderr": 0.0, "bleu_diff": 0.0, "bleu_diff_stderr": 0.0, "bleu_max": 0.0, "bleu_max_stderr": 0.0, "bleurt_acc": 0.835985312117503, "bleurt_acc_stderr": 0.012962704327492454, "bleurt_diff": 0.14077322143090107, "bleurt_diff_stderr": 0.005459888909582694, "bleurt_max": -1.4399358725752065, "bleurt_max_stderr": 0.0022126992369197133, "rouge1_acc": 0.0, "rouge1_acc_stderr": 0.0, "rouge1_diff": 0.0, "rouge1_diff_stderr": 0.0, "rouge1_max": 0.0, "rouge1_max_stderr": 0.0, "rouge2_acc": 0.0, "rouge2_acc_stderr": 0.0, "rouge2_diff": 0.0, "rouge2_diff_stderr": 0.0, "rouge2_max": 0.0, "rouge2_max_stderr": 0.0, "rougeL_acc": 0.0, "rougeL_acc_stderr": 0.0, "rougeL_diff": 0.0, "rougeL_diff_stderr": 0.0, "rougeL_max": 0.0, "rougeL_max_stderr": 0.0}}, "versions": {"truthfulqa_gen": 1}} diff --git a/tests/testdata/truthfulqa_mc-v0-loglikelihood b/tests/testdata/truthfulqa_mc-v0-loglikelihood index 51303977a9..9aedee2285 100644 --- a/tests/testdata/truthfulqa_mc-v0-loglikelihood +++ b/tests/testdata/truthfulqa_mc-v0-loglikelihood @@ -1 +1 @@ -226a6783976177dc9ceda5688623ff37023242eff30ddf270b886bf7b9b32228 \ No newline at end of file +226a6783976177dc9ceda5688623ff37023242eff30ddf270b886bf7b9b32228 diff --git a/tests/testdata/truthfulqa_mc-v0-res.json b/tests/testdata/truthfulqa_mc-v0-res.json index b12b4765cc..56247084c8 100644 --- a/tests/testdata/truthfulqa_mc-v0-res.json +++ b/tests/testdata/truthfulqa_mc-v0-res.json @@ -1 +1 @@ -{"results": {"truthfulqa_mc": {"mc1": 0.2141982864137087, "mc1_stderr": 0.01436214815569045, "mc2": 0.465436996173817, "mc2_stderr": 0.0048422530880316405}}, "versions": {"truthfulqa_mc": 0}} \ No newline at end of file +{"results": {"truthfulqa_mc": {"mc1": 0.2141982864137087, "mc1_stderr": 0.01436214815569045, "mc2": 0.465436996173817, "mc2_stderr": 0.0048422530880316405}}, "versions": {"truthfulqa_mc": 0}} diff --git a/tests/testdata/truthfulqa_mc-v1-loglikelihood b/tests/testdata/truthfulqa_mc-v1-loglikelihood index 4bab2d1f4d..f43b62372c 100644 --- a/tests/testdata/truthfulqa_mc-v1-loglikelihood +++ b/tests/testdata/truthfulqa_mc-v1-loglikelihood @@ -1 +1 @@ -1e07020e9cf41d46ed65312eb39d2b8e6599673d4f0d6b67c0d0eba0efb493bb \ No newline at end of file +1e07020e9cf41d46ed65312eb39d2b8e6599673d4f0d6b67c0d0eba0efb493bb diff --git a/tests/testdata/truthfulqa_mc-v1-res.json b/tests/testdata/truthfulqa_mc-v1-res.json index c1b1854c2e..bf29a1b958 100644 --- a/tests/testdata/truthfulqa_mc-v1-res.json +++ b/tests/testdata/truthfulqa_mc-v1-res.json @@ -1 +1 @@ -{"results": {"truthfulqa_mc": {"mc1": 0.23255813953488372, "mc1_stderr": 0.01478915753108052, "mc2": 0.4462325560722362, "mc2_stderr": 0.004986523944692003}}, "versions": {"truthfulqa_mc": 1}} \ No newline at end of file +{"results": {"truthfulqa_mc": {"mc1": 0.23255813953488372, "mc1_stderr": 0.01478915753108052, "mc2": 0.4462325560722362, "mc2_stderr": 0.004986523944692003}}, "versions": {"truthfulqa_mc": 1}} diff --git a/tests/testdata/webqs-v0-loglikelihood b/tests/testdata/webqs-v0-loglikelihood index 4d604d438d..201bf657dc 100644 --- a/tests/testdata/webqs-v0-loglikelihood +++ b/tests/testdata/webqs-v0-loglikelihood @@ -1 +1 @@ -96b218173468cc94552a0b946193bda89faba51f1bfc3e7945531f9dff8d6fe9 \ No newline at end of file +96b218173468cc94552a0b946193bda89faba51f1bfc3e7945531f9dff8d6fe9 diff --git a/tests/testdata/webqs-v0-res.json b/tests/testdata/webqs-v0-res.json 
index 9f0fdc76ca..a9778832f6 100644
--- a/tests/testdata/webqs-v0-res.json
+++ b/tests/testdata/webqs-v0-res.json
@@ -1 +1 @@
-{"results": {"webqs": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"webqs": 0}}
\ No newline at end of file
+{"results": {"webqs": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"webqs": 0}}
diff --git a/tests/testdata/wic-v0-loglikelihood b/tests/testdata/wic-v0-loglikelihood
index d27430a9a2..3f63702ced 100644
--- a/tests/testdata/wic-v0-loglikelihood
+++ b/tests/testdata/wic-v0-loglikelihood
@@ -1 +1 @@
-403a08da05e4c44d7e3dd3358382a7ba489c41d223e24cd1a9ed82ef1a2d004b
\ No newline at end of file
+403a08da05e4c44d7e3dd3358382a7ba489c41d223e24cd1a9ed82ef1a2d004b
diff --git a/tests/testdata/wic-v0-res.json b/tests/testdata/wic-v0-res.json
index eadc573ed3..224f3da0ec 100644
--- a/tests/testdata/wic-v0-res.json
+++ b/tests/testdata/wic-v0-res.json
@@ -1 +1 @@
-{"results": {"wic": {"acc": 0.49216300940438873, "acc_stderr": 0.01980828765781383}}, "versions": {"wic": 0}}
\ No newline at end of file
+{"results": {"wic": {"acc": 0.49216300940438873, "acc_stderr": 0.01980828765781383}}, "versions": {"wic": 0}}
diff --git a/tests/testdata/wikitext-v0-loglikelihood_rolling b/tests/testdata/wikitext-v0-loglikelihood_rolling
index f09af45a38..ee3e5942dc 100644
--- a/tests/testdata/wikitext-v0-loglikelihood_rolling
+++ b/tests/testdata/wikitext-v0-loglikelihood_rolling
@@ -1 +1 @@
-b6f83e6cf7535ee41b0057c3e2ec2cf7f2fa5a9119b305c479a83091d1142b2c
\ No newline at end of file
+b6f83e6cf7535ee41b0057c3e2ec2cf7f2fa5a9119b305c479a83091d1142b2c
diff --git a/tests/testdata/wikitext-v0-res.json b/tests/testdata/wikitext-v0-res.json
index 9ac0c37bb5..2c3aa13525 100644
--- a/tests/testdata/wikitext-v0-res.json
+++ b/tests/testdata/wikitext-v0-res.json
@@ -1 +1 @@
-{"results": {"wikitext": {"bits_per_byte": 2.219817611605802e-05, "byte_perplexity": 1.0000221984224973, "word_perplexity": 1.000118710696617}}, "versions": {"wikitext": 0}}
\ No newline at end of file
+{"results": {"wikitext": {"bits_per_byte": 2.219817611605802e-05, "byte_perplexity": 1.0000221984224973, "word_perplexity": 1.000118710696617}}, "versions": {"wikitext": 0}}
diff --git a/tests/testdata/wikitext-v1-loglikelihood_rolling b/tests/testdata/wikitext-v1-loglikelihood_rolling
index f09af45a38..ee3e5942dc 100644
--- a/tests/testdata/wikitext-v1-loglikelihood_rolling
+++ b/tests/testdata/wikitext-v1-loglikelihood_rolling
@@ -1 +1 @@
-b6f83e6cf7535ee41b0057c3e2ec2cf7f2fa5a9119b305c479a83091d1142b2c
\ No newline at end of file
+b6f83e6cf7535ee41b0057c3e2ec2cf7f2fa5a9119b305c479a83091d1142b2c
diff --git a/tests/testdata/wikitext-v1-res.json b/tests/testdata/wikitext-v1-res.json
index 122098aec2..8e8e1fbd7a 100644
--- a/tests/testdata/wikitext-v1-res.json
+++ b/tests/testdata/wikitext-v1-res.json
@@ -1 +1 @@
-{"results": {"wikitext": {"bits_per_byte": 3.202519859941674e-05, "byte_perplexity": 1.0000221984224973, "word_perplexity": 1.000118710696617}}, "versions": {"wikitext": 1}}
\ No newline at end of file
+{"results": {"wikitext": {"bits_per_byte": 3.202519859941674e-05, "byte_perplexity": 1.0000221984224973, "word_perplexity": 1.000118710696617}}, "versions": {"wikitext": 1}}
diff --git a/tests/testdata/winogrande-v0-loglikelihood b/tests/testdata/winogrande-v0-loglikelihood
index 97866f6ce4..b9405d7a2f 100644
--- a/tests/testdata/winogrande-v0-loglikelihood
+++ b/tests/testdata/winogrande-v0-loglikelihood
@@ -1 +1 @@
-90a3eff49de9173964d46f5ed57bcf9a78a72dd1bfe0e5323b25cebb40b49ea9
\ No newline at end of file
+90a3eff49de9173964d46f5ed57bcf9a78a72dd1bfe0e5323b25cebb40b49ea9 diff --git a/tests/testdata/winogrande-v0-res.json b/tests/testdata/winogrande-v0-res.json index 9fa7903a56..cac4dc632a 100644 --- a/tests/testdata/winogrande-v0-res.json +++ b/tests/testdata/winogrande-v0-res.json @@ -1 +1 @@ -{"results": {"winogrande": {"acc": 0.516179952644041, "acc_stderr": 0.014045126130978606}}, "versions": {"winogrande": 0}} \ No newline at end of file +{"results": {"winogrande": {"acc": 0.516179952644041, "acc_stderr": 0.014045126130978606}}, "versions": {"winogrande": 0}} diff --git a/tests/testdata/wmt14-en-fr-v0-greedy_until b/tests/testdata/wmt14-en-fr-v0-greedy_until index 6d48d5579e..73c0f39eaa 100644 --- a/tests/testdata/wmt14-en-fr-v0-greedy_until +++ b/tests/testdata/wmt14-en-fr-v0-greedy_until @@ -1 +1 @@ -368ae7eec0f902b5123f2d5197caa5109a23942011c53fe68d9eaeee20180e46 \ No newline at end of file +368ae7eec0f902b5123f2d5197caa5109a23942011c53fe68d9eaeee20180e46 diff --git a/tests/testdata/wmt14-en-fr-v0-res.json b/tests/testdata/wmt14-en-fr-v0-res.json index 1aa13f0285..b175c5405c 100644 --- a/tests/testdata/wmt14-en-fr-v0-res.json +++ b/tests/testdata/wmt14-en-fr-v0-res.json @@ -1 +1 @@ -{"results": {"wmt14-en-fr": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.011284118461117099, "chrf_stderr": 7.340651275964445e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt14-en-fr": 0}} \ No newline at end of file +{"results": {"wmt14-en-fr": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.011284118461117099, "chrf_stderr": 7.340651275964445e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt14-en-fr": 0}} diff --git a/tests/testdata/wmt14-fr-en-v0-greedy_until b/tests/testdata/wmt14-fr-en-v0-greedy_until index 7249d39990..ac8f4b4361 100644 --- a/tests/testdata/wmt14-fr-en-v0-greedy_until +++ b/tests/testdata/wmt14-fr-en-v0-greedy_until @@ -1 +1 @@ -c1d9f7283755fbdd7ecd6cc4278b0ac25a80ac256b7071ea5f839ccd038e5974 \ No newline at end of file +c1d9f7283755fbdd7ecd6cc4278b0ac25a80ac256b7071ea5f839ccd038e5974 diff --git a/tests/testdata/wmt14-fr-en-v0-res.json b/tests/testdata/wmt14-fr-en-v0-res.json index 5261876f55..f327e96164 100644 --- a/tests/testdata/wmt14-fr-en-v0-res.json +++ b/tests/testdata/wmt14-fr-en-v0-res.json @@ -1 +1 @@ -{"results": {"wmt14-fr-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.01275083169440515, "chrf_stderr": 8.45474998563806e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt14-fr-en": 0}} \ No newline at end of file +{"results": {"wmt14-fr-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.01275083169440515, "chrf_stderr": 8.45474998563806e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt14-fr-en": 0}} diff --git a/tests/testdata/wmt16-de-en-v0-greedy_until b/tests/testdata/wmt16-de-en-v0-greedy_until index 75f1072b6e..ecdb510f51 100644 --- a/tests/testdata/wmt16-de-en-v0-greedy_until +++ b/tests/testdata/wmt16-de-en-v0-greedy_until @@ -1 +1 @@ -d30e23e38d9a45b9c31e1dfd14b58d0b7020df4b9c8a1c697aa6bc5fba8ce08a \ No newline at end of file +d30e23e38d9a45b9c31e1dfd14b58d0b7020df4b9c8a1c697aa6bc5fba8ce08a diff --git a/tests/testdata/wmt16-de-en-v0-res.json b/tests/testdata/wmt16-de-en-v0-res.json index 826e0382ab..c2d02476ca 100644 --- a/tests/testdata/wmt16-de-en-v0-res.json +++ b/tests/testdata/wmt16-de-en-v0-res.json @@ -1 +1 @@ -{"results": {"wmt16-de-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.013700416764482968, "chrf_stderr": 0.00016071651360909355, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt16-de-en": 0}} \ No newline at end of file 
+{"results": {"wmt16-de-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.013700416764482968, "chrf_stderr": 0.00016071651360909355, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt16-de-en": 0}} diff --git a/tests/testdata/wmt16-en-de-v0-greedy_until b/tests/testdata/wmt16-en-de-v0-greedy_until index 45eaaaca8c..0fdfd2f668 100644 --- a/tests/testdata/wmt16-en-de-v0-greedy_until +++ b/tests/testdata/wmt16-en-de-v0-greedy_until @@ -1 +1 @@ -d71e2074af3770e9b29ac561caf2e1c29ad6b0dc50ec2e7bcc5501747b11f0da \ No newline at end of file +d71e2074af3770e9b29ac561caf2e1c29ad6b0dc50ec2e7bcc5501747b11f0da diff --git a/tests/testdata/wmt16-en-de-v0-res.json b/tests/testdata/wmt16-en-de-v0-res.json index 88bee7ffa6..9facc33e24 100644 --- a/tests/testdata/wmt16-en-de-v0-res.json +++ b/tests/testdata/wmt16-en-de-v0-res.json @@ -1 +1 @@ -{"results": {"wmt16-en-de": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.010909486120840577, "chrf_stderr": 0.000122611124711072, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt16-en-de": 0}} \ No newline at end of file +{"results": {"wmt16-en-de": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.010909486120840577, "chrf_stderr": 0.000122611124711072, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt16-en-de": 0}} diff --git a/tests/testdata/wmt16-en-ro-v0-greedy_until b/tests/testdata/wmt16-en-ro-v0-greedy_until index 291492556e..0dd42926c0 100644 --- a/tests/testdata/wmt16-en-ro-v0-greedy_until +++ b/tests/testdata/wmt16-en-ro-v0-greedy_until @@ -1 +1 @@ -4be7fdda313394f19b5995b00ada1dfa3bb158ee1f020ef8d07ecea260fa60b2 \ No newline at end of file +4be7fdda313394f19b5995b00ada1dfa3bb158ee1f020ef8d07ecea260fa60b2 diff --git a/tests/testdata/wmt16-en-ro-v0-res.json b/tests/testdata/wmt16-en-ro-v0-res.json index babb8d2d74..878c584f63 100644 --- a/tests/testdata/wmt16-en-ro-v0-res.json +++ b/tests/testdata/wmt16-en-ro-v0-res.json @@ -1 +1 @@ -{"results": {"wmt16-en-ro": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.012004814364156886, "chrf_stderr": 6.424423961332661e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt16-en-ro": 0}} \ No newline at end of file +{"results": {"wmt16-en-ro": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.012004814364156886, "chrf_stderr": 6.424423961332661e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt16-en-ro": 0}} diff --git a/tests/testdata/wmt16-ro-en-v0-greedy_until b/tests/testdata/wmt16-ro-en-v0-greedy_until index fbcac1b7e3..745a5fdd81 100644 --- a/tests/testdata/wmt16-ro-en-v0-greedy_until +++ b/tests/testdata/wmt16-ro-en-v0-greedy_until @@ -1 +1 @@ -d1b7c50751b0d5d7470b7f49f2bab9d09792c91460fc92cc34f06617013d7c65 \ No newline at end of file +d1b7c50751b0d5d7470b7f49f2bab9d09792c91460fc92cc34f06617013d7c65 diff --git a/tests/testdata/wmt16-ro-en-v0-res.json b/tests/testdata/wmt16-ro-en-v0-res.json index 267763793d..415aece638 100644 --- a/tests/testdata/wmt16-ro-en-v0-res.json +++ b/tests/testdata/wmt16-ro-en-v0-res.json @@ -1 +1 @@ -{"results": {"wmt16-ro-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.01262029828861831, "chrf_stderr": 0.00014507496111350828, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt16-ro-en": 0}} \ No newline at end of file +{"results": {"wmt16-ro-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.01262029828861831, "chrf_stderr": 0.00014507496111350828, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt16-ro-en": 0}} diff --git a/tests/testdata/wmt20-cs-en-v0-greedy_until b/tests/testdata/wmt20-cs-en-v0-greedy_until index 7bcf240b70..4d0ddb177a 100644 --- a/tests/testdata/wmt20-cs-en-v0-greedy_until 
+++ b/tests/testdata/wmt20-cs-en-v0-greedy_until @@ -1 +1 @@ -bfead9efdb1b2402a414c55929c8d8f956585f938a35466931d44e81d89cfe00 \ No newline at end of file +bfead9efdb1b2402a414c55929c8d8f956585f938a35466931d44e81d89cfe00 diff --git a/tests/testdata/wmt20-cs-en-v0-res.json b/tests/testdata/wmt20-cs-en-v0-res.json index 70c80afe5b..27bcea3ed0 100644 --- a/tests/testdata/wmt20-cs-en-v0-res.json +++ b/tests/testdata/wmt20-cs-en-v0-res.json @@ -1 +1 @@ -{"results": {"wmt20-cs-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.006212086270964023, "chrf_stderr": 0.0001119165191795531, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-cs-en": 0}} \ No newline at end of file +{"results": {"wmt20-cs-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.006212086270964023, "chrf_stderr": 0.0001119165191795531, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-cs-en": 0}} diff --git a/tests/testdata/wmt20-de-en-v0-greedy_until b/tests/testdata/wmt20-de-en-v0-greedy_until index c02fb9875d..e102ccfac1 100644 --- a/tests/testdata/wmt20-de-en-v0-greedy_until +++ b/tests/testdata/wmt20-de-en-v0-greedy_until @@ -1 +1 @@ -d13b5a6915ca86ac6c6ebc50d9be0d0be3dfca600c12e896df53190d875de74d \ No newline at end of file +d13b5a6915ca86ac6c6ebc50d9be0d0be3dfca600c12e896df53190d875de74d diff --git a/tests/testdata/wmt20-de-en-v0-res.json b/tests/testdata/wmt20-de-en-v0-res.json index 790424fe4f..36246c0f33 100644 --- a/tests/testdata/wmt20-de-en-v0-res.json +++ b/tests/testdata/wmt20-de-en-v0-res.json @@ -1 +1 @@ -{"results": {"wmt20-de-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.006703243310670055, "chrf_stderr": 0.0001292711927988445, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-de-en": 0}} \ No newline at end of file +{"results": {"wmt20-de-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.006703243310670055, "chrf_stderr": 0.0001292711927988445, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-de-en": 0}} diff --git a/tests/testdata/wmt20-de-fr-v0-greedy_until b/tests/testdata/wmt20-de-fr-v0-greedy_until index 7cb9424082..3b0e21daaf 100644 --- a/tests/testdata/wmt20-de-fr-v0-greedy_until +++ b/tests/testdata/wmt20-de-fr-v0-greedy_until @@ -1 +1 @@ -7f197bc281d6dbf9425900ef0dee7175021c43e355050f149f43b161c52bf0b0 \ No newline at end of file +7f197bc281d6dbf9425900ef0dee7175021c43e355050f149f43b161c52bf0b0 diff --git a/tests/testdata/wmt20-de-fr-v0-res.json b/tests/testdata/wmt20-de-fr-v0-res.json index 79a0d12fe6..820b75bcd1 100644 --- a/tests/testdata/wmt20-de-fr-v0-res.json +++ b/tests/testdata/wmt20-de-fr-v0-res.json @@ -1 +1 @@ -{"results": {"wmt20-de-fr": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.011897164096796364, "chrf_stderr": 0.00010158164726118333, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-de-fr": 0}} \ No newline at end of file +{"results": {"wmt20-de-fr": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.011897164096796364, "chrf_stderr": 0.00010158164726118333, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-de-fr": 0}} diff --git a/tests/testdata/wmt20-en-cs-v0-greedy_until b/tests/testdata/wmt20-en-cs-v0-greedy_until index d14fc4939a..fad767f930 100644 --- a/tests/testdata/wmt20-en-cs-v0-greedy_until +++ b/tests/testdata/wmt20-en-cs-v0-greedy_until @@ -1 +1 @@ -5a34e6863bf6965afd31653de50bac5fecf58db65dbaba46921504a2b7463786 \ No newline at end of file +5a34e6863bf6965afd31653de50bac5fecf58db65dbaba46921504a2b7463786 diff --git a/tests/testdata/wmt20-en-cs-v0-res.json b/tests/testdata/wmt20-en-cs-v0-res.json index 2ba9db70d3..b9998954e3 100644 --- 
a/tests/testdata/wmt20-en-cs-v0-res.json +++ b/tests/testdata/wmt20-en-cs-v0-res.json @@ -1 +1 @@ -{"results": {"wmt20-en-cs": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.009879653442394573, "chrf_stderr": 8.210293331159994e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-en-cs": 0}} \ No newline at end of file +{"results": {"wmt20-en-cs": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.009879653442394573, "chrf_stderr": 8.210293331159994e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-en-cs": 0}} diff --git a/tests/testdata/wmt20-en-de-v0-greedy_until b/tests/testdata/wmt20-en-de-v0-greedy_until index c4078efd99..46a0dc5f51 100644 --- a/tests/testdata/wmt20-en-de-v0-greedy_until +++ b/tests/testdata/wmt20-en-de-v0-greedy_until @@ -1 +1 @@ -b6e9c305766ea23ce1027309f83c6d4c2ce8948d70b63a7858586ca34050d7fb \ No newline at end of file +b6e9c305766ea23ce1027309f83c6d4c2ce8948d70b63a7858586ca34050d7fb diff --git a/tests/testdata/wmt20-en-de-v0-res.json b/tests/testdata/wmt20-en-de-v0-res.json index 183e66270a..78059f723f 100644 --- a/tests/testdata/wmt20-en-de-v0-res.json +++ b/tests/testdata/wmt20-en-de-v0-res.json @@ -1 +1 @@ -{"results": {"wmt20-en-de": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.007148103038872972, "chrf_stderr": 9.594096858911254e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-en-de": 0}} \ No newline at end of file +{"results": {"wmt20-en-de": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.007148103038872972, "chrf_stderr": 9.594096858911254e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-en-de": 0}} diff --git a/tests/testdata/wmt20-en-iu-v0-greedy_until b/tests/testdata/wmt20-en-iu-v0-greedy_until index d26bb4f92a..3039491584 100644 --- a/tests/testdata/wmt20-en-iu-v0-greedy_until +++ b/tests/testdata/wmt20-en-iu-v0-greedy_until @@ -1 +1 @@ -f5688199890a48f73f2cc04a2152e35190f0e0ddd40e629fa24ee39d423ea389 \ No newline at end of file +f5688199890a48f73f2cc04a2152e35190f0e0ddd40e629fa24ee39d423ea389 diff --git a/tests/testdata/wmt20-en-iu-v0-res.json b/tests/testdata/wmt20-en-iu-v0-res.json index 22f042eb4e..72dcfbeea2 100644 --- a/tests/testdata/wmt20-en-iu-v0-res.json +++ b/tests/testdata/wmt20-en-iu-v0-res.json @@ -1 +1 @@ -{"results": {"wmt20-en-iu": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.00011803644548940443, "chrf_stderr": 2.175287038623409e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-en-iu": 0}} \ No newline at end of file +{"results": {"wmt20-en-iu": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.00011803644548940443, "chrf_stderr": 2.175287038623409e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-en-iu": 0}} diff --git a/tests/testdata/wmt20-en-ja-v0-greedy_until b/tests/testdata/wmt20-en-ja-v0-greedy_until index 9777002c79..ba67a22399 100644 --- a/tests/testdata/wmt20-en-ja-v0-greedy_until +++ b/tests/testdata/wmt20-en-ja-v0-greedy_until @@ -1 +1 @@ -7fe61f5847a51e93e97c84b39f4420978727754e4b6cf636a27851c615857530 \ No newline at end of file +7fe61f5847a51e93e97c84b39f4420978727754e4b6cf636a27851c615857530 diff --git a/tests/testdata/wmt20-en-ja-v0-res.json b/tests/testdata/wmt20-en-ja-v0-res.json index 57bad300d7..9b00c0fb16 100644 --- a/tests/testdata/wmt20-en-ja-v0-res.json +++ b/tests/testdata/wmt20-en-ja-v0-res.json @@ -1 +1 @@ -{"results": {"wmt20-en-ja": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 4.1308658294778584e-05, "chrf_stderr": 2.0456539027807417e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-en-ja": 0}} \ No newline at end of file +{"results": {"wmt20-en-ja": {"bleu": 0.0, 
"bleu_stderr": 0.0, "chrf": 4.1308658294778584e-05, "chrf_stderr": 2.0456539027807417e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-en-ja": 0}} diff --git a/tests/testdata/wmt20-en-ja-v1-greedy_until b/tests/testdata/wmt20-en-ja-v1-greedy_until index 9777002c79..ba67a22399 100644 --- a/tests/testdata/wmt20-en-ja-v1-greedy_until +++ b/tests/testdata/wmt20-en-ja-v1-greedy_until @@ -1 +1 @@ -7fe61f5847a51e93e97c84b39f4420978727754e4b6cf636a27851c615857530 \ No newline at end of file +7fe61f5847a51e93e97c84b39f4420978727754e4b6cf636a27851c615857530 diff --git a/tests/testdata/wmt20-en-ja-v1-res.json b/tests/testdata/wmt20-en-ja-v1-res.json index be5e56abcf..8eda5824b7 100644 --- a/tests/testdata/wmt20-en-ja-v1-res.json +++ b/tests/testdata/wmt20-en-ja-v1-res.json @@ -1 +1 @@ -{"results": {"wmt20-en-ja": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 4.1305928226819116e-05, "chrf_stderr": 2.0455354158878388e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-en-ja": 1}} \ No newline at end of file +{"results": {"wmt20-en-ja": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 4.1305928226819116e-05, "chrf_stderr": 2.0455354158878388e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-en-ja": 1}} diff --git a/tests/testdata/wmt20-en-km-v0-greedy_until b/tests/testdata/wmt20-en-km-v0-greedy_until index ddce46a79f..a77668f1ed 100644 --- a/tests/testdata/wmt20-en-km-v0-greedy_until +++ b/tests/testdata/wmt20-en-km-v0-greedy_until @@ -1 +1 @@ -eb5365c46f22ffec9a157991627d6e1fd1117fccffaedfc73619e93bafb5a408 \ No newline at end of file +eb5365c46f22ffec9a157991627d6e1fd1117fccffaedfc73619e93bafb5a408 diff --git a/tests/testdata/wmt20-en-km-v0-res.json b/tests/testdata/wmt20-en-km-v0-res.json index e5ee2e9be9..f9f0799431 100644 --- a/tests/testdata/wmt20-en-km-v0-res.json +++ b/tests/testdata/wmt20-en-km-v0-res.json @@ -1 +1 @@ -{"results": {"wmt20-en-km": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 1.9008351315007364e-05, "chrf_stderr": 7.136657625458525e-06, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-en-km": 0}} \ No newline at end of file +{"results": {"wmt20-en-km": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 1.9008351315007364e-05, "chrf_stderr": 7.136657625458525e-06, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-en-km": 0}} diff --git a/tests/testdata/wmt20-en-pl-v0-greedy_until b/tests/testdata/wmt20-en-pl-v0-greedy_until index bd431d61c4..17d5f4dade 100644 --- a/tests/testdata/wmt20-en-pl-v0-greedy_until +++ b/tests/testdata/wmt20-en-pl-v0-greedy_until @@ -1 +1 @@ -952f02575d4936d93c4d2808d86c4bf5f1f3a0901212acee6cbc1f9cbd30d39e \ No newline at end of file +952f02575d4936d93c4d2808d86c4bf5f1f3a0901212acee6cbc1f9cbd30d39e diff --git a/tests/testdata/wmt20-en-pl-v0-res.json b/tests/testdata/wmt20-en-pl-v0-res.json index 13bfd5b552..2a9f3b9621 100644 --- a/tests/testdata/wmt20-en-pl-v0-res.json +++ b/tests/testdata/wmt20-en-pl-v0-res.json @@ -1 +1 @@ -{"results": {"wmt20-en-pl": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.009006977773147825, "chrf_stderr": 0.00023387733367766675, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-en-pl": 0}} \ No newline at end of file +{"results": {"wmt20-en-pl": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.009006977773147825, "chrf_stderr": 0.00023387733367766675, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-en-pl": 0}} diff --git a/tests/testdata/wmt20-en-ps-v0-greedy_until b/tests/testdata/wmt20-en-ps-v0-greedy_until index 77b600c49a..36570999ff 100644 --- a/tests/testdata/wmt20-en-ps-v0-greedy_until +++ 
b/tests/testdata/wmt20-en-ps-v0-greedy_until @@ -1 +1 @@ -8411c2cb73114cbd0c6e0f17eab2625d486cc3a601105deb0ea1338a401df689 \ No newline at end of file +8411c2cb73114cbd0c6e0f17eab2625d486cc3a601105deb0ea1338a401df689 diff --git a/tests/testdata/wmt20-en-ps-v0-res.json b/tests/testdata/wmt20-en-ps-v0-res.json index fcfb51f053..98cad37e31 100644 --- a/tests/testdata/wmt20-en-ps-v0-res.json +++ b/tests/testdata/wmt20-en-ps-v0-res.json @@ -1 +1 @@ -{"results": {"wmt20-en-ps": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 2.1193813610582323e-06, "chrf_stderr": 2.113911466119111e-06, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-en-ps": 0}} \ No newline at end of file +{"results": {"wmt20-en-ps": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 2.1193813610582323e-06, "chrf_stderr": 2.113911466119111e-06, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-en-ps": 0}} diff --git a/tests/testdata/wmt20-en-ru-v0-greedy_until b/tests/testdata/wmt20-en-ru-v0-greedy_until index d21d39ac9f..ee0ab5524f 100644 --- a/tests/testdata/wmt20-en-ru-v0-greedy_until +++ b/tests/testdata/wmt20-en-ru-v0-greedy_until @@ -1 +1 @@ -a1613831f69c1679a54670092af40ce76617b79d7cc837984803b0fc52bb8bde \ No newline at end of file +a1613831f69c1679a54670092af40ce76617b79d7cc837984803b0fc52bb8bde diff --git a/tests/testdata/wmt20-en-ru-v0-res.json b/tests/testdata/wmt20-en-ru-v0-res.json index af339eda5d..f8da87036c 100644 --- a/tests/testdata/wmt20-en-ru-v0-res.json +++ b/tests/testdata/wmt20-en-ru-v0-res.json @@ -1 +1 @@ -{"results": {"wmt20-en-ru": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.0007327811114614671, "chrf_stderr": 4.43155903515048e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-en-ru": 0}} \ No newline at end of file +{"results": {"wmt20-en-ru": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.0007327811114614671, "chrf_stderr": 4.43155903515048e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-en-ru": 0}} diff --git a/tests/testdata/wmt20-en-ta-v0-greedy_until b/tests/testdata/wmt20-en-ta-v0-greedy_until index 1b40263f15..47a7be0129 100644 --- a/tests/testdata/wmt20-en-ta-v0-greedy_until +++ b/tests/testdata/wmt20-en-ta-v0-greedy_until @@ -1 +1 @@ -5fc556fa90bca7f1b1396e97e392eac8080b0ad53488358799b8fc0b21a94cb1 \ No newline at end of file +5fc556fa90bca7f1b1396e97e392eac8080b0ad53488358799b8fc0b21a94cb1 diff --git a/tests/testdata/wmt20-en-ta-v0-res.json b/tests/testdata/wmt20-en-ta-v0-res.json index b04f968d76..6b121c2e40 100644 --- a/tests/testdata/wmt20-en-ta-v0-res.json +++ b/tests/testdata/wmt20-en-ta-v0-res.json @@ -1 +1 @@ -{"results": {"wmt20-en-ta": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.0, "chrf_stderr": 0.0, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-en-ta": 0}} \ No newline at end of file +{"results": {"wmt20-en-ta": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.0, "chrf_stderr": 0.0, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-en-ta": 0}} diff --git a/tests/testdata/wmt20-en-zh-v0-greedy_until b/tests/testdata/wmt20-en-zh-v0-greedy_until index db79b7f03f..5a2eebe6d8 100644 --- a/tests/testdata/wmt20-en-zh-v0-greedy_until +++ b/tests/testdata/wmt20-en-zh-v0-greedy_until @@ -1 +1 @@ -67f0333ddbcb07d7a9ac12919129a18fe4fea24e4826a11bbdde4fd5ed5ed83f \ No newline at end of file +67f0333ddbcb07d7a9ac12919129a18fe4fea24e4826a11bbdde4fd5ed5ed83f diff --git a/tests/testdata/wmt20-en-zh-v0-res.json b/tests/testdata/wmt20-en-zh-v0-res.json index 24db35e62f..18b5ff551e 100644 --- a/tests/testdata/wmt20-en-zh-v0-res.json +++ b/tests/testdata/wmt20-en-zh-v0-res.json @@ -1 +1 
@@ -{"results": {"wmt20-en-zh": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.00014170297316825535, "chrf_stderr": 6.590669847391838e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-en-zh": 0}} \ No newline at end of file +{"results": {"wmt20-en-zh": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.00014170297316825535, "chrf_stderr": 6.590669847391838e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-en-zh": 0}} diff --git a/tests/testdata/wmt20-en-zh-v1-greedy_until b/tests/testdata/wmt20-en-zh-v1-greedy_until index db79b7f03f..5a2eebe6d8 100644 --- a/tests/testdata/wmt20-en-zh-v1-greedy_until +++ b/tests/testdata/wmt20-en-zh-v1-greedy_until @@ -1 +1 @@ -67f0333ddbcb07d7a9ac12919129a18fe4fea24e4826a11bbdde4fd5ed5ed83f \ No newline at end of file +67f0333ddbcb07d7a9ac12919129a18fe4fea24e4826a11bbdde4fd5ed5ed83f diff --git a/tests/testdata/wmt20-en-zh-v1-res.json b/tests/testdata/wmt20-en-zh-v1-res.json index a7a56daf0e..e06c0dbe96 100644 --- a/tests/testdata/wmt20-en-zh-v1-res.json +++ b/tests/testdata/wmt20-en-zh-v1-res.json @@ -1 +1 @@ -{"results": {"wmt20-en-zh": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.00014170297316825535, "chrf_stderr": 6.590669847391838e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-en-zh": 1}} \ No newline at end of file +{"results": {"wmt20-en-zh": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.00014170297316825535, "chrf_stderr": 6.590669847391838e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-en-zh": 1}} diff --git a/tests/testdata/wmt20-fr-de-v0-greedy_until b/tests/testdata/wmt20-fr-de-v0-greedy_until index 7353ad4475..10b82f1b97 100644 --- a/tests/testdata/wmt20-fr-de-v0-greedy_until +++ b/tests/testdata/wmt20-fr-de-v0-greedy_until @@ -1 +1 @@ -8a4b65c59dcac6591d46261909ee92ebcf41c19ee7442b12842302b2d8aeb36f \ No newline at end of file +8a4b65c59dcac6591d46261909ee92ebcf41c19ee7442b12842302b2d8aeb36f diff --git a/tests/testdata/wmt20-fr-de-v0-res.json b/tests/testdata/wmt20-fr-de-v0-res.json index d5d06a02a3..3b3c168ae2 100644 --- a/tests/testdata/wmt20-fr-de-v0-res.json +++ b/tests/testdata/wmt20-fr-de-v0-res.json @@ -1 +1 @@ -{"results": {"wmt20-fr-de": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.01143193767396364, "chrf_stderr": 0.00012555271954563658, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-fr-de": 0}} \ No newline at end of file +{"results": {"wmt20-fr-de": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.01143193767396364, "chrf_stderr": 0.00012555271954563658, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-fr-de": 0}} diff --git a/tests/testdata/wmt20-iu-en-v0-greedy_until b/tests/testdata/wmt20-iu-en-v0-greedy_until index 87a1981e79..35df9319d4 100644 --- a/tests/testdata/wmt20-iu-en-v0-greedy_until +++ b/tests/testdata/wmt20-iu-en-v0-greedy_until @@ -1 +1 @@ -97bf664a8efa54b5366b8341f77b418106dd0cb26169d5b2d0144e4d3d2bc5c9 \ No newline at end of file +97bf664a8efa54b5366b8341f77b418106dd0cb26169d5b2d0144e4d3d2bc5c9 diff --git a/tests/testdata/wmt20-iu-en-v0-res.json b/tests/testdata/wmt20-iu-en-v0-res.json index e94cac8876..ce0cf39dfa 100644 --- a/tests/testdata/wmt20-iu-en-v0-res.json +++ b/tests/testdata/wmt20-iu-en-v0-res.json @@ -1 +1 @@ -{"results": {"wmt20-iu-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.012204628007572778, "chrf_stderr": 8.944407532175802e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-iu-en": 0}} \ No newline at end of file +{"results": {"wmt20-iu-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.012204628007572778, "chrf_stderr": 8.944407532175802e-05, "ter": 1.0, 
"ter_stderr": 0.0}}, "versions": {"wmt20-iu-en": 0}} diff --git a/tests/testdata/wmt20-ja-en-v0-greedy_until b/tests/testdata/wmt20-ja-en-v0-greedy_until index 3a89d7fcdf..7f2363fa46 100644 --- a/tests/testdata/wmt20-ja-en-v0-greedy_until +++ b/tests/testdata/wmt20-ja-en-v0-greedy_until @@ -1 +1 @@ -1fd846f3c0104e794eb380dae7f648592092ab8bf59234c26d0a671bbbc28df1 \ No newline at end of file +1fd846f3c0104e794eb380dae7f648592092ab8bf59234c26d0a671bbbc28df1 diff --git a/tests/testdata/wmt20-ja-en-v0-res.json b/tests/testdata/wmt20-ja-en-v0-res.json index 4344b7cd8a..4e19eda5ef 100644 --- a/tests/testdata/wmt20-ja-en-v0-res.json +++ b/tests/testdata/wmt20-ja-en-v0-res.json @@ -1 +1 @@ -{"results": {"wmt20-ja-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.010703148854351403, "chrf_stderr": 0.00022242113108130186, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-ja-en": 0}} \ No newline at end of file +{"results": {"wmt20-ja-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.010703148854351403, "chrf_stderr": 0.00022242113108130186, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-ja-en": 0}} diff --git a/tests/testdata/wmt20-km-en-v0-greedy_until b/tests/testdata/wmt20-km-en-v0-greedy_until index a6f1486610..867aa63de7 100644 --- a/tests/testdata/wmt20-km-en-v0-greedy_until +++ b/tests/testdata/wmt20-km-en-v0-greedy_until @@ -1 +1 @@ -fb4ec81bb89c70df7e21b43e0e882915b7b71a2a85bb8d4b59e0c7938baaa4c2 \ No newline at end of file +fb4ec81bb89c70df7e21b43e0e882915b7b71a2a85bb8d4b59e0c7938baaa4c2 diff --git a/tests/testdata/wmt20-km-en-v0-res.json b/tests/testdata/wmt20-km-en-v0-res.json index 4f6dc98604..cf4e8dc7a6 100644 --- a/tests/testdata/wmt20-km-en-v0-res.json +++ b/tests/testdata/wmt20-km-en-v0-res.json @@ -1 +1 @@ -{"results": {"wmt20-km-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.015142474534585969, "chrf_stderr": 0.0001518735048829897, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-km-en": 0}} \ No newline at end of file +{"results": {"wmt20-km-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.015142474534585969, "chrf_stderr": 0.0001518735048829897, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-km-en": 0}} diff --git a/tests/testdata/wmt20-pl-en-v0-greedy_until b/tests/testdata/wmt20-pl-en-v0-greedy_until index 899ce01919..5ab5b017b7 100644 --- a/tests/testdata/wmt20-pl-en-v0-greedy_until +++ b/tests/testdata/wmt20-pl-en-v0-greedy_until @@ -1 +1 @@ -89274499d84176b1ffe4eaec06f2c89ca807342384dc946c2e348d00116aaade \ No newline at end of file +89274499d84176b1ffe4eaec06f2c89ca807342384dc946c2e348d00116aaade diff --git a/tests/testdata/wmt20-pl-en-v0-res.json b/tests/testdata/wmt20-pl-en-v0-res.json index a2f5cb31be..b1c165bdcb 100644 --- a/tests/testdata/wmt20-pl-en-v0-res.json +++ b/tests/testdata/wmt20-pl-en-v0-res.json @@ -1 +1 @@ -{"results": {"wmt20-pl-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.01353367757716276, "chrf_stderr": 0.00018386199249976465, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-pl-en": 0}} \ No newline at end of file +{"results": {"wmt20-pl-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.01353367757716276, "chrf_stderr": 0.00018386199249976465, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-pl-en": 0}} diff --git a/tests/testdata/wmt20-ps-en-v0-greedy_until b/tests/testdata/wmt20-ps-en-v0-greedy_until index 7776c59523..b77563a655 100644 --- a/tests/testdata/wmt20-ps-en-v0-greedy_until +++ b/tests/testdata/wmt20-ps-en-v0-greedy_until @@ -1 +1 @@ -c3976465e3709b4bc371175cc1494c69fe096ea4ba7d114da779d2baa0a47466 \ No newline 
at end of file +c3976465e3709b4bc371175cc1494c69fe096ea4ba7d114da779d2baa0a47466 diff --git a/tests/testdata/wmt20-ps-en-v0-res.json b/tests/testdata/wmt20-ps-en-v0-res.json index 00c9c742e4..f01519d2dd 100644 --- a/tests/testdata/wmt20-ps-en-v0-res.json +++ b/tests/testdata/wmt20-ps-en-v0-res.json @@ -1 +1 @@ -{"results": {"wmt20-ps-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.015192865365105723, "chrf_stderr": 0.00011334541381539086, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-ps-en": 0}} \ No newline at end of file +{"results": {"wmt20-ps-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.015192865365105723, "chrf_stderr": 0.00011334541381539086, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-ps-en": 0}} diff --git a/tests/testdata/wmt20-ru-en-v0-greedy_until b/tests/testdata/wmt20-ru-en-v0-greedy_until index 27c60fb721..ca51f07216 100644 --- a/tests/testdata/wmt20-ru-en-v0-greedy_until +++ b/tests/testdata/wmt20-ru-en-v0-greedy_until @@ -1 +1 @@ -1477ab6542c26bd0222cc1aded174f33bf8d04d1cf6a1c0959aeca4ff3779adc \ No newline at end of file +1477ab6542c26bd0222cc1aded174f33bf8d04d1cf6a1c0959aeca4ff3779adc diff --git a/tests/testdata/wmt20-ru-en-v0-res.json b/tests/testdata/wmt20-ru-en-v0-res.json index b6d0c71ad7..1a5633a53d 100644 --- a/tests/testdata/wmt20-ru-en-v0-res.json +++ b/tests/testdata/wmt20-ru-en-v0-res.json @@ -1 +1 @@ -{"results": {"wmt20-ru-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.013344639906399232, "chrf_stderr": 7.583552652374546e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-ru-en": 0}} \ No newline at end of file +{"results": {"wmt20-ru-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.013344639906399232, "chrf_stderr": 7.583552652374546e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-ru-en": 0}} diff --git a/tests/testdata/wmt20-ta-en-v0-greedy_until b/tests/testdata/wmt20-ta-en-v0-greedy_until index f0f6597245..96e62e8bf7 100644 --- a/tests/testdata/wmt20-ta-en-v0-greedy_until +++ b/tests/testdata/wmt20-ta-en-v0-greedy_until @@ -1 +1 @@ -111ea3efdc08f1cf536631b9426c3a20e482c575d009d2a8c71f59c027578eec \ No newline at end of file +111ea3efdc08f1cf536631b9426c3a20e482c575d009d2a8c71f59c027578eec diff --git a/tests/testdata/wmt20-ta-en-v0-res.json b/tests/testdata/wmt20-ta-en-v0-res.json index a2ad506bf9..2e0c1cadd3 100644 --- a/tests/testdata/wmt20-ta-en-v0-res.json +++ b/tests/testdata/wmt20-ta-en-v0-res.json @@ -1 +1 @@ -{"results": {"wmt20-ta-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.013841110664859798, "chrf_stderr": 0.00018476696850880766, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-ta-en": 0}} \ No newline at end of file +{"results": {"wmt20-ta-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.013841110664859798, "chrf_stderr": 0.00018476696850880766, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-ta-en": 0}} diff --git a/tests/testdata/wmt20-zh-en-v0-greedy_until b/tests/testdata/wmt20-zh-en-v0-greedy_until index 41a1e91515..8792a4c1fa 100644 --- a/tests/testdata/wmt20-zh-en-v0-greedy_until +++ b/tests/testdata/wmt20-zh-en-v0-greedy_until @@ -1 +1 @@ -07dbadfd6f2b2b9462ab6187dbfaabae6e5192ab89a8e4ede9237834b9364dd1 \ No newline at end of file +07dbadfd6f2b2b9462ab6187dbfaabae6e5192ab89a8e4ede9237834b9364dd1 diff --git a/tests/testdata/wmt20-zh-en-v0-res.json b/tests/testdata/wmt20-zh-en-v0-res.json index 11b8df7f87..341812febe 100644 --- a/tests/testdata/wmt20-zh-en-v0-res.json +++ b/tests/testdata/wmt20-zh-en-v0-res.json @@ -1 +1 @@ -{"results": {"wmt20-zh-en": {"bleu": 0.0, "bleu_stderr": 0.0, 
"chrf": 0.008438201290981157, "chrf_stderr": 0.0001109053964076822, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-zh-en": 0}} \ No newline at end of file +{"results": {"wmt20-zh-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.008438201290981157, "chrf_stderr": 0.0001109053964076822, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-zh-en": 0}} diff --git a/tests/testdata/wnli-v0-loglikelihood b/tests/testdata/wnli-v0-loglikelihood index 0c5c0b8ceb..512acc556f 100644 --- a/tests/testdata/wnli-v0-loglikelihood +++ b/tests/testdata/wnli-v0-loglikelihood @@ -1 +1 @@ -2ffd304d6096416eb29607e2e7642b1d6043163624967bcf4c4fc00fddc6c721 \ No newline at end of file +2ffd304d6096416eb29607e2e7642b1d6043163624967bcf4c4fc00fddc6c721 diff --git a/tests/testdata/wnli-v0-res.json b/tests/testdata/wnli-v0-res.json index 8841cb74d1..8be722cd63 100644 --- a/tests/testdata/wnli-v0-res.json +++ b/tests/testdata/wnli-v0-res.json @@ -1 +1 @@ -{"results": {"wnli": {"acc": 0.3380281690140845, "acc_stderr": 0.05653887739133514}}, "versions": {"wnli": 0}} \ No newline at end of file +{"results": {"wnli": {"acc": 0.3380281690140845, "acc_stderr": 0.05653887739133514}}, "versions": {"wnli": 0}} diff --git a/tests/testdata/wnli-v1-loglikelihood b/tests/testdata/wnli-v1-loglikelihood index cbf4ad3777..e782f305e0 100644 --- a/tests/testdata/wnli-v1-loglikelihood +++ b/tests/testdata/wnli-v1-loglikelihood @@ -1 +1 @@ -8a0f81661d2ab2334bbc8031fac31c0c8882f1d9271dd51599d21dfdbb726dea \ No newline at end of file +8a0f81661d2ab2334bbc8031fac31c0c8882f1d9271dd51599d21dfdbb726dea diff --git a/tests/testdata/wnli-v1-res.json b/tests/testdata/wnli-v1-res.json index d12348e0ae..c0b77b5469 100644 --- a/tests/testdata/wnli-v1-res.json +++ b/tests/testdata/wnli-v1-res.json @@ -1 +1 @@ -{"results": {"wnli": {"acc": 0.5633802816901409, "acc_stderr": 0.0592793555841297}}, "versions": {"wnli": 1}} \ No newline at end of file +{"results": {"wnli": {"acc": 0.5633802816901409, "acc_stderr": 0.0592793555841297}}, "versions": {"wnli": 1}} diff --git a/tests/testdata/wsc-v0-loglikelihood b/tests/testdata/wsc-v0-loglikelihood index d0d2963fe9..a1fce6b236 100644 --- a/tests/testdata/wsc-v0-loglikelihood +++ b/tests/testdata/wsc-v0-loglikelihood @@ -1 +1 @@ -45865468eff5ca31e6a050947a6b3310d9d5ed19d0f2e578a32ecaf1c768600f \ No newline at end of file +45865468eff5ca31e6a050947a6b3310d9d5ed19d0f2e578a32ecaf1c768600f diff --git a/tests/testdata/wsc-v0-res.json b/tests/testdata/wsc-v0-res.json index 84be596241..fbedf192d6 100644 --- a/tests/testdata/wsc-v0-res.json +++ b/tests/testdata/wsc-v0-res.json @@ -1 +1 @@ -{"results": {"wsc": {"acc": 0.5480769230769231, "acc_stderr": 0.049038186969314335}}, "versions": {"wsc": 0}} \ No newline at end of file +{"results": {"wsc": {"acc": 0.5480769230769231, "acc_stderr": 0.049038186969314335}}, "versions": {"wsc": 0}} diff --git a/tests/testdata/wsc273-v0-loglikelihood b/tests/testdata/wsc273-v0-loglikelihood index 9d592917bd..a78c1ba104 100644 --- a/tests/testdata/wsc273-v0-loglikelihood +++ b/tests/testdata/wsc273-v0-loglikelihood @@ -1 +1 @@ -26450d414c4581feb51a09882080e7a9b95882e7eab47b1751a4a6024b5a60ee \ No newline at end of file +26450d414c4581feb51a09882080e7a9b95882e7eab47b1751a4a6024b5a60ee diff --git a/tests/testdata/wsc273-v0-res.json b/tests/testdata/wsc273-v0-res.json index 8f023b422a..726eb141d8 100644 --- a/tests/testdata/wsc273-v0-res.json +++ b/tests/testdata/wsc273-v0-res.json @@ -1 +1 @@ -{"results": {"wsc273": {"acc": 0.5164835164835165, "acc_stderr": 0.0303004740355766}}, 
"versions": {"wsc273": 0}} \ No newline at end of file +{"results": {"wsc273": {"acc": 0.5164835164835165, "acc_stderr": 0.0303004740355766}}, "versions": {"wsc273": 0}} diff --git a/tests/tests/testdata/blimp_adjunct_island-v0-loglikelihood b/tests/tests/testdata/blimp_adjunct_island-v0-loglikelihood index 85f0e8fb2a..04dcda8865 100644 --- a/tests/tests/testdata/blimp_adjunct_island-v0-loglikelihood +++ b/tests/tests/testdata/blimp_adjunct_island-v0-loglikelihood @@ -1 +1 @@ -976a5cac4bdb724632eebd4cb9e522203ce3da8d5525288a597c86e80469f3f2 \ No newline at end of file +976a5cac4bdb724632eebd4cb9e522203ce3da8d5525288a597c86e80469f3f2 diff --git a/tests/tests/testdata/blimp_adjunct_island-v0-res.json b/tests/tests/testdata/blimp_adjunct_island-v0-res.json index 39e2517bbc..163ce5a628 100644 --- a/tests/tests/testdata/blimp_adjunct_island-v0-res.json +++ b/tests/tests/testdata/blimp_adjunct_island-v0-res.json @@ -1 +1 @@ -{"results": {"blimp_adjunct_island": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_adjunct_island": 0}} \ No newline at end of file +{"results": {"blimp_adjunct_island": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_adjunct_island": 0}} diff --git a/tests/tests/testdata/blimp_anaphor_gender_agreement-v0-loglikelihood b/tests/tests/testdata/blimp_anaphor_gender_agreement-v0-loglikelihood index 32b700ea9e..ea9de13a17 100644 --- a/tests/tests/testdata/blimp_anaphor_gender_agreement-v0-loglikelihood +++ b/tests/tests/testdata/blimp_anaphor_gender_agreement-v0-loglikelihood @@ -1 +1 @@ -2d8964e56a17661502ecf3f09c0befba63915360ddf2145b0bd845816950515d \ No newline at end of file +2d8964e56a17661502ecf3f09c0befba63915360ddf2145b0bd845816950515d diff --git a/tests/tests/testdata/blimp_anaphor_gender_agreement-v0-res.json b/tests/tests/testdata/blimp_anaphor_gender_agreement-v0-res.json index 1c39ab7045..9a748a2b8c 100644 --- a/tests/tests/testdata/blimp_anaphor_gender_agreement-v0-res.json +++ b/tests/tests/testdata/blimp_anaphor_gender_agreement-v0-res.json @@ -1 +1 @@ -{"results": {"blimp_anaphor_gender_agreement": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_anaphor_gender_agreement": 0}} \ No newline at end of file +{"results": {"blimp_anaphor_gender_agreement": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_anaphor_gender_agreement": 0}} diff --git a/tests/tests/testdata/blimp_anaphor_number_agreement-v0-loglikelihood b/tests/tests/testdata/blimp_anaphor_number_agreement-v0-loglikelihood index 347570f3a6..512b36da8e 100644 --- a/tests/tests/testdata/blimp_anaphor_number_agreement-v0-loglikelihood +++ b/tests/tests/testdata/blimp_anaphor_number_agreement-v0-loglikelihood @@ -1 +1 @@ -0bdad31c974ba064e1f1ba931841ec2ba7461e8b0ca54ea5f79f08b6bae0bab5 \ No newline at end of file +0bdad31c974ba064e1f1ba931841ec2ba7461e8b0ca54ea5f79f08b6bae0bab5 diff --git a/tests/tests/testdata/blimp_anaphor_number_agreement-v0-res.json b/tests/tests/testdata/blimp_anaphor_number_agreement-v0-res.json index 68bbe21379..5391c08491 100644 --- a/tests/tests/testdata/blimp_anaphor_number_agreement-v0-res.json +++ b/tests/tests/testdata/blimp_anaphor_number_agreement-v0-res.json @@ -1 +1 @@ -{"results": {"blimp_anaphor_number_agreement": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_anaphor_number_agreement": 0}} \ No newline at end of file +{"results": {"blimp_anaphor_number_agreement": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": 
{"blimp_anaphor_number_agreement": 0}} diff --git a/tests/tests/testdata/blimp_animate_subject_passive-v0-loglikelihood b/tests/tests/testdata/blimp_animate_subject_passive-v0-loglikelihood index 47cd3d3be1..719a6eb0f8 100644 --- a/tests/tests/testdata/blimp_animate_subject_passive-v0-loglikelihood +++ b/tests/tests/testdata/blimp_animate_subject_passive-v0-loglikelihood @@ -1 +1 @@ -064c38fcd072b8bd12f54ea4f8e41599ed4e11dc386e93b77e1fc07967d1f960 \ No newline at end of file +064c38fcd072b8bd12f54ea4f8e41599ed4e11dc386e93b77e1fc07967d1f960 diff --git a/tests/tests/testdata/blimp_animate_subject_passive-v0-res.json b/tests/tests/testdata/blimp_animate_subject_passive-v0-res.json index 96a7ed5e2a..6c0d6d45ea 100644 --- a/tests/tests/testdata/blimp_animate_subject_passive-v0-res.json +++ b/tests/tests/testdata/blimp_animate_subject_passive-v0-res.json @@ -1 +1 @@ -{"results": {"blimp_animate_subject_passive": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_animate_subject_passive": 0}} \ No newline at end of file +{"results": {"blimp_animate_subject_passive": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_animate_subject_passive": 0}} diff --git a/tests/tests/testdata/blimp_animate_subject_trans-v0-loglikelihood b/tests/tests/testdata/blimp_animate_subject_trans-v0-loglikelihood index 07106a9058..70cff68a1a 100644 --- a/tests/tests/testdata/blimp_animate_subject_trans-v0-loglikelihood +++ b/tests/tests/testdata/blimp_animate_subject_trans-v0-loglikelihood @@ -1 +1 @@ -2a84231e7b79f517427e57e2099c88fed3d60a7efab4ef9506e263b4091d5cfa \ No newline at end of file +2a84231e7b79f517427e57e2099c88fed3d60a7efab4ef9506e263b4091d5cfa diff --git a/tests/tests/testdata/blimp_animate_subject_trans-v0-res.json b/tests/tests/testdata/blimp_animate_subject_trans-v0-res.json index 480cf29a4d..de987875a5 100644 --- a/tests/tests/testdata/blimp_animate_subject_trans-v0-res.json +++ b/tests/tests/testdata/blimp_animate_subject_trans-v0-res.json @@ -1 +1 @@ -{"results": {"blimp_animate_subject_trans": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_animate_subject_trans": 0}} \ No newline at end of file +{"results": {"blimp_animate_subject_trans": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_animate_subject_trans": 0}} diff --git a/tests/tests/testdata/blimp_causative-v0-loglikelihood b/tests/tests/testdata/blimp_causative-v0-loglikelihood index 5a0f6a3559..475d483914 100644 --- a/tests/tests/testdata/blimp_causative-v0-loglikelihood +++ b/tests/tests/testdata/blimp_causative-v0-loglikelihood @@ -1 +1 @@ -3d67ad025185dbb0808ebd7f508edcb5750c18fc3c01ad91f20fda80780c916c \ No newline at end of file +3d67ad025185dbb0808ebd7f508edcb5750c18fc3c01ad91f20fda80780c916c diff --git a/tests/tests/testdata/blimp_causative-v0-res.json b/tests/tests/testdata/blimp_causative-v0-res.json index 90dc95da81..d84658aed6 100644 --- a/tests/tests/testdata/blimp_causative-v0-res.json +++ b/tests/tests/testdata/blimp_causative-v0-res.json @@ -1 +1 @@ -{"results": {"blimp_causative": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_causative": 0}} \ No newline at end of file +{"results": {"blimp_causative": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_causative": 0}} diff --git a/tests/tests/testdata/blimp_complex_NP_island-v0-loglikelihood b/tests/tests/testdata/blimp_complex_NP_island-v0-loglikelihood index 3a6d0875c6..cbc8ba8117 100644 --- 
a/tests/tests/testdata/blimp_complex_NP_island-v0-loglikelihood +++ b/tests/tests/testdata/blimp_complex_NP_island-v0-loglikelihood @@ -1 +1 @@ -f46cfcc7e43050a235fd2a6b989cabbfbcce76786df74db9f0d4a9cd1caa1628 \ No newline at end of file +f46cfcc7e43050a235fd2a6b989cabbfbcce76786df74db9f0d4a9cd1caa1628 diff --git a/tests/tests/testdata/blimp_complex_NP_island-v0-res.json b/tests/tests/testdata/blimp_complex_NP_island-v0-res.json index 5bfbffb6e4..86754d2f25 100644 --- a/tests/tests/testdata/blimp_complex_NP_island-v0-res.json +++ b/tests/tests/testdata/blimp_complex_NP_island-v0-res.json @@ -1 +1 @@ -{"results": {"blimp_complex_NP_island": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_complex_NP_island": 0}} \ No newline at end of file +{"results": {"blimp_complex_NP_island": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_complex_NP_island": 0}} diff --git a/tests/tests/testdata/blimp_coordinate_structure_constraint_complex_left_branch-v0-loglikelihood b/tests/tests/testdata/blimp_coordinate_structure_constraint_complex_left_branch-v0-loglikelihood index 8970b32aff..742e7e16cd 100644 --- a/tests/tests/testdata/blimp_coordinate_structure_constraint_complex_left_branch-v0-loglikelihood +++ b/tests/tests/testdata/blimp_coordinate_structure_constraint_complex_left_branch-v0-loglikelihood @@ -1 +1 @@ -7e1cc5b9f71abfbe56c4bdf343a1e5632785b66a986b8e904a41ed8f45a2c33e \ No newline at end of file +7e1cc5b9f71abfbe56c4bdf343a1e5632785b66a986b8e904a41ed8f45a2c33e diff --git a/tests/tests/testdata/blimp_coordinate_structure_constraint_complex_left_branch-v0-res.json b/tests/tests/testdata/blimp_coordinate_structure_constraint_complex_left_branch-v0-res.json index 2750fcda2a..b7807d77fa 100644 --- a/tests/tests/testdata/blimp_coordinate_structure_constraint_complex_left_branch-v0-res.json +++ b/tests/tests/testdata/blimp_coordinate_structure_constraint_complex_left_branch-v0-res.json @@ -1 +1 @@ -{"results": {"blimp_coordinate_structure_constraint_complex_left_branch": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_coordinate_structure_constraint_complex_left_branch": 0}} \ No newline at end of file +{"results": {"blimp_coordinate_structure_constraint_complex_left_branch": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_coordinate_structure_constraint_complex_left_branch": 0}} diff --git a/tests/tests/testdata/blimp_coordinate_structure_constraint_object_extraction-v0-loglikelihood b/tests/tests/testdata/blimp_coordinate_structure_constraint_object_extraction-v0-loglikelihood index f1edb69cb1..5f64d037ca 100644 --- a/tests/tests/testdata/blimp_coordinate_structure_constraint_object_extraction-v0-loglikelihood +++ b/tests/tests/testdata/blimp_coordinate_structure_constraint_object_extraction-v0-loglikelihood @@ -1 +1 @@ -23ddafdff7b1ebe331b146e23b2c21aa109fe57aa1ce8ca201a0d239fcbdd166 \ No newline at end of file +23ddafdff7b1ebe331b146e23b2c21aa109fe57aa1ce8ca201a0d239fcbdd166 diff --git a/tests/tests/testdata/blimp_coordinate_structure_constraint_object_extraction-v0-res.json b/tests/tests/testdata/blimp_coordinate_structure_constraint_object_extraction-v0-res.json index 80f2c6a7a0..271d758092 100644 --- a/tests/tests/testdata/blimp_coordinate_structure_constraint_object_extraction-v0-res.json +++ b/tests/tests/testdata/blimp_coordinate_structure_constraint_object_extraction-v0-res.json @@ -1 +1 @@ -{"results": {"blimp_coordinate_structure_constraint_object_extraction": {"acc": 0.485, "acc_stderr": 
0.0158121796418149}}, "versions": {"blimp_coordinate_structure_constraint_object_extraction": 0}} \ No newline at end of file +{"results": {"blimp_coordinate_structure_constraint_object_extraction": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_coordinate_structure_constraint_object_extraction": 0}} diff --git a/tests/tests/testdata/blimp_determiner_noun_agreement_1-v0-loglikelihood b/tests/tests/testdata/blimp_determiner_noun_agreement_1-v0-loglikelihood index 5fe9e64bc6..6f7397c31c 100644 --- a/tests/tests/testdata/blimp_determiner_noun_agreement_1-v0-loglikelihood +++ b/tests/tests/testdata/blimp_determiner_noun_agreement_1-v0-loglikelihood @@ -1 +1 @@ -2df8cc7f17089f7e8c7d974dcb324c809d30ef059a5be22aed6b69f44230809f \ No newline at end of file +2df8cc7f17089f7e8c7d974dcb324c809d30ef059a5be22aed6b69f44230809f diff --git a/tests/tests/testdata/blimp_determiner_noun_agreement_1-v0-res.json b/tests/tests/testdata/blimp_determiner_noun_agreement_1-v0-res.json index a245755067..c1d74c8373 100644 --- a/tests/tests/testdata/blimp_determiner_noun_agreement_1-v0-res.json +++ b/tests/tests/testdata/blimp_determiner_noun_agreement_1-v0-res.json @@ -1 +1 @@ -{"results": {"blimp_determiner_noun_agreement_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_1": 0}} \ No newline at end of file +{"results": {"blimp_determiner_noun_agreement_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_1": 0}} diff --git a/tests/tests/testdata/blimp_determiner_noun_agreement_2-v0-loglikelihood b/tests/tests/testdata/blimp_determiner_noun_agreement_2-v0-loglikelihood index 72ab237e58..4fa30bc200 100644 --- a/tests/tests/testdata/blimp_determiner_noun_agreement_2-v0-loglikelihood +++ b/tests/tests/testdata/blimp_determiner_noun_agreement_2-v0-loglikelihood @@ -1 +1 @@ -123e2acd00fbba60aba1fbae607c79a062e512c9e79c7d8dfafff63e30111d76 \ No newline at end of file +123e2acd00fbba60aba1fbae607c79a062e512c9e79c7d8dfafff63e30111d76 diff --git a/tests/tests/testdata/blimp_determiner_noun_agreement_2-v0-res.json b/tests/tests/testdata/blimp_determiner_noun_agreement_2-v0-res.json index bc2dc6e1ed..53b0dceb6c 100644 --- a/tests/tests/testdata/blimp_determiner_noun_agreement_2-v0-res.json +++ b/tests/tests/testdata/blimp_determiner_noun_agreement_2-v0-res.json @@ -1 +1 @@ -{"results": {"blimp_determiner_noun_agreement_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_2": 0}} \ No newline at end of file +{"results": {"blimp_determiner_noun_agreement_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_2": 0}} diff --git a/tests/tests/testdata/blimp_determiner_noun_agreement_irregular_1-v0-loglikelihood b/tests/tests/testdata/blimp_determiner_noun_agreement_irregular_1-v0-loglikelihood index f808af4605..650d52cc4e 100644 --- a/tests/tests/testdata/blimp_determiner_noun_agreement_irregular_1-v0-loglikelihood +++ b/tests/tests/testdata/blimp_determiner_noun_agreement_irregular_1-v0-loglikelihood @@ -1 +1 @@ -7fab9f02e71a224ae7931aa77f8a9a61d887a7480756adc965d4746e97fb04a5 \ No newline at end of file +7fab9f02e71a224ae7931aa77f8a9a61d887a7480756adc965d4746e97fb04a5 diff --git a/tests/tests/testdata/blimp_determiner_noun_agreement_irregular_1-v0-res.json b/tests/tests/testdata/blimp_determiner_noun_agreement_irregular_1-v0-res.json index 8caeecf43d..3921ff3b88 100644 --- 
a/tests/tests/testdata/blimp_determiner_noun_agreement_irregular_1-v0-res.json +++ b/tests/tests/testdata/blimp_determiner_noun_agreement_irregular_1-v0-res.json @@ -1 +1 @@ -{"results": {"blimp_determiner_noun_agreement_irregular_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_irregular_1": 0}} \ No newline at end of file +{"results": {"blimp_determiner_noun_agreement_irregular_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_irregular_1": 0}} diff --git a/tests/tests/testdata/blimp_determiner_noun_agreement_irregular_2-v0-loglikelihood b/tests/tests/testdata/blimp_determiner_noun_agreement_irregular_2-v0-loglikelihood index 12a4ebe1d2..a89c854b6e 100644 --- a/tests/tests/testdata/blimp_determiner_noun_agreement_irregular_2-v0-loglikelihood +++ b/tests/tests/testdata/blimp_determiner_noun_agreement_irregular_2-v0-loglikelihood @@ -1 +1 @@ -ddb24ddfaebe076b3aa7107937d71bf5f4503a78283bc889e39200368603681e \ No newline at end of file +ddb24ddfaebe076b3aa7107937d71bf5f4503a78283bc889e39200368603681e diff --git a/tests/tests/testdata/blimp_determiner_noun_agreement_irregular_2-v0-res.json b/tests/tests/testdata/blimp_determiner_noun_agreement_irregular_2-v0-res.json index c04ead4577..ab47b016e6 100644 --- a/tests/tests/testdata/blimp_determiner_noun_agreement_irregular_2-v0-res.json +++ b/tests/tests/testdata/blimp_determiner_noun_agreement_irregular_2-v0-res.json @@ -1 +1 @@ -{"results": {"blimp_determiner_noun_agreement_irregular_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_irregular_2": 0}} \ No newline at end of file +{"results": {"blimp_determiner_noun_agreement_irregular_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_irregular_2": 0}} diff --git a/tests/tests/testdata/blimp_determiner_noun_agreement_with_adj_2-v0-loglikelihood b/tests/tests/testdata/blimp_determiner_noun_agreement_with_adj_2-v0-loglikelihood index a260838746..6cf9e371c6 100644 --- a/tests/tests/testdata/blimp_determiner_noun_agreement_with_adj_2-v0-loglikelihood +++ b/tests/tests/testdata/blimp_determiner_noun_agreement_with_adj_2-v0-loglikelihood @@ -1 +1 @@ -95acb74fac7d57ae2c9d208361a5f8ad36b0b19a055f02e648ed8e99505f4b43 \ No newline at end of file +95acb74fac7d57ae2c9d208361a5f8ad36b0b19a055f02e648ed8e99505f4b43 diff --git a/tests/tests/testdata/blimp_determiner_noun_agreement_with_adj_2-v0-res.json b/tests/tests/testdata/blimp_determiner_noun_agreement_with_adj_2-v0-res.json index 67ea47559d..831302ad4a 100644 --- a/tests/tests/testdata/blimp_determiner_noun_agreement_with_adj_2-v0-res.json +++ b/tests/tests/testdata/blimp_determiner_noun_agreement_with_adj_2-v0-res.json @@ -1 +1 @@ -{"results": {"blimp_determiner_noun_agreement_with_adj_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_with_adj_2": 0}} \ No newline at end of file +{"results": {"blimp_determiner_noun_agreement_with_adj_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_with_adj_2": 0}} diff --git a/tests/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_1-v0-loglikelihood b/tests/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_1-v0-loglikelihood index 6756cc4020..9d47dd4928 100644 --- a/tests/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_1-v0-loglikelihood +++ 
b/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_1-v0-loglikelihood
@@ -1 +1 @@
-ad61c619aa79433d02f1aeacde2ab87291fd5d5c370032c24d41c4f0065ed1f9
\ No newline at end of file
+ad61c619aa79433d02f1aeacde2ab87291fd5d5c370032c24d41c4f0065ed1f9
diff --git a/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_1-v0-res.json b/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_1-v0-res.json
index defc3560d9..765d2cf8ea 100644
--- a/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_1-v0-res.json
+++ b/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_1-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_determiner_noun_agreement_with_adj_irregular_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_with_adj_irregular_1": 0}}
\ No newline at end of file
+{"results": {"blimp_determiner_noun_agreement_with_adj_irregular_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_with_adj_irregular_1": 0}}
diff --git a/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_2-v0-loglikelihood b/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_2-v0-loglikelihood
index 13176ac613..e6e2a4497d 100644
--- a/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_2-v0-loglikelihood
+++ b/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_2-v0-loglikelihood
@@ -1 +1 @@
-ccc64b4d5e80c081d5161aae5828212ba49d277ca8c5a4281f181744727a6a99
\ No newline at end of file
+ccc64b4d5e80c081d5161aae5828212ba49d277ca8c5a4281f181744727a6a99
diff --git a/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_2-v0-res.json b/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_2-v0-res.json
index 276f03f76d..7b57aaaed2 100644
--- a/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_2-v0-res.json
+++ b/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_2-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_determiner_noun_agreement_with_adj_irregular_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_with_adj_irregular_2": 0}}
\ No newline at end of file
+{"results": {"blimp_determiner_noun_agreement_with_adj_irregular_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_with_adj_irregular_2": 0}}
diff --git a/tests/testdata/blimp_determiner_noun_agreement_with_adjective_1-v0-loglikelihood b/tests/testdata/blimp_determiner_noun_agreement_with_adjective_1-v0-loglikelihood
index d765bb5906..9f139b4b16 100644
--- a/tests/testdata/blimp_determiner_noun_agreement_with_adjective_1-v0-loglikelihood
+++ b/tests/testdata/blimp_determiner_noun_agreement_with_adjective_1-v0-loglikelihood
@@ -1 +1 @@
-007c47e5fbf88119c5180feef75e1345d448e56adcd4c7ab2d52fb8d67350d34
\ No newline at end of file
+007c47e5fbf88119c5180feef75e1345d448e56adcd4c7ab2d52fb8d67350d34
diff --git a/tests/testdata/blimp_determiner_noun_agreement_with_adjective_1-v0-res.json b/tests/testdata/blimp_determiner_noun_agreement_with_adjective_1-v0-res.json
index 66b30be1b8..3f8cb59ad3 100644
--- a/tests/testdata/blimp_determiner_noun_agreement_with_adjective_1-v0-res.json
+++ b/tests/testdata/blimp_determiner_noun_agreement_with_adjective_1-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_determiner_noun_agreement_with_adjective_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_with_adjective_1": 0}}
\ No newline at end of file
+{"results": {"blimp_determiner_noun_agreement_with_adjective_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_with_adjective_1": 0}}
diff --git a/tests/testdata/blimp_distractor_agreement_relational_noun-v0-loglikelihood b/tests/testdata/blimp_distractor_agreement_relational_noun-v0-loglikelihood
index f926cf3d4b..cdf9c7b657 100644
--- a/tests/testdata/blimp_distractor_agreement_relational_noun-v0-loglikelihood
+++ b/tests/testdata/blimp_distractor_agreement_relational_noun-v0-loglikelihood
@@ -1 +1 @@
-8aab641bd5933f84f46a14f5c1208a3c855cace7e67b44abcd5aff8fec96717d
\ No newline at end of file
+8aab641bd5933f84f46a14f5c1208a3c855cace7e67b44abcd5aff8fec96717d
diff --git a/tests/testdata/blimp_distractor_agreement_relational_noun-v0-res.json b/tests/testdata/blimp_distractor_agreement_relational_noun-v0-res.json
index d8ce0672c2..cb2599b9e0 100644
--- a/tests/testdata/blimp_distractor_agreement_relational_noun-v0-res.json
+++ b/tests/testdata/blimp_distractor_agreement_relational_noun-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_distractor_agreement_relational_noun": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_distractor_agreement_relational_noun": 0}}
\ No newline at end of file
+{"results": {"blimp_distractor_agreement_relational_noun": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_distractor_agreement_relational_noun": 0}}
diff --git a/tests/testdata/blimp_distractor_agreement_relative_clause-v0-loglikelihood b/tests/testdata/blimp_distractor_agreement_relative_clause-v0-loglikelihood
index 1fddc2190c..d1057c6947 100644
--- a/tests/testdata/blimp_distractor_agreement_relative_clause-v0-loglikelihood
+++ b/tests/testdata/blimp_distractor_agreement_relative_clause-v0-loglikelihood
@@ -1 +1 @@
-bf78e2b53c0f3531303c668c96bd3897a0a35e960da37439e63724ecba4e371a
\ No newline at end of file
+bf78e2b53c0f3531303c668c96bd3897a0a35e960da37439e63724ecba4e371a
diff --git a/tests/testdata/blimp_distractor_agreement_relative_clause-v0-res.json b/tests/testdata/blimp_distractor_agreement_relative_clause-v0-res.json
index cf08b036b9..f892031db4 100644
--- a/tests/testdata/blimp_distractor_agreement_relative_clause-v0-res.json
+++ b/tests/testdata/blimp_distractor_agreement_relative_clause-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_distractor_agreement_relative_clause": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_distractor_agreement_relative_clause": 0}}
\ No newline at end of file
+{"results": {"blimp_distractor_agreement_relative_clause": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_distractor_agreement_relative_clause": 0}}
diff --git a/tests/testdata/blimp_drop_argument-v0-loglikelihood b/tests/testdata/blimp_drop_argument-v0-loglikelihood
index 1d6bea95e1..616b099d98 100644
--- a/tests/testdata/blimp_drop_argument-v0-loglikelihood
+++ b/tests/testdata/blimp_drop_argument-v0-loglikelihood
@@ -1 +1 @@
-616109e63f162dcd31a632943e7ef0c9e0431afeb179e83e9b04b39007b16f5b
\ No newline at end of file
+616109e63f162dcd31a632943e7ef0c9e0431afeb179e83e9b04b39007b16f5b
diff --git a/tests/testdata/blimp_drop_argument-v0-res.json b/tests/testdata/blimp_drop_argument-v0-res.json
index 853a4d2f92..ed70ea9350 100644
--- a/tests/testdata/blimp_drop_argument-v0-res.json
+++ b/tests/testdata/blimp_drop_argument-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_drop_argument": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_drop_argument": 0}}
\ No newline at end of file
+{"results": {"blimp_drop_argument": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_drop_argument": 0}}
diff --git a/tests/testdata/blimp_ellipsis_n_bar_1-v0-loglikelihood b/tests/testdata/blimp_ellipsis_n_bar_1-v0-loglikelihood
index 611211bec0..4fc62005c5 100644
--- a/tests/testdata/blimp_ellipsis_n_bar_1-v0-loglikelihood
+++ b/tests/testdata/blimp_ellipsis_n_bar_1-v0-loglikelihood
@@ -1 +1 @@
-d14e4b7fcdd68991eb39b9cf3ade4b37dee9ddd39b688f861d81a327e47a969f
\ No newline at end of file
+d14e4b7fcdd68991eb39b9cf3ade4b37dee9ddd39b688f861d81a327e47a969f
diff --git a/tests/testdata/blimp_ellipsis_n_bar_1-v0-res.json b/tests/testdata/blimp_ellipsis_n_bar_1-v0-res.json
index 82f320ce8f..6169eef26c 100644
--- a/tests/testdata/blimp_ellipsis_n_bar_1-v0-res.json
+++ b/tests/testdata/blimp_ellipsis_n_bar_1-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_ellipsis_n_bar_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_ellipsis_n_bar_1": 0}}
\ No newline at end of file
+{"results": {"blimp_ellipsis_n_bar_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_ellipsis_n_bar_1": 0}}
diff --git a/tests/testdata/blimp_ellipsis_n_bar_2-v0-loglikelihood b/tests/testdata/blimp_ellipsis_n_bar_2-v0-loglikelihood
index 1005f68060..3675cffd8b 100644
--- a/tests/testdata/blimp_ellipsis_n_bar_2-v0-loglikelihood
+++ b/tests/testdata/blimp_ellipsis_n_bar_2-v0-loglikelihood
@@ -1 +1 @@
-0523771a217759f0b22b89807694ee7f6381ce98a584b1fd070ba96194a3273b
\ No newline at end of file
+0523771a217759f0b22b89807694ee7f6381ce98a584b1fd070ba96194a3273b
diff --git a/tests/testdata/blimp_ellipsis_n_bar_2-v0-res.json b/tests/testdata/blimp_ellipsis_n_bar_2-v0-res.json
index 5b721ca152..f30b7b4704 100644
--- a/tests/testdata/blimp_ellipsis_n_bar_2-v0-res.json
+++ b/tests/testdata/blimp_ellipsis_n_bar_2-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_ellipsis_n_bar_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_ellipsis_n_bar_2": 0}}
\ No newline at end of file
+{"results": {"blimp_ellipsis_n_bar_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_ellipsis_n_bar_2": 0}}
diff --git a/tests/testdata/blimp_existential_there_object_raising-v0-loglikelihood b/tests/testdata/blimp_existential_there_object_raising-v0-loglikelihood
index d23fba902a..e1599102ba 100644
--- a/tests/testdata/blimp_existential_there_object_raising-v0-loglikelihood
+++ b/tests/testdata/blimp_existential_there_object_raising-v0-loglikelihood
@@ -1 +1 @@
-63567712076256f373131971676c1c6d711efef73cd0e4de3cc639bc631a2413
\ No newline at end of file
+63567712076256f373131971676c1c6d711efef73cd0e4de3cc639bc631a2413
diff --git a/tests/testdata/blimp_existential_there_object_raising-v0-res.json b/tests/testdata/blimp_existential_there_object_raising-v0-res.json
index da3deb1aaf..4556caf48b 100644
--- a/tests/testdata/blimp_existential_there_object_raising-v0-res.json
+++ b/tests/testdata/blimp_existential_there_object_raising-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_existential_there_object_raising": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_existential_there_object_raising": 0}}
\ No newline at end of file
+{"results": {"blimp_existential_there_object_raising": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_existential_there_object_raising": 0}}
diff --git a/tests/testdata/blimp_existential_there_quantifiers_1-v0-loglikelihood b/tests/testdata/blimp_existential_there_quantifiers_1-v0-loglikelihood
index 7697713f85..c288c7bb55 100644
--- a/tests/testdata/blimp_existential_there_quantifiers_1-v0-loglikelihood
+++ b/tests/testdata/blimp_existential_there_quantifiers_1-v0-loglikelihood
@@ -1 +1 @@
-d77594382e6d9af31a8b8ef00ba1ef6c29d6be6d0ddb7a9c27ef25ace654e05a
\ No newline at end of file
+d77594382e6d9af31a8b8ef00ba1ef6c29d6be6d0ddb7a9c27ef25ace654e05a
diff --git a/tests/testdata/blimp_existential_there_quantifiers_1-v0-res.json b/tests/testdata/blimp_existential_there_quantifiers_1-v0-res.json
index 076319f01e..99c821bab2 100644
--- a/tests/testdata/blimp_existential_there_quantifiers_1-v0-res.json
+++ b/tests/testdata/blimp_existential_there_quantifiers_1-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_existential_there_quantifiers_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_existential_there_quantifiers_1": 0}}
\ No newline at end of file
+{"results": {"blimp_existential_there_quantifiers_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_existential_there_quantifiers_1": 0}}
diff --git a/tests/testdata/blimp_existential_there_quantifiers_2-v0-loglikelihood b/tests/testdata/blimp_existential_there_quantifiers_2-v0-loglikelihood
index 4b1a428c4d..0b8d9be879 100644
--- a/tests/testdata/blimp_existential_there_quantifiers_2-v0-loglikelihood
+++ b/tests/testdata/blimp_existential_there_quantifiers_2-v0-loglikelihood
@@ -1 +1 @@
-6e6add7baff4217f383425bef58288202018e041b24084edcaa5df8af08f820c
\ No newline at end of file
+6e6add7baff4217f383425bef58288202018e041b24084edcaa5df8af08f820c
diff --git a/tests/testdata/blimp_existential_there_quantifiers_2-v0-res.json b/tests/testdata/blimp_existential_there_quantifiers_2-v0-res.json
index b8500d68b5..80c6fbb97f 100644
--- a/tests/testdata/blimp_existential_there_quantifiers_2-v0-res.json
+++ b/tests/testdata/blimp_existential_there_quantifiers_2-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_existential_there_quantifiers_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_existential_there_quantifiers_2": 0}}
\ No newline at end of file
+{"results": {"blimp_existential_there_quantifiers_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_existential_there_quantifiers_2": 0}}
diff --git a/tests/testdata/blimp_existential_there_subject_raising-v0-loglikelihood b/tests/testdata/blimp_existential_there_subject_raising-v0-loglikelihood
index 925e5b4680..54aa63c903 100644
--- a/tests/testdata/blimp_existential_there_subject_raising-v0-loglikelihood
+++ b/tests/testdata/blimp_existential_there_subject_raising-v0-loglikelihood
@@ -1 +1 @@
-9b324b28ae3e1b5d49ecf4b7b2a16c7bbc8ff38d000cf216fab75df633da2084
\ No newline at end of file
+9b324b28ae3e1b5d49ecf4b7b2a16c7bbc8ff38d000cf216fab75df633da2084
diff --git a/tests/testdata/blimp_existential_there_subject_raising-v0-res.json b/tests/testdata/blimp_existential_there_subject_raising-v0-res.json
index 00c913dcd3..e7483274c8 100644
--- a/tests/testdata/blimp_existential_there_subject_raising-v0-res.json
+++ b/tests/testdata/blimp_existential_there_subject_raising-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_existential_there_subject_raising": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_existential_there_subject_raising": 0}}
\ No newline at end of file
+{"results": {"blimp_existential_there_subject_raising": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_existential_there_subject_raising": 0}}
diff --git a/tests/testdata/blimp_expletive_it_object_raising-v0-loglikelihood b/tests/testdata/blimp_expletive_it_object_raising-v0-loglikelihood
index 31772c9a1c..236950e3aa 100644
--- a/tests/testdata/blimp_expletive_it_object_raising-v0-loglikelihood
+++ b/tests/testdata/blimp_expletive_it_object_raising-v0-loglikelihood
@@ -1 +1 @@
-ceede5b38248a62125a74a8332602b8eac5ef40864f071ad8d86e7971e07219d
\ No newline at end of file
+ceede5b38248a62125a74a8332602b8eac5ef40864f071ad8d86e7971e07219d
diff --git a/tests/testdata/blimp_expletive_it_object_raising-v0-res.json b/tests/testdata/blimp_expletive_it_object_raising-v0-res.json
index 735dc09826..808b3f9b8d 100644
--- a/tests/testdata/blimp_expletive_it_object_raising-v0-res.json
+++ b/tests/testdata/blimp_expletive_it_object_raising-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_expletive_it_object_raising": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_expletive_it_object_raising": 0}}
\ No newline at end of file
+{"results": {"blimp_expletive_it_object_raising": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_expletive_it_object_raising": 0}}
diff --git a/tests/testdata/blimp_inchoative-v0-loglikelihood b/tests/testdata/blimp_inchoative-v0-loglikelihood
index b494980087..26f8f1fcb9 100644
--- a/tests/testdata/blimp_inchoative-v0-loglikelihood
+++ b/tests/testdata/blimp_inchoative-v0-loglikelihood
@@ -1 +1 @@
-3ff73629fb4473986a0e8ae2fcb7c40e88292189ab0d8755d20836c5aa5a2f99
\ No newline at end of file
+3ff73629fb4473986a0e8ae2fcb7c40e88292189ab0d8755d20836c5aa5a2f99
diff --git a/tests/testdata/blimp_inchoative-v0-res.json b/tests/testdata/blimp_inchoative-v0-res.json
index 8d1b39c2d4..2f0cc1a7ca 100644
--- a/tests/testdata/blimp_inchoative-v0-res.json
+++ b/tests/testdata/blimp_inchoative-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_inchoative": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_inchoative": 0}}
\ No newline at end of file
+{"results": {"blimp_inchoative": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_inchoative": 0}}
diff --git a/tests/testdata/blimp_intransitive-v0-loglikelihood b/tests/testdata/blimp_intransitive-v0-loglikelihood
index b16238545d..8dd8c4f2c6 100644
--- a/tests/testdata/blimp_intransitive-v0-loglikelihood
+++ b/tests/testdata/blimp_intransitive-v0-loglikelihood
@@ -1 +1 @@
-6469ae3b0d46b008846b5fd132f2d2b26ea2858745d056df1470b89aa97a790f
\ No newline at end of file
+6469ae3b0d46b008846b5fd132f2d2b26ea2858745d056df1470b89aa97a790f
diff --git a/tests/testdata/blimp_intransitive-v0-res.json b/tests/testdata/blimp_intransitive-v0-res.json
index d5b2f91179..d4dc91ae4b 100644
--- a/tests/testdata/blimp_intransitive-v0-res.json
+++ b/tests/testdata/blimp_intransitive-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_intransitive": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_intransitive": 0}}
\ No newline at end of file
+{"results": {"blimp_intransitive": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_intransitive": 0}}
diff --git a/tests/testdata/blimp_irregular_past_participle_adjectives-v0-loglikelihood b/tests/testdata/blimp_irregular_past_participle_adjectives-v0-loglikelihood
index a030be1d72..008745c366 100644
--- a/tests/testdata/blimp_irregular_past_participle_adjectives-v0-loglikelihood
+++ b/tests/testdata/blimp_irregular_past_participle_adjectives-v0-loglikelihood
@@ -1 +1 @@
-47c56f336df11924d8b97feb46339ce55bea4b216b6fd13946cc999ea36a4a95
\ No newline at end of file
+47c56f336df11924d8b97feb46339ce55bea4b216b6fd13946cc999ea36a4a95
diff --git a/tests/testdata/blimp_irregular_past_participle_adjectives-v0-res.json b/tests/testdata/blimp_irregular_past_participle_adjectives-v0-res.json
index e3b8718ff8..5819ba6a6f 100644
--- a/tests/testdata/blimp_irregular_past_participle_adjectives-v0-res.json
+++ b/tests/testdata/blimp_irregular_past_participle_adjectives-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_irregular_past_participle_adjectives": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_irregular_past_participle_adjectives": 0}}
\ No newline at end of file
+{"results": {"blimp_irregular_past_participle_adjectives": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_irregular_past_participle_adjectives": 0}}
diff --git a/tests/testdata/blimp_irregular_past_participle_verbs-v0-loglikelihood b/tests/testdata/blimp_irregular_past_participle_verbs-v0-loglikelihood
index 1ff9f6b991..5c01c94ef7 100644
--- a/tests/testdata/blimp_irregular_past_participle_verbs-v0-loglikelihood
+++ b/tests/testdata/blimp_irregular_past_participle_verbs-v0-loglikelihood
@@ -1 +1 @@
-63ec733873f94ace71cb34112d1c3cd5bb768c26b975fb90acc9b8ba3f4e938e
\ No newline at end of file
+63ec733873f94ace71cb34112d1c3cd5bb768c26b975fb90acc9b8ba3f4e938e
diff --git a/tests/testdata/blimp_irregular_past_participle_verbs-v0-res.json b/tests/testdata/blimp_irregular_past_participle_verbs-v0-res.json
index 94d73d41da..13a6167484 100644
--- a/tests/testdata/blimp_irregular_past_participle_verbs-v0-res.json
+++ b/tests/testdata/blimp_irregular_past_participle_verbs-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_irregular_past_participle_verbs": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_irregular_past_participle_verbs": 0}}
\ No newline at end of file
+{"results": {"blimp_irregular_past_participle_verbs": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_irregular_past_participle_verbs": 0}}
diff --git a/tests/testdata/blimp_irregular_plural_subject_verb_agreement_1-v0-loglikelihood b/tests/testdata/blimp_irregular_plural_subject_verb_agreement_1-v0-loglikelihood
index bd7f4bd9ea..023096be2c 100644
--- a/tests/testdata/blimp_irregular_plural_subject_verb_agreement_1-v0-loglikelihood
+++ b/tests/testdata/blimp_irregular_plural_subject_verb_agreement_1-v0-loglikelihood
@@ -1 +1 @@
-7084358b1b7dd7fb5ead1a58f4b499d6f7610eca897bfac25a986d0f9a91aa5d
\ No newline at end of file
+7084358b1b7dd7fb5ead1a58f4b499d6f7610eca897bfac25a986d0f9a91aa5d
diff --git a/tests/testdata/blimp_irregular_plural_subject_verb_agreement_1-v0-res.json b/tests/testdata/blimp_irregular_plural_subject_verb_agreement_1-v0-res.json
index d70bd8bad3..d5f38df686 100644
--- a/tests/testdata/blimp_irregular_plural_subject_verb_agreement_1-v0-res.json
+++ b/tests/testdata/blimp_irregular_plural_subject_verb_agreement_1-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_irregular_plural_subject_verb_agreement_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_irregular_plural_subject_verb_agreement_1": 0}}
\ No newline at end of file
+{"results": {"blimp_irregular_plural_subject_verb_agreement_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_irregular_plural_subject_verb_agreement_1": 0}}
diff --git a/tests/testdata/blimp_irregular_plural_subject_verb_agreement_2-v0-loglikelihood b/tests/testdata/blimp_irregular_plural_subject_verb_agreement_2-v0-loglikelihood
index 187b79e94c..68ef732797 100644
--- a/tests/testdata/blimp_irregular_plural_subject_verb_agreement_2-v0-loglikelihood
+++ b/tests/testdata/blimp_irregular_plural_subject_verb_agreement_2-v0-loglikelihood
@@ -1 +1 @@
-9534751f83a86b6cbe1fb12fb9feb827b0b7836a663108928b4ecc1d70b08871
\ No newline at end of file
+9534751f83a86b6cbe1fb12fb9feb827b0b7836a663108928b4ecc1d70b08871
diff --git a/tests/testdata/blimp_irregular_plural_subject_verb_agreement_2-v0-res.json b/tests/testdata/blimp_irregular_plural_subject_verb_agreement_2-v0-res.json
index b0289b9dea..02f042e827 100644
--- a/tests/testdata/blimp_irregular_plural_subject_verb_agreement_2-v0-res.json
+++ b/tests/testdata/blimp_irregular_plural_subject_verb_agreement_2-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_irregular_plural_subject_verb_agreement_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_irregular_plural_subject_verb_agreement_2": 0}}
\ No newline at end of file
+{"results": {"blimp_irregular_plural_subject_verb_agreement_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_irregular_plural_subject_verb_agreement_2": 0}}
diff --git a/tests/testdata/blimp_left_branch_island_echo_question-v0-loglikelihood b/tests/testdata/blimp_left_branch_island_echo_question-v0-loglikelihood
index da909529e5..6846155766 100644
--- a/tests/testdata/blimp_left_branch_island_echo_question-v0-loglikelihood
+++ b/tests/testdata/blimp_left_branch_island_echo_question-v0-loglikelihood
@@ -1 +1 @@
-9852b38612db8c6adf938a5d8a7a9e5ce9e655259d6cc806b142506fcaff0ed4
\ No newline at end of file
+9852b38612db8c6adf938a5d8a7a9e5ce9e655259d6cc806b142506fcaff0ed4
diff --git a/tests/testdata/blimp_left_branch_island_echo_question-v0-res.json b/tests/testdata/blimp_left_branch_island_echo_question-v0-res.json
index 198f9a289c..0c1723b0f9 100644
--- a/tests/testdata/blimp_left_branch_island_echo_question-v0-res.json
+++ b/tests/testdata/blimp_left_branch_island_echo_question-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_left_branch_island_echo_question": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_left_branch_island_echo_question": 0}}
\ No newline at end of file
+{"results": {"blimp_left_branch_island_echo_question": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_left_branch_island_echo_question": 0}}
diff --git a/tests/testdata/blimp_left_branch_island_simple_question-v0-loglikelihood b/tests/testdata/blimp_left_branch_island_simple_question-v0-loglikelihood
index 22adb2995e..585a1dab73 100644
--- a/tests/testdata/blimp_left_branch_island_simple_question-v0-loglikelihood
+++ b/tests/testdata/blimp_left_branch_island_simple_question-v0-loglikelihood
@@ -1 +1 @@
-6cb36bbdae7754f8832f50872c3dd511ce12547e00fa0771deb747be3355eb85
\ No newline at end of file
+6cb36bbdae7754f8832f50872c3dd511ce12547e00fa0771deb747be3355eb85
diff --git a/tests/testdata/blimp_left_branch_island_simple_question-v0-res.json b/tests/testdata/blimp_left_branch_island_simple_question-v0-res.json
index 057af2db85..27b71d0de9 100644
--- a/tests/testdata/blimp_left_branch_island_simple_question-v0-res.json
+++ b/tests/testdata/blimp_left_branch_island_simple_question-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_left_branch_island_simple_question": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_left_branch_island_simple_question": 0}}
\ No newline at end of file
+{"results": {"blimp_left_branch_island_simple_question": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_left_branch_island_simple_question": 0}}
diff --git a/tests/testdata/blimp_matrix_question_npi_licensor_present-v0-loglikelihood b/tests/testdata/blimp_matrix_question_npi_licensor_present-v0-loglikelihood
index a5c4bc6ca2..38824111a0 100644
--- a/tests/testdata/blimp_matrix_question_npi_licensor_present-v0-loglikelihood
+++ b/tests/testdata/blimp_matrix_question_npi_licensor_present-v0-loglikelihood
@@ -1 +1 @@
-a3a702a3335c79b02b36caf37c68069050c2a8a3a03c3610c09afc39d2b83fb1
\ No newline at end of file
+a3a702a3335c79b02b36caf37c68069050c2a8a3a03c3610c09afc39d2b83fb1
diff --git a/tests/testdata/blimp_matrix_question_npi_licensor_present-v0-res.json b/tests/testdata/blimp_matrix_question_npi_licensor_present-v0-res.json
index 4fba717b88..77211f8352 100644
--- a/tests/testdata/blimp_matrix_question_npi_licensor_present-v0-res.json
+++ b/tests/testdata/blimp_matrix_question_npi_licensor_present-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_matrix_question_npi_licensor_present": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_matrix_question_npi_licensor_present": 0}}
\ No newline at end of file
+{"results": {"blimp_matrix_question_npi_licensor_present": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_matrix_question_npi_licensor_present": 0}}
diff --git a/tests/testdata/blimp_npi_present_1-v0-loglikelihood b/tests/testdata/blimp_npi_present_1-v0-loglikelihood
index 910e490a98..697a296562 100644
--- a/tests/testdata/blimp_npi_present_1-v0-loglikelihood
+++ b/tests/testdata/blimp_npi_present_1-v0-loglikelihood
@@ -1 +1 @@
-3ef532a85e0ee8f8ff779bc7ddc873d515969a708da84a4eb4a85b7c843cf244
\ No newline at end of file
+3ef532a85e0ee8f8ff779bc7ddc873d515969a708da84a4eb4a85b7c843cf244
diff --git a/tests/testdata/blimp_npi_present_1-v0-res.json b/tests/testdata/blimp_npi_present_1-v0-res.json
index 8e4ae8d6ef..3db6b3cdb4 100644
--- a/tests/testdata/blimp_npi_present_1-v0-res.json
+++ b/tests/testdata/blimp_npi_present_1-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_npi_present_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_npi_present_1": 0}}
\ No newline at end of file
+{"results": {"blimp_npi_present_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_npi_present_1": 0}}
diff --git a/tests/testdata/blimp_npi_present_2-v0-loglikelihood b/tests/testdata/blimp_npi_present_2-v0-loglikelihood
index 543fdc0614..d7f4e66e70 100644
--- a/tests/testdata/blimp_npi_present_2-v0-loglikelihood
+++ b/tests/testdata/blimp_npi_present_2-v0-loglikelihood
@@ -1 +1 @@
-fdb688ac6259bb65d234ef0a36e9a9ee449f9608f633b12e1943b462aead8e17
\ No newline at end of file
+fdb688ac6259bb65d234ef0a36e9a9ee449f9608f633b12e1943b462aead8e17
diff --git a/tests/testdata/blimp_npi_present_2-v0-res.json b/tests/testdata/blimp_npi_present_2-v0-res.json
index efe40ced37..58f75fa16a 100644
--- a/tests/testdata/blimp_npi_present_2-v0-res.json
+++ b/tests/testdata/blimp_npi_present_2-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_npi_present_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_npi_present_2": 0}}
\ No newline at end of file
+{"results": {"blimp_npi_present_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_npi_present_2": 0}}
diff --git a/tests/testdata/blimp_only_npi_licensor_present-v0-loglikelihood b/tests/testdata/blimp_only_npi_licensor_present-v0-loglikelihood
index 03f45fd619..70d74d84ab 100644
--- a/tests/testdata/blimp_only_npi_licensor_present-v0-loglikelihood
+++ b/tests/testdata/blimp_only_npi_licensor_present-v0-loglikelihood
@@ -1 +1 @@
-d2d0711611b5b218c6fa8c7278494749252b7868c396451919b761303556bd66
\ No newline at end of file
+d2d0711611b5b218c6fa8c7278494749252b7868c396451919b761303556bd66
diff --git a/tests/testdata/blimp_only_npi_licensor_present-v0-res.json b/tests/testdata/blimp_only_npi_licensor_present-v0-res.json
index 321702a66e..b934ea7a77 100644
--- a/tests/testdata/blimp_only_npi_licensor_present-v0-res.json
+++ b/tests/testdata/blimp_only_npi_licensor_present-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_only_npi_licensor_present": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_only_npi_licensor_present": 0}}
\ No newline at end of file
+{"results": {"blimp_only_npi_licensor_present": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_only_npi_licensor_present": 0}}
diff --git a/tests/testdata/blimp_only_npi_scope-v0-loglikelihood b/tests/testdata/blimp_only_npi_scope-v0-loglikelihood
index f1846d3e93..7b0445e9db 100644
--- a/tests/testdata/blimp_only_npi_scope-v0-loglikelihood
+++ b/tests/testdata/blimp_only_npi_scope-v0-loglikelihood
@@ -1 +1 @@
-fc0be817478c212327050fa297ef61ad214f4847dbff61d4e0fe7914c06a1691
\ No newline at end of file
+fc0be817478c212327050fa297ef61ad214f4847dbff61d4e0fe7914c06a1691
diff --git a/tests/testdata/blimp_only_npi_scope-v0-res.json b/tests/testdata/blimp_only_npi_scope-v0-res.json
index 82fbbab07d..bec2e7d33b 100644
--- a/tests/testdata/blimp_only_npi_scope-v0-res.json
+++ b/tests/testdata/blimp_only_npi_scope-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_only_npi_scope": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_only_npi_scope": 0}}
\ No newline at end of file
+{"results": {"blimp_only_npi_scope": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_only_npi_scope": 0}}
diff --git a/tests/testdata/blimp_passive_1-v0-loglikelihood b/tests/testdata/blimp_passive_1-v0-loglikelihood
index 183b815d22..52f5b2332f 100644
--- a/tests/testdata/blimp_passive_1-v0-loglikelihood
+++ b/tests/testdata/blimp_passive_1-v0-loglikelihood
@@ -1 +1 @@
-fa4addddd8e380031b8e0871776cabcb707c0f21dcaf5d8b3defec66cce55043
\ No newline at end of file
+fa4addddd8e380031b8e0871776cabcb707c0f21dcaf5d8b3defec66cce55043
diff --git a/tests/testdata/blimp_passive_1-v0-res.json b/tests/testdata/blimp_passive_1-v0-res.json
index 64070cf58d..3dd08a649f 100644
--- a/tests/testdata/blimp_passive_1-v0-res.json
+++ b/tests/testdata/blimp_passive_1-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_passive_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_passive_1": 0}}
\ No newline at end of file
+{"results": {"blimp_passive_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_passive_1": 0}}
diff --git a/tests/testdata/blimp_passive_2-v0-loglikelihood b/tests/testdata/blimp_passive_2-v0-loglikelihood
index d667f46946..d8cab002c4 100644
--- a/tests/testdata/blimp_passive_2-v0-loglikelihood
+++ b/tests/testdata/blimp_passive_2-v0-loglikelihood
@@ -1 +1 @@
-755bdfe2c89737c43001ff1dc83d68ad33e444aaf0669af66aaf82dcd09f2eca
\ No newline at end of file
+755bdfe2c89737c43001ff1dc83d68ad33e444aaf0669af66aaf82dcd09f2eca
diff --git a/tests/testdata/blimp_passive_2-v0-res.json b/tests/testdata/blimp_passive_2-v0-res.json
index 5a4dd092c4..5205b31eb7 100644
--- a/tests/testdata/blimp_passive_2-v0-res.json
+++ b/tests/testdata/blimp_passive_2-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_passive_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_passive_2": 0}}
\ No newline at end of file
+{"results": {"blimp_passive_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_passive_2": 0}}
diff --git a/tests/testdata/blimp_principle_A_c_command-v0-loglikelihood b/tests/testdata/blimp_principle_A_c_command-v0-loglikelihood
index 87b49c5de9..a9c864d0a4 100644
--- a/tests/testdata/blimp_principle_A_c_command-v0-loglikelihood
+++ b/tests/testdata/blimp_principle_A_c_command-v0-loglikelihood
@@ -1 +1 @@
-7c2ed82612af9175052cd44d8e178b6dd084c04eb462a3d88fcacfad2df8be8e
\ No newline at end of file
+7c2ed82612af9175052cd44d8e178b6dd084c04eb462a3d88fcacfad2df8be8e
diff --git a/tests/testdata/blimp_principle_A_c_command-v0-res.json b/tests/testdata/blimp_principle_A_c_command-v0-res.json
index 43fadc2e0b..34113bf8d1 100644
--- a/tests/testdata/blimp_principle_A_c_command-v0-res.json
+++ b/tests/testdata/blimp_principle_A_c_command-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_principle_A_c_command": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_c_command": 0}}
\ No newline at end of file
+{"results": {"blimp_principle_A_c_command": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_c_command": 0}}
diff --git a/tests/testdata/blimp_principle_A_case_1-v0-loglikelihood b/tests/testdata/blimp_principle_A_case_1-v0-loglikelihood
index ce8166c460..5812c4732c 100644
--- a/tests/testdata/blimp_principle_A_case_1-v0-loglikelihood
+++ b/tests/testdata/blimp_principle_A_case_1-v0-loglikelihood
@@ -1 +1 @@
-49d2b8ce6667a6166fdc2a2e5dbe7ff07d9b8415e9f33482aef15956b3ebc24a
\ No newline at end of file
+49d2b8ce6667a6166fdc2a2e5dbe7ff07d9b8415e9f33482aef15956b3ebc24a
diff --git a/tests/testdata/blimp_principle_A_case_1-v0-res.json b/tests/testdata/blimp_principle_A_case_1-v0-res.json
index f325c2e3e3..952f4fb4ee 100644
--- a/tests/testdata/blimp_principle_A_case_1-v0-res.json
+++ b/tests/testdata/blimp_principle_A_case_1-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_principle_A_case_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_case_1": 0}}
\ No newline at end of file
+{"results": {"blimp_principle_A_case_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_case_1": 0}}
diff --git a/tests/testdata/blimp_principle_A_case_2-v0-loglikelihood b/tests/testdata/blimp_principle_A_case_2-v0-loglikelihood
index 8c043857d4..ad1aac68e3 100644
--- a/tests/testdata/blimp_principle_A_case_2-v0-loglikelihood
+++ b/tests/testdata/blimp_principle_A_case_2-v0-loglikelihood
@@ -1 +1 @@
-cd68adb65c891d672e22bf53c054b2083ab08bc1da43951732b409c942d14bc7
\ No newline at end of file
+cd68adb65c891d672e22bf53c054b2083ab08bc1da43951732b409c942d14bc7
diff --git a/tests/testdata/blimp_principle_A_case_2-v0-res.json b/tests/testdata/blimp_principle_A_case_2-v0-res.json
index ec8108c88d..e8dd8fad63 100644
--- a/tests/testdata/blimp_principle_A_case_2-v0-res.json
+++ b/tests/testdata/blimp_principle_A_case_2-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_principle_A_case_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_case_2": 0}}
\ No newline at end of file
+{"results": {"blimp_principle_A_case_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_case_2": 0}}
diff --git a/tests/testdata/blimp_principle_A_domain_1-v0-loglikelihood b/tests/testdata/blimp_principle_A_domain_1-v0-loglikelihood
index 6b900d05f4..0ac838c09e 100644
--- a/tests/testdata/blimp_principle_A_domain_1-v0-loglikelihood
+++ b/tests/testdata/blimp_principle_A_domain_1-v0-loglikelihood
@@ -1 +1 @@
-290e7eddacea4ec16989af697f2ee3373fdd9aef4b452bf887184c6e2f6e7d9d
\ No newline at end of file
+290e7eddacea4ec16989af697f2ee3373fdd9aef4b452bf887184c6e2f6e7d9d
diff --git a/tests/testdata/blimp_principle_A_domain_1-v0-res.json b/tests/testdata/blimp_principle_A_domain_1-v0-res.json
index 9efbffb50f..546ccc8b22 100644
--- a/tests/testdata/blimp_principle_A_domain_1-v0-res.json
+++ b/tests/testdata/blimp_principle_A_domain_1-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_principle_A_domain_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_domain_1": 0}}
\ No newline at end of file
+{"results": {"blimp_principle_A_domain_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_domain_1": 0}}
diff --git a/tests/testdata/blimp_principle_A_domain_2-v0-loglikelihood b/tests/testdata/blimp_principle_A_domain_2-v0-loglikelihood
index 0e201fe3c8..c227edddc2 100644
--- a/tests/testdata/blimp_principle_A_domain_2-v0-loglikelihood
+++ b/tests/testdata/blimp_principle_A_domain_2-v0-loglikelihood
@@ -1 +1 @@
-eb5ddf0a97982373ab1a4e58267cfcdebdecdb86c376dfd5ebf46737c9d3ee12
\ No newline at end of file
+eb5ddf0a97982373ab1a4e58267cfcdebdecdb86c376dfd5ebf46737c9d3ee12
diff --git a/tests/testdata/blimp_principle_A_domain_2-v0-res.json b/tests/testdata/blimp_principle_A_domain_2-v0-res.json
index 1bda1a2aa9..763f00c1da 100644
--- a/tests/testdata/blimp_principle_A_domain_2-v0-res.json
+++ b/tests/testdata/blimp_principle_A_domain_2-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_principle_A_domain_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_domain_2": 0}}
\ No newline at end of file
+{"results": {"blimp_principle_A_domain_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_domain_2": 0}}
diff --git a/tests/testdata/blimp_principle_A_domain_3-v0-loglikelihood b/tests/testdata/blimp_principle_A_domain_3-v0-loglikelihood
index c37e936401..d525bd3c1a 100644
--- a/tests/testdata/blimp_principle_A_domain_3-v0-loglikelihood
+++ b/tests/testdata/blimp_principle_A_domain_3-v0-loglikelihood
@@ -1 +1 @@
-38454befedcf1f3f6ef27d3bef9ccfdfb3e94a7ab32d86a63493a920d2d50093
\ No newline at end of file
+38454befedcf1f3f6ef27d3bef9ccfdfb3e94a7ab32d86a63493a920d2d50093
diff --git a/tests/testdata/blimp_principle_A_domain_3-v0-res.json b/tests/testdata/blimp_principle_A_domain_3-v0-res.json
index 77c4bf916a..9e11de23aa 100644
--- a/tests/testdata/blimp_principle_A_domain_3-v0-res.json
+++ b/tests/testdata/blimp_principle_A_domain_3-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_principle_A_domain_3": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_domain_3": 0}}
\ No newline at end of file
+{"results": {"blimp_principle_A_domain_3": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_domain_3": 0}}
diff --git a/tests/testdata/blimp_principle_A_reconstruction-v0-loglikelihood b/tests/testdata/blimp_principle_A_reconstruction-v0-loglikelihood
index f8d1d1f87f..2a19fcc794 100644
--- a/tests/testdata/blimp_principle_A_reconstruction-v0-loglikelihood
+++ b/tests/testdata/blimp_principle_A_reconstruction-v0-loglikelihood
@@ -1 +1 @@
-894efedfd8750d5b8de6157f9b2ed2b51b5290d3a78ea9b041fc62d34e96efbc
\ No newline at end of file
+894efedfd8750d5b8de6157f9b2ed2b51b5290d3a78ea9b041fc62d34e96efbc
diff --git a/tests/testdata/blimp_principle_A_reconstruction-v0-res.json b/tests/testdata/blimp_principle_A_reconstruction-v0-res.json
index 0e7d8db1e2..8d0a8ed044 100644
--- a/tests/testdata/blimp_principle_A_reconstruction-v0-res.json
+++ b/tests/testdata/blimp_principle_A_reconstruction-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_principle_A_reconstruction": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_reconstruction": 0}}
\ No newline at end of file
+{"results": {"blimp_principle_A_reconstruction": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_reconstruction": 0}}
diff --git a/tests/testdata/blimp_regular_plural_subject_verb_agreement_1-v0-loglikelihood b/tests/testdata/blimp_regular_plural_subject_verb_agreement_1-v0-loglikelihood
index 0a32ca7f97..69aea9868d 100644
--- a/tests/testdata/blimp_regular_plural_subject_verb_agreement_1-v0-loglikelihood
+++ b/tests/testdata/blimp_regular_plural_subject_verb_agreement_1-v0-loglikelihood
@@ -1 +1 @@
-5bc0441f31e32443cf761bca6e961d504e1e84b15aa4e1d79e5c8ed5b4c2aa3a
\ No newline at end of file
+5bc0441f31e32443cf761bca6e961d504e1e84b15aa4e1d79e5c8ed5b4c2aa3a
diff --git a/tests/testdata/blimp_regular_plural_subject_verb_agreement_1-v0-res.json b/tests/testdata/blimp_regular_plural_subject_verb_agreement_1-v0-res.json
index 16fed715d4..f26f44b74d 100644
--- a/tests/testdata/blimp_regular_plural_subject_verb_agreement_1-v0-res.json
+++ b/tests/testdata/blimp_regular_plural_subject_verb_agreement_1-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_regular_plural_subject_verb_agreement_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_regular_plural_subject_verb_agreement_1": 0}}
\ No newline at end of file
+{"results": {"blimp_regular_plural_subject_verb_agreement_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_regular_plural_subject_verb_agreement_1": 0}}
diff --git a/tests/testdata/blimp_regular_plural_subject_verb_agreement_2-v0-loglikelihood b/tests/testdata/blimp_regular_plural_subject_verb_agreement_2-v0-loglikelihood
index 4b6525a10e..4f1faa5602 100644
--- a/tests/testdata/blimp_regular_plural_subject_verb_agreement_2-v0-loglikelihood
+++ b/tests/testdata/blimp_regular_plural_subject_verb_agreement_2-v0-loglikelihood
@@ -1 +1 @@
-f69d9891f59872538962221fccc425b07df7cfbd83cdc546ce83e6b0e9a93f7c
\ No newline at end of file
+f69d9891f59872538962221fccc425b07df7cfbd83cdc546ce83e6b0e9a93f7c
diff --git a/tests/testdata/blimp_regular_plural_subject_verb_agreement_2-v0-res.json b/tests/testdata/blimp_regular_plural_subject_verb_agreement_2-v0-res.json
index 6d64b97e20..d3710f6dca 100644
--- a/tests/testdata/blimp_regular_plural_subject_verb_agreement_2-v0-res.json
+++ b/tests/testdata/blimp_regular_plural_subject_verb_agreement_2-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_regular_plural_subject_verb_agreement_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_regular_plural_subject_verb_agreement_2": 0}}
\ No newline at end of file
+{"results": {"blimp_regular_plural_subject_verb_agreement_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_regular_plural_subject_verb_agreement_2": 0}}
diff --git a/tests/testdata/blimp_sentential_negation_npi_licensor_present-v0-loglikelihood b/tests/testdata/blimp_sentential_negation_npi_licensor_present-v0-loglikelihood
index 8e254de7a7..bc436683b1 100644
--- a/tests/testdata/blimp_sentential_negation_npi_licensor_present-v0-loglikelihood
+++ b/tests/testdata/blimp_sentential_negation_npi_licensor_present-v0-loglikelihood
@@ -1 +1 @@
-e6666c5657215ff4bfd646b8ee3ae6df956e71c0be9ab1c287fb1b68291dd0d1
\ No newline at end of file
+e6666c5657215ff4bfd646b8ee3ae6df956e71c0be9ab1c287fb1b68291dd0d1
diff --git a/tests/testdata/blimp_sentential_negation_npi_licensor_present-v0-res.json b/tests/testdata/blimp_sentential_negation_npi_licensor_present-v0-res.json
index 4305bb313c..5147e675df 100644
--- a/tests/testdata/blimp_sentential_negation_npi_licensor_present-v0-res.json
+++ b/tests/testdata/blimp_sentential_negation_npi_licensor_present-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_sentential_negation_npi_licensor_present": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_sentential_negation_npi_licensor_present": 0}}
\ No newline at end of file
+{"results": {"blimp_sentential_negation_npi_licensor_present": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_sentential_negation_npi_licensor_present": 0}}
diff --git a/tests/testdata/blimp_sentential_negation_npi_scope-v0-loglikelihood b/tests/testdata/blimp_sentential_negation_npi_scope-v0-loglikelihood
index c7aa260f91..c8ceed839e 100644
--- a/tests/testdata/blimp_sentential_negation_npi_scope-v0-loglikelihood
+++ b/tests/testdata/blimp_sentential_negation_npi_scope-v0-loglikelihood
@@ -1 +1 @@
-32fcbd0a1c6e664af2751bad552587b5ca3911973b07f4fb2cf0a2acd3de5349
\ No newline at end of file
+32fcbd0a1c6e664af2751bad552587b5ca3911973b07f4fb2cf0a2acd3de5349
diff --git a/tests/testdata/blimp_sentential_negation_npi_scope-v0-res.json b/tests/testdata/blimp_sentential_negation_npi_scope-v0-res.json
index fcaf915f36..c5c869e576 100644
--- a/tests/testdata/blimp_sentential_negation_npi_scope-v0-res.json
+++ b/tests/testdata/blimp_sentential_negation_npi_scope-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_sentential_negation_npi_scope": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_sentential_negation_npi_scope": 0}}
\ No newline at end of file
+{"results": {"blimp_sentential_negation_npi_scope": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_sentential_negation_npi_scope": 0}}
diff --git a/tests/testdata/blimp_sentential_subject_island-v0-loglikelihood b/tests/testdata/blimp_sentential_subject_island-v0-loglikelihood
index 6220172936..796f881d3c 100644
--- a/tests/testdata/blimp_sentential_subject_island-v0-loglikelihood
+++ b/tests/testdata/blimp_sentential_subject_island-v0-loglikelihood
@@ -1 +1 @@
-80f5f98fad26240de2767fe58c4b18d864df41cbfa76f06c84c3fce9f14f4833
\ No newline at end of file
+80f5f98fad26240de2767fe58c4b18d864df41cbfa76f06c84c3fce9f14f4833
diff --git a/tests/testdata/blimp_sentential_subject_island-v0-res.json b/tests/testdata/blimp_sentential_subject_island-v0-res.json
index a7f8f1825a..d2e011f01c 100644
--- a/tests/testdata/blimp_sentential_subject_island-v0-res.json
+++ b/tests/testdata/blimp_sentential_subject_island-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_sentential_subject_island": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_sentential_subject_island": 0}}
\ No newline at end of file
+{"results": {"blimp_sentential_subject_island": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_sentential_subject_island": 0}}
diff --git a/tests/testdata/blimp_superlative_quantifiers_1-v0-loglikelihood b/tests/testdata/blimp_superlative_quantifiers_1-v0-loglikelihood
index b7d2819cb3..31f880fb55 100644
--- a/tests/testdata/blimp_superlative_quantifiers_1-v0-loglikelihood
+++ b/tests/testdata/blimp_superlative_quantifiers_1-v0-loglikelihood
@@ -1 +1 @@
-8a01f6a5ea87a01c0c9b0c7b3bc4de4711bf0ff050976976651182b9ed34a0d4
\ No newline at end of file
+8a01f6a5ea87a01c0c9b0c7b3bc4de4711bf0ff050976976651182b9ed34a0d4
diff --git a/tests/testdata/blimp_superlative_quantifiers_1-v0-res.json b/tests/testdata/blimp_superlative_quantifiers_1-v0-res.json
index b69d445f3c..99045a76c6 100644
--- a/tests/testdata/blimp_superlative_quantifiers_1-v0-res.json
+++ b/tests/testdata/blimp_superlative_quantifiers_1-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_superlative_quantifiers_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_superlative_quantifiers_1": 0}}
\ No newline at end of file
+{"results": {"blimp_superlative_quantifiers_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_superlative_quantifiers_1": 0}}
diff --git a/tests/testdata/blimp_superlative_quantifiers_2-v0-loglikelihood b/tests/testdata/blimp_superlative_quantifiers_2-v0-loglikelihood
index 4a8317f0b3..c72ce9158f 100644
--- a/tests/testdata/blimp_superlative_quantifiers_2-v0-loglikelihood
+++ b/tests/testdata/blimp_superlative_quantifiers_2-v0-loglikelihood
@@ -1 +1 @@
-59c20ff0f632cf42afc74ecc682cf92e5e740417b01e6cf9a610a3bc544d2ea5
\ No newline at end of file
+59c20ff0f632cf42afc74ecc682cf92e5e740417b01e6cf9a610a3bc544d2ea5
diff --git a/tests/testdata/blimp_superlative_quantifiers_2-v0-res.json b/tests/testdata/blimp_superlative_quantifiers_2-v0-res.json
index 2733d251cf..02f83fd3b8 100644
--- a/tests/testdata/blimp_superlative_quantifiers_2-v0-res.json
+++ b/tests/testdata/blimp_superlative_quantifiers_2-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_superlative_quantifiers_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_superlative_quantifiers_2": 0}}
\ No newline at end of file
+{"results": {"blimp_superlative_quantifiers_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_superlative_quantifiers_2": 0}}
diff --git a/tests/testdata/blimp_tough_vs_raising_1-v0-loglikelihood b/tests/testdata/blimp_tough_vs_raising_1-v0-loglikelihood
index a26cb174a0..34c4914813 100644
--- a/tests/testdata/blimp_tough_vs_raising_1-v0-loglikelihood
+++ b/tests/testdata/blimp_tough_vs_raising_1-v0-loglikelihood
@@ -1 +1 @@
-973fe56534fdef1207f0fc08dd09a210304c55f33c6cbb17552754bf54f11c86
\ No newline at end of file
+973fe56534fdef1207f0fc08dd09a210304c55f33c6cbb17552754bf54f11c86
diff --git a/tests/testdata/blimp_tough_vs_raising_1-v0-res.json b/tests/testdata/blimp_tough_vs_raising_1-v0-res.json
index 44ea10c138..68575cfb71 100644
--- a/tests/testdata/blimp_tough_vs_raising_1-v0-res.json
+++ b/tests/testdata/blimp_tough_vs_raising_1-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_tough_vs_raising_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_tough_vs_raising_1": 0}}
\ No newline at end of file
+{"results": {"blimp_tough_vs_raising_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_tough_vs_raising_1": 0}}
diff --git a/tests/testdata/blimp_tough_vs_raising_2-v0-loglikelihood b/tests/testdata/blimp_tough_vs_raising_2-v0-loglikelihood
index 3b0f976352..376afc1f89 100644
--- a/tests/testdata/blimp_tough_vs_raising_2-v0-loglikelihood
+++ b/tests/testdata/blimp_tough_vs_raising_2-v0-loglikelihood
@@ -1 +1 @@
-d255a10a34f14d77d9526604a17b0f6747d32f62fc2e3a09e9ab10054535fd45
\ No newline at end of file
+d255a10a34f14d77d9526604a17b0f6747d32f62fc2e3a09e9ab10054535fd45
diff --git a/tests/testdata/blimp_tough_vs_raising_2-v0-res.json b/tests/testdata/blimp_tough_vs_raising_2-v0-res.json
index c9b8c7d061..9db116b8a0 100644
--- a/tests/testdata/blimp_tough_vs_raising_2-v0-res.json
+++ b/tests/testdata/blimp_tough_vs_raising_2-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_tough_vs_raising_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_tough_vs_raising_2": 0}}
\ No newline at end of file
+{"results": {"blimp_tough_vs_raising_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_tough_vs_raising_2": 0}}
diff --git a/tests/testdata/blimp_transitive-v0-loglikelihood b/tests/testdata/blimp_transitive-v0-loglikelihood
index 98156dcf1e..0d464eaed5 100644
--- a/tests/testdata/blimp_transitive-v0-loglikelihood
+++ b/tests/testdata/blimp_transitive-v0-loglikelihood
@@ -1 +1 @@
-d0d47fe40a7ee558ba782edbc4f49f7d9123c8472a36decc97f8ab142b45b9d8
\ No newline at end of file
+d0d47fe40a7ee558ba782edbc4f49f7d9123c8472a36decc97f8ab142b45b9d8
diff --git a/tests/testdata/blimp_transitive-v0-res.json b/tests/testdata/blimp_transitive-v0-res.json
index d2c99ab803..e93acad90a 100644
--- a/tests/testdata/blimp_transitive-v0-res.json
+++ b/tests/testdata/blimp_transitive-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_transitive": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_transitive": 0}}
\ No newline at end of file
+{"results": {"blimp_transitive": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_transitive": 0}}
diff --git a/tests/testdata/blimp_wh_island-v0-loglikelihood b/tests/testdata/blimp_wh_island-v0-loglikelihood
index d27f1316dc..c3006b2b22 100644
--- a/tests/testdata/blimp_wh_island-v0-loglikelihood
+++ b/tests/testdata/blimp_wh_island-v0-loglikelihood
@@ -1 +1 @@
-91a9e4b60b0f3572a7fdbd7648d0e69f36e5eb34db715315b0082558d7ed8b65
\ No newline at end of file
+91a9e4b60b0f3572a7fdbd7648d0e69f36e5eb34db715315b0082558d7ed8b65
diff --git a/tests/testdata/blimp_wh_island-v0-res.json b/tests/testdata/blimp_wh_island-v0-res.json
index 1d50683774..73dc8131e1 100644
--- a/tests/testdata/blimp_wh_island-v0-res.json
+++ b/tests/testdata/blimp_wh_island-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_wh_island": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_island": 0}}
\ No newline at end of file
+{"results": {"blimp_wh_island": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_island": 0}}
diff --git a/tests/testdata/blimp_wh_questions_object_gap-v0-loglikelihood b/tests/testdata/blimp_wh_questions_object_gap-v0-loglikelihood
index c3e6af12f2..dca61bd8a0 100644
--- a/tests/testdata/blimp_wh_questions_object_gap-v0-loglikelihood
+++ b/tests/testdata/blimp_wh_questions_object_gap-v0-loglikelihood
@@ -1 +1 @@
-4d4aaa0274ccd485ff8430ed61b8f83806febe18c16616c7d050f637a0463eba
\ No newline at end of file
+4d4aaa0274ccd485ff8430ed61b8f83806febe18c16616c7d050f637a0463eba
diff --git a/tests/testdata/blimp_wh_questions_object_gap-v0-res.json b/tests/testdata/blimp_wh_questions_object_gap-v0-res.json
index 60228b7918..3acb9d0117 100644
--- a/tests/testdata/blimp_wh_questions_object_gap-v0-res.json
+++ b/tests/testdata/blimp_wh_questions_object_gap-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_wh_questions_object_gap": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_questions_object_gap": 0}}
\ No newline at end of file
+{"results": {"blimp_wh_questions_object_gap": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_questions_object_gap": 0}}
diff --git a/tests/testdata/blimp_wh_questions_subject_gap-v0-loglikelihood b/tests/testdata/blimp_wh_questions_subject_gap-v0-loglikelihood
index 1a88f8fa87..48ad6a252e 100644
--- a/tests/testdata/blimp_wh_questions_subject_gap-v0-loglikelihood
+++ b/tests/testdata/blimp_wh_questions_subject_gap-v0-loglikelihood
@@ -1 +1 @@
-d5486ffcc075cad4302e37ece9bbf5b2063c0b5a48e76c8e1dd365e22a5a48fc
\ No newline at end of file
+d5486ffcc075cad4302e37ece9bbf5b2063c0b5a48e76c8e1dd365e22a5a48fc
diff --git a/tests/testdata/blimp_wh_questions_subject_gap-v0-res.json b/tests/testdata/blimp_wh_questions_subject_gap-v0-res.json
index 4b21da71d5..50869d2c68 100644
--- a/tests/testdata/blimp_wh_questions_subject_gap-v0-res.json
+++ b/tests/testdata/blimp_wh_questions_subject_gap-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_wh_questions_subject_gap": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_questions_subject_gap": 0}}
\ No newline at end of file
+{"results": {"blimp_wh_questions_subject_gap": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_questions_subject_gap": 0}}
diff --git a/tests/testdata/blimp_wh_questions_subject_gap_long_distance-v0-loglikelihood b/tests/testdata/blimp_wh_questions_subject_gap_long_distance-v0-loglikelihood
index f83ed1fb74..ada08f3908 100644
--- a/tests/testdata/blimp_wh_questions_subject_gap_long_distance-v0-loglikelihood
+++ b/tests/testdata/blimp_wh_questions_subject_gap_long_distance-v0-loglikelihood
@@ -1 +1 @@
-37483dfda688b62ad27161c9fc1e1e7710c5a6e6a7cd3474df119bcafd30e97f
\ No newline at end of file
+37483dfda688b62ad27161c9fc1e1e7710c5a6e6a7cd3474df119bcafd30e97f
diff --git a/tests/testdata/blimp_wh_questions_subject_gap_long_distance-v0-res.json b/tests/testdata/blimp_wh_questions_subject_gap_long_distance-v0-res.json
index fe6bbf95e5..c487505f5b 100644
--- a/tests/testdata/blimp_wh_questions_subject_gap_long_distance-v0-res.json
+++ b/tests/testdata/blimp_wh_questions_subject_gap_long_distance-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_wh_questions_subject_gap_long_distance": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_questions_subject_gap_long_distance": 0}}
\ No newline at end of file
+{"results": {"blimp_wh_questions_subject_gap_long_distance": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_questions_subject_gap_long_distance": 0}}
diff --git a/tests/testdata/blimp_wh_vs_that_no_gap-v0-loglikelihood b/tests/testdata/blimp_wh_vs_that_no_gap-v0-loglikelihood
index 5f40ea63f1..4db9b2ae80 100644
--- a/tests/testdata/blimp_wh_vs_that_no_gap-v0-loglikelihood
+++ b/tests/testdata/blimp_wh_vs_that_no_gap-v0-loglikelihood
@@ -1 +1 @@
-d1d3e439b2020ef5ed232bfebbcc9634adc5117e9eb61e38fdbbe2c8ea128d54
\ No newline at end of file
+d1d3e439b2020ef5ed232bfebbcc9634adc5117e9eb61e38fdbbe2c8ea128d54
diff --git a/tests/testdata/blimp_wh_vs_that_no_gap-v0-res.json b/tests/testdata/blimp_wh_vs_that_no_gap-v0-res.json
index dfd3f66b77..c7112f5e57 100644
--- a/tests/testdata/blimp_wh_vs_that_no_gap-v0-res.json
+++ b/tests/testdata/blimp_wh_vs_that_no_gap-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_wh_vs_that_no_gap": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_vs_that_no_gap": 0}}
\ No newline at end of file
+{"results": {"blimp_wh_vs_that_no_gap": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_vs_that_no_gap": 0}}
diff --git a/tests/testdata/blimp_wh_vs_that_no_gap_long_distance-v0-loglikelihood b/tests/testdata/blimp_wh_vs_that_no_gap_long_distance-v0-loglikelihood
index 13359ac3d2..e377494473 100644
--- a/tests/testdata/blimp_wh_vs_that_no_gap_long_distance-v0-loglikelihood
+++ b/tests/testdata/blimp_wh_vs_that_no_gap_long_distance-v0-loglikelihood
@@ -1 +1 @@
-a142cc2a6fcd93230b650927b07367cad957b8f3f42cb4072151da53dea301df
\ No newline at end of file
+a142cc2a6fcd93230b650927b07367cad957b8f3f42cb4072151da53dea301df
diff --git a/tests/testdata/blimp_wh_vs_that_no_gap_long_distance-v0-res.json b/tests/testdata/blimp_wh_vs_that_no_gap_long_distance-v0-res.json
index de9e800718..f23846a765 100644
--- a/tests/testdata/blimp_wh_vs_that_no_gap_long_distance-v0-res.json
+++ b/tests/testdata/blimp_wh_vs_that_no_gap_long_distance-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_wh_vs_that_no_gap_long_distance": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_vs_that_no_gap_long_distance": 0}}
\ No newline at end of file
+{"results": {"blimp_wh_vs_that_no_gap_long_distance": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_vs_that_no_gap_long_distance": 0}}
diff --git a/tests/testdata/blimp_wh_vs_that_with_gap-v0-loglikelihood b/tests/testdata/blimp_wh_vs_that_with_gap-v0-loglikelihood
index 4c15f2283e..0c5c8f1235 100644
--- a/tests/testdata/blimp_wh_vs_that_with_gap-v0-loglikelihood
+++ b/tests/testdata/blimp_wh_vs_that_with_gap-v0-loglikelihood
@@ -1 +1 @@
-d41a9b85e4c31e445bf9b46b8642df02203ccc02b4a9b254bf76066d5c54b4b7
\ No newline at end of file
+d41a9b85e4c31e445bf9b46b8642df02203ccc02b4a9b254bf76066d5c54b4b7
diff --git a/tests/testdata/blimp_wh_vs_that_with_gap-v0-res.json b/tests/testdata/blimp_wh_vs_that_with_gap-v0-res.json
index 14befd4ab6..3a3888e719 100644
--- a/tests/testdata/blimp_wh_vs_that_with_gap-v0-res.json
+++ b/tests/testdata/blimp_wh_vs_that_with_gap-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_wh_vs_that_with_gap": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_vs_that_with_gap": 0}}
\ No newline at end of file
+{"results": {"blimp_wh_vs_that_with_gap": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_vs_that_with_gap": 0}}
diff --git a/tests/testdata/blimp_wh_vs_that_with_gap_long_distance-v0-loglikelihood b/tests/testdata/blimp_wh_vs_that_with_gap_long_distance-v0-loglikelihood
index 34b9591396..976005c758 100644
--- a/tests/testdata/blimp_wh_vs_that_with_gap_long_distance-v0-loglikelihood
+++ b/tests/testdata/blimp_wh_vs_that_with_gap_long_distance-v0-loglikelihood
@@ -1 +1 @@
-eed67491bdf493a1dad8f1d9766bc7bd0e79946365b833c0f7eb81ac998e3dca
\ No newline at end of file
+eed67491bdf493a1dad8f1d9766bc7bd0e79946365b833c0f7eb81ac998e3dca
diff --git a/tests/testdata/blimp_wh_vs_that_with_gap_long_distance-v0-res.json b/tests/testdata/blimp_wh_vs_that_with_gap_long_distance-v0-res.json
index 95a2c0c7e1..c5d2393594 100644
--- a/tests/testdata/blimp_wh_vs_that_with_gap_long_distance-v0-res.json
+++ b/tests/testdata/blimp_wh_vs_that_with_gap_long_distance-v0-res.json
@@ -1 +1 @@
-{"results": {"blimp_wh_vs_that_with_gap_long_distance": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_vs_that_with_gap_long_distance": 0}}
\ No newline at end of file
+{"results": {"blimp_wh_vs_that_with_gap_long_distance": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_vs_that_with_gap_long_distance": 0}}