
Commit 843a0f8

Fix a few typos and do a tiny refactor (#187)
1 parent 0bceaee commit 843a0f8

14 files changed: 34 additions & 34 deletions

run_evals_accelerate.py

Lines changed: 8 additions & 3 deletions
@@ -51,7 +51,12 @@ def get_parser():
    parser.add_argument(
        "--public_run", default=False, action="store_true", help="Push results and details to a public repo"
    )
-    parser.add_argument("--cache_dir", type=str, default=CACHE_DIR)
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=CACHE_DIR,
+        help="Cache directory for downloaded datasets & model, defaults to `HF_HOME` environment variable",
+    )
    parser.add_argument(
        "--results_org",
        type=str,
@@ -65,13 +70,13 @@ def get_parser():
        "--custom_tasks",
        type=str,
        default=None,
-        help="Path to a file with custom tasks (a TASK list of dict and potentially prompt formating functions)",
+        help="Path to a file with custom tasks (a TASK list of dict and potentially prompt formatting functions)",
    )
    group.add_argument(
        "--tasks",
        type=str,
        default=None,
-        help="Id of a task, e.g. 'original|mmlu:abstract_algebra|5|0' or path to a texte file with a list of tasks",
+        help="Comma-separated ids of tasks, e.g. 'original|mmlu:abstract_algebra|5' or path to a text file with a list of tasks",
    )
    parser.add_argument("--num_fewshot_seeds", type=int, default=1, help="Number of trials the few shots")
    return parser
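For context, a minimal standalone sketch (not part of this commit) of how the two reworked options behave once parsed; the parser below only mirrors the arguments shown in the hunk above, and the `HF_HOME` fallback for `CACHE_DIR` is an assumption based on the new help text.

import argparse
import os

CACHE_DIR = os.getenv("HF_HOME")  # assumed fallback, per the new --cache_dir help text

parser = argparse.ArgumentParser()
parser.add_argument(
    "--cache_dir",
    type=str,
    default=CACHE_DIR,
    help="Cache directory for downloaded datasets & model, defaults to `HF_HOME` environment variable",
)
parser.add_argument(
    "--tasks",
    type=str,
    default=None,
    help="Comma-separated ids of tasks, e.g. 'original|mmlu:abstract_algebra|5' or path to a text file with a list of tasks",
)

args = parser.parse_args(["--tasks", "original|mmlu:abstract_algebra|5", "--cache_dir", "/tmp/lighteval-cache"])
print(args.tasks)      # 'original|mmlu:abstract_algebra|5'
print(args.cache_dir)  # '/tmp/lighteval-cache'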

src/lighteval/evaluator.py

Lines changed: 1 addition & 1 deletion
@@ -67,7 +67,7 @@ def evaluate( # noqa: C901
    # A request output tupe is a Tuple where the first element is the index of
    # the request for one document of one task i.e.
    # task: "arc_easy", doc: "0"# request: "0" -> request_index = 0,
-    # We can have multiple request per doc for multi choice tasks for example.
+    # We can have multiple requests per doc for multi choice tasks for example.

    # all responses for each (task, doc)
    RequestIndexModelResponseTuple = collections.namedtuple(
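A toy illustration (not from the repository) of the corrected comment: a single multiple-choice document fans out into one request per choice, all sharing the same (task, doc) key but carrying their own request index. Plain tuples stand in for lighteval's request objects here.

doc_choices = ["(A) 2", "(B) 4", "(C) 8"]
requests = [
    ("arc_easy", "0", request_index, choice)  # (task, doc, request_index, choice)
    for request_index, choice in enumerate(doc_choices)
]
print(requests)
# [('arc_easy', '0', 0, '(A) 2'), ('arc_easy', '0', 1, '(B) 4'), ('arc_easy', '0', 2, '(C) 8')]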

src/lighteval/logging/evaluation_tracker.py

Lines changed: 1 addition & 1 deletion
@@ -511,7 +511,7 @@ def push_results_to_tensorboard( # noqa: C901
        self, results: dict[str, dict[str, float]], details: dict[str, DetailsLogger.CompiledDetail]
    ):
        if not is_nanotron_available():
-            hlog_warn("You cannot push results to tensorboard with having nanotron installed. Skipping")
+            hlog_warn("You cannot push results to tensorboard without having nanotron installed. Skipping")
            return
        config: Config = self.general_config_logger.config
        lighteval_config = config.lighteval

src/lighteval/metrics/imports/bert_scorer.py

Lines changed: 1 addition & 1 deletion
@@ -163,7 +163,7 @@ def greedy_cos_idf(
    - :param: `ref_masks` (torch.LongTensor): BxKxK, BERT attention mask for
        reference sentences.
    - :param: `ref_idf` (torch.Tensor): BxK, idf score of each word
-        piece in the reference setence
+        piece in the reference sentence
    - :param: `hyp_embedding` (torch.Tensor):
        embeddings of candidate sentences, BxKxd,
        B: batch size, K: longest length, d: bert dimenison

src/lighteval/metrics/judge_prompts.jsonl

Lines changed: 2 additions & 2 deletions
@@ -4,5 +4,5 @@
{"name": "pair-math-v1-multi-turn", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user questions. Your evaluation should consider correctness and helpfulness. You will be given reference answers, the assistant A's answers, the assistant B's answers. Your job is to determine which assistant provides correct and helpful answers to the second user question. Begin your evaluation by comparing both assistants' answers with the reference answers. Identify and correct any mistakes. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_a_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_a_2}\n\n<|The End of Assistant A's Conversation with User|>\n\n\n<|The Start of Assistant B's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant B:\n{answer_b_1}\n\n### User:\n{question_2}\n\n### Assistant B:\n{answer_b_2}\n\n<|The End of Assistant B's Conversation with User|>", "description": "Prompt for multi-turn general questions", "category": "general", "output_format": "[[A]]"}
{"name": "single-v1", "type": "single", "system_prompt": "You are a helpful assistant.", "prompt_template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"}
{"name": "single-math-v1", "type": "single", "system_prompt": "You are a helpful assistant.", "prompt_template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Reference Answer]\n{ref_answer_1}\n[The End of Reference Answer]\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"}
-{"name": "single-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. You evaluation should focus on the assistant's answer to the second user question. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"}
-{"name": "single-math-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. You evaluation should focus on the assistant's answer to the second question. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"}
+{"name": "single-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Your evaluation should focus on the assistant's answer to the second user question. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"}
+{"name": "single-math-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. Your evaluation should focus on the assistant's answer to the second question. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"}

src/lighteval/metrics/metrics.py

Lines changed: 1 addition & 1 deletion
@@ -249,7 +249,7 @@ class Metrics(Enum):
        use_case=MetricUseCase.SUMMARIZATION,
        sample_level_fn=JudgeLLM(
            judge_model_name="gpt-3.5-turbo",
-            template_path=os.path.join(os.path.dirname(__file__), "", "judge_prompts.jsonl"),
+            template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
            multi_turn=False,
        ).compute,
        corpus_level_fn={
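The dropped empty component is a no-op for os.path.join, so the resulting template path is unchanged; a quick standalone check (the base directory below is made up):

import os

base = "/site-packages/lighteval/metrics"  # hypothetical value of os.path.dirname(__file__)
assert os.path.join(base, "", "judge_prompts.jsonl") == os.path.join(base, "judge_prompts.jsonl")
print(os.path.join(base, "judge_prompts.jsonl"))  # /site-packages/lighteval/metrics/judge_prompts.jsonl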

src/lighteval/metrics/metrics_sample.py

Lines changed: 1 addition & 1 deletion
@@ -644,7 +644,7 @@ def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[
        """
        Compute the score of a generative task using a llm as a judge.
        The generative task can be multiturn with 2 turns max, in that case, we
-        return scores for turn 1 and 2. Also returns user_prompt and judgment
+        return scores for turn 1 and 2. Also returns user_prompt and judgement
        which are ignored later by the aggregator.
        """

src/lighteval/models/base_model.py

Lines changed: 4 additions & 4 deletions
@@ -79,7 +79,7 @@ def __init__(
        self._add_special_tokens = config.add_special_tokens if config.add_special_tokens is not None else False
        self._tokenizer = self._create_auto_tokenizer(config, env_config)

-        # If model_parallel is not set we compare the number of process with the number of GPUs
+        # If model_parallel is not set we compare the number of processes with the number of GPUs
        self.model = self._create_auto_model(config, env_config)
        self.model.eval()
        torch.set_grad_enabled(False)
@@ -819,7 +819,7 @@ def _loglikelihood_tokens(
                )
                res.append(answer)

-            # Clean up GPUS
+            # Clean up GPUs
            del model_output
            del logits
            del batched_inputs
@@ -852,7 +852,7 @@ def prepare_batch_logprob(
            hlog_warn("max_context is None, using max_length")
            max_context = self.max_length

-        # Each sample is concatenated and cut to lenght or padded to max_length
+        # Each sample is concatenated and cut to length or padded to max_length
        for orig_tokens in inputs:
            truncated.append(max(len(orig_tokens) - max_context, 0))

@@ -1030,7 +1030,7 @@ def _loglikelihood_single_token(
                )
                res.append(answer)

-            # Clean up GPUS
+            # Clean up GPUs
            del out
            del batch_probs
            del batched_inputs
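A small numeric check (not part of the commit) of the truncation bookkeeping visible in the prepare_batch_logprob hunk: each sample loses max(len(tokens) - max_context, 0) tokens, so only samples longer than the usable context are cut.

max_context = 8
samples = [[0] * 5, [0] * 8, [0] * 12]  # token id lists of length 5, 8 and 12
truncated = [max(len(tokens) - max_context, 0) for tokens in samples]
print(truncated)  # [0, 0, 4] -> only the 12-token sample is cut, by 4 tokens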

src/lighteval/models/model_config.py

Lines changed: 4 additions & 7 deletions
@@ -85,9 +85,9 @@ class BaseModelConfig:
            If `None`, the default value will be set to `True` for seq2seq models (e.g. T5) and
            `False` for causal models.
        model_parallel (bool, optional, defaults to False):
-            True/False: force to uses or not the `accelerate` library to load a large
+            True/False: force to use or not the `accelerate` library to load a large
            model across multiple devices.
-            Default: None which correspond to comparing the number of process with
+            Default: None which corresponds to comparing the number of processes with
            the number of GPUs. If it's smaller => model-parallelism, else not.
        dtype (Union[str, torch.dtype], optional, defaults to None):):
            Converts the model weights to `dtype`, if specified. Strings get
@@ -277,11 +277,8 @@ def create_model_config(args: Namespace, accelerator: Union["Accelerator", None]

        return BaseModelConfig(**args_dict)

-    if hasattr(args, "model_config") and args.model_config:
-        config = args.model_config["model"]
-    else:
-        with open(args.model_config_path, "r") as f:
-            config = yaml.safe_load(f)["model"]
+    with open(args.model_config_path, "r") as f:
+        config = yaml.safe_load(f)["model"]

    if config["type"] == "tgi":
        return TGIModelConfig(
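An illustrative sketch (not the library's actual code) of the default the corrected docstring describes: when model_parallel is left at None, model parallelism kicks in whenever there are fewer processes than GPUs.

import torch

def resolve_model_parallel(model_parallel, num_processes):
    # An explicit True/False from the user wins; None falls back to the
    # comparison described in the docstring above.
    if model_parallel is not None:
        return model_parallel
    num_gpus = torch.cuda.device_count()
    return num_processes < num_gpus  # fewer processes than GPUs => shard the model

print(resolve_model_parallel(None, num_processes=1))  # True on a multi-GPU machine, else False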

src/lighteval/models/model_loader.py

Lines changed: 2 additions & 2 deletions
@@ -57,8 +57,8 @@ def load_model( # noqa: C901
    config: Union[BaseModelConfig, AdapterModelConfig, DeltaModelConfig, TGIModelConfig, InferenceEndpointModelConfig],
    env_config: EnvConfig,
) -> Tuple[Union[BaseModel, AdapterModel, DeltaModel, ModelClient], ModelInfo]:
-    """Will load either a model from an inference server or a model from a checkpoint. depending
-    on the arguments passed to the program.
+    """Will load either a model from an inference server or a model from a checkpoint, depending
+    on the config type.

    Args:
        args (Namespace): arguments passed to the program
src/lighteval/models/nanotron_model.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -846,7 +846,7 @@ def _loglikelihood_single_token(
846846

847847
tq.desc = f"loglikelihood_single_token Subset {s} Node {dist.get_rank(self.parallel_context.world_pg)} - {human_format(tokens_per_sec)} tokens/s"
848848

849-
# Clean up GPUS
849+
# Clean up GPUs
850850
del out
851851
del batch_probs
852852
del batched_inputs
@@ -1083,7 +1083,7 @@ def _loglikelihood_tokens(
10831083
tokens_per_sec = batched_inputs.numel() / (elapsed_time_per_iteration_ms / 1000)
10841084
tq.desc = f"loglikelihood Subset {s} Node {dist.get_rank(self.parallel_context.world_pg)} - {human_format(tokens_per_sec)} tokens/s"
10851085

1086-
# Clean up GPUS
1086+
# Clean up GPUs
10871087
del out
10881088
del logits
10891089
del batched_inputs

src/lighteval/tasks/lighteval_task.py

Lines changed: 2 additions & 2 deletions
@@ -26,7 +26,7 @@
from dataclasses import dataclass
from multiprocessing import Pool
from pathlib import Path
-from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union

from datasets import load_dataset

@@ -454,7 +454,7 @@ def get_request_type(self) -> list[RequestType]: # noqa C901

    def construct_requests(
        self, formatted_doc: Doc, context: str, document_id_seed: str, current_task_name: str
-    ) -> List[Request]:
+    ) -> Dict[RequestType, List[Request]]:
        """
        Constructs a list of requests from the task based on the given parameters.
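The updated return annotation says requests come back grouped by request type rather than as a flat list; a minimal sketch of that shape, with strings standing in for the real RequestType and Request classes:

from collections import defaultdict
from typing import Dict, List

requests: Dict[str, List[str]] = defaultdict(list)
requests["LOGLIKELIHOOD"].append("loglikelihood request for choice A")
requests["LOGLIKELIHOOD"].append("loglikelihood request for choice B")
requests["GREEDY_UNTIL"].append("generation request")

print({request_type: len(reqs) for request_type, reqs in requests.items()})
# {'LOGLIKELIHOOD': 2, 'GREEDY_UNTIL': 1}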

src/lighteval/tasks/registry.py

Lines changed: 3 additions & 3 deletions
@@ -117,7 +117,7 @@ def get_task_dict(

    Args:
        task_name_list (List[str]): A list of task names.
-        custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file or name of a module to import containing custom tasks or the module it-self
+        custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file or name of a module to import containing custom tasks or the module itself
        extended_tasks (Optional[str]): The path to the extended tasks group of submodules

    Returns:
@@ -159,7 +159,7 @@ def create_custom_tasks_module(custom_tasks: Union[str, ModuleType]) -> ModuleTy
    """Creates a custom task module to load tasks defined by the user in their own file.

    Args:
-        custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file or name of a module to import containing custom tasks or the module it-self
+        custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file or name of a module to import containing custom tasks or the module itself

    Returns:
        ModuleType: The newly imported/created custom tasks modules
@@ -178,7 +178,7 @@ def get_custom_tasks(custom_tasks: Union[str, ModuleType]) -> Tuple[ModuleType,
    """Get all the custom tasks available from the given custom tasks file or module.

    Args:
-        custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file or name of a module to import containing custom tasks or the module it-self
+        custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file or name of a module to import containing custom tasks or the module itself
    """
    custom_tasks_module = create_custom_tasks_module(custom_tasks=custom_tasks)
    tasks_string = ""
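The corrected docstrings say custom_tasks may be a file path, an importable module name, or the module object itself; one way such a union could be resolved is sketched below (an assumption about the approach, not lighteval's implementation).

import importlib
import importlib.util
import os
from types import ModuleType
from typing import Union

def resolve_custom_tasks(custom_tasks: Union[str, ModuleType]) -> ModuleType:
    if isinstance(custom_tasks, ModuleType):
        return custom_tasks  # already a module object: use it as-is
    if os.path.exists(custom_tasks):
        # a file path: load it as an ad-hoc module
        spec = importlib.util.spec_from_file_location("custom_tasks", custom_tasks)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        return module
    # otherwise treat the string as an importable module name
    return importlib.import_module(custom_tasks)

print(resolve_custom_tasks(os))  # a module object is returned unchanged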

src/lighteval/tasks/requests.py

Lines changed: 2 additions & 4 deletions
@@ -143,7 +143,7 @@ class TaskExampleId(NamedTuple):
    Represents the identifier for an example in a task.

    Attributes:
-        task_name (str): The name of the task.
+        task_name (str): The name of the task in `name|num_fewshot` format.
        doc_id_seed (str): The document id with the seed used for few_shot appended at the end.
    """

@@ -187,9 +187,7 @@ def get_golds(self, few_shot: bool = False):
        choices = self.choices
        golds = []
        for gold_ix in gold_indices:
-            local_golds = as_list(choices[gold_ix])
-            for local_gold in local_golds:
-                golds.append(local_gold)
+            golds.extend(as_list(choices[gold_ix]))
        return golds

    def __repr__(self):
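A quick standalone check (not from the commit) that the get_golds refactor above is behaviour-preserving: extending with as_list(...) appends exactly the items the old nested loop did. The as_list below is a stand-in for lighteval's helper, assumed to wrap non-list values in a single-element list.

def as_list(item):
    return item if isinstance(item, list) else [item]

choices = ["yes", ["no", "never"], "maybe"]
gold_indices = [1, 2]

old_style = []
for gold_ix in gold_indices:
    local_golds = as_list(choices[gold_ix])
    for local_gold in local_golds:
        old_style.append(local_gold)

new_style = []
for gold_ix in gold_indices:
    new_style.extend(as_list(choices[gold_ix]))

assert old_style == new_style == ["no", "never", "maybe"]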
