Skip to content

Commit

Permalink
add doc to lighteval_tasks
Browse files Browse the repository at this point in the history
  • Loading branch information
NathanHB committed Jan 30, 2024
1 parent 8b36fe0 commit 4176f1e
Show file tree
Hide file tree
Showing 2 changed files with 128 additions and 19 deletions.
2 changes: 1 addition & 1 deletion src/lighteval/metrics/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -501,7 +501,7 @@ def higher_is_better():
return res

@staticmethod
def corpus_level_fns():
def corpus_level_fns() -> dict[str, callable]:
res = {}
for metric in Metrics:
if metric.value.category == MetricCategory.IGNORED:
Expand Down
145 changes: 127 additions & 18 deletions src/lighteval/tasks/lighteval_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,19 @@


class LightevalTask:
def __init__(self, name: str, cfg: dict, cache_dir: str = None, custom_tasks_module=None):
def __init__(self, name: str, cfg: dict, cache_dir: Optional[str] = None, custom_tasks_module=None):
"""
Initialize a LightEval task.
Args:
name (str): The name of the task.
cfg (dict): The configuration dictionary containing
task-specific settings (from the task_table.json file).
cache_dir (Optional[str], optional): The directory to cache the
dataset. Defaults to None.
custom_tasks_module ([type], optional): A custom module
containing task-specific functions. Defaults to None.
"""
self.name = name
self.VERSION = 0
self.is_main_process = False
Expand Down Expand Up @@ -108,24 +120,53 @@ def cfg(self):
return self._cfg

def doc_to_text_without_instructions(self, doc: Doc) -> str:
    """
    Strip the instruction prefix from the document's query.

    If the document carries an instruction, the query must start with it;
    the instruction prefix is removed and the remainder returned. Otherwise
    the query is returned unchanged.

    Args:
        doc (Doc): The document to read the query from.

    Returns:
        str: The query without its instruction prefix.

    Raises:
        ValueError: If the query does not start with the stored instruction.
    """
    instruction = doc.instruction
    if instruction is None:
        return doc.query
    if not doc.query.startswith(instruction):
        raise ValueError(f"Prompt query {doc.query} is not starting with instruction {instruction}")
    return doc.query[len(instruction) :]

def doc_to_text_and_instructions(self, doc: Doc) -> Tuple[str, str]:
    """
    Split the document's query into its bare query and its instruction.

    Args:
        doc (Doc): The document to read the query from.

    Returns:
        Tuple[str, str]: ``(query_without_instruction, instruction)``;
        the second element is an empty string when the document has no
        instruction.

    Raises:
        ValueError: If the query does not start with the stored instruction.
    """
    instruction = doc.instruction
    if instruction is None:
        return (doc.query, "")
    if not doc.query.startswith(instruction):
        raise ValueError(f"Prompt query {doc.query} is not starting with instruction {instruction}")
    return (doc.query[len(instruction) :], instruction)

def get_first_possible_fewshot_splits(self, number_of_splits: int = 1) -> list[str]:
"""Parses the possible fewshot split keys in order:
train, then validation keys
and matches them with the available keys.
Returns the first available.
"""
Parses the possible fewshot split keys in order: train, then validation
keys and matches them with the available keys. Returns the first
available.
Args:
number_of_splits (int, optional): The number of splits to return.
Defaults to 1.
Returns:
list[str]: The list of the first available fewshot splits.
"""
# Possible few shot splits are the available splits not used for evaluation
possible_fewshot_splits = [k for k in self.all_available_splits if k not in self.evaluation_split]
Expand All @@ -145,6 +186,17 @@ def get_first_possible_fewshot_splits(self, number_of_splits: int = 1) -> list[s
return None

def _get_docs_from_split(self, keys, few_shots=False) -> list[Doc]:
"""
Get the documents from the dataset for the given keys (splits).
Args:
keys (list): The list of keys (splits).
few_shots (bool, optional): Whether the documents are used for few
shot examples. Defaults to False.
Returns:
list[Doc]: The list of documents.
"""
if self.dataset is None:
self.dataset = download_dataset_worker((self.dataset_path, self.dataset_config_name))

Expand All @@ -159,6 +211,13 @@ def _get_docs_from_split(self, keys, few_shots=False) -> list[Doc]:
return docs

def fewshot_docs(self) -> list[Doc]:
"""
Returns the few shot documents. If the few shot documents are not
available, it gets them from the few shot split or the evaluation split.
Returns:
list[Doc]: The few shot documents.
"""
if self._fewshot_docs is None:
self._fewshot_docs = []

Expand All @@ -170,11 +229,28 @@ def fewshot_docs(self) -> list[Doc]:
return self._fewshot_docs

def eval_docs(self) -> list[Doc]:
    """
    Lazily load and return the evaluation documents.

    On first access the documents are fetched from the evaluation split and
    cached on the instance; later calls return the cached list.

    Returns:
        list[Doc]: The evaluation documents.
    """
    docs = self._docs
    if docs is None:
        docs = self._get_docs_from_split(self.evaluation_split)
        self._docs = docs
    return docs

def doc_to_target(self, formatted_doc: Doc, few_shot: bool = False):
def doc_to_target(self, formatted_doc: Doc, few_shot: bool = False) -> str:
"""
Returns the target of the given document.
Args:
formatted_doc (Doc): The formatted document.
few_shot (bool, optional): Whether the document is used for few
shot examples. Defaults to False.
Returns:
str: The target of the document.
"""
if few_shot:
if formatted_doc.target_for_fewshot_sorting is not None:
return formatted_doc.target_for_fewshot_sorting
Expand All @@ -184,6 +260,16 @@ def doc_to_target(self, formatted_doc: Doc, few_shot: bool = False):

# Requests
def get_request_type(self) -> list[RequestType]:
"""
Returns the request types for the task.
Returns:
list[RequestType]: The request types for the task.
Raises:
NotImplementedError: If the request type is not implemented for the
task.
"""
request_types = []
if self.has_metric_category[MetricCategory.TARGET_PERPLEXITY]:
request_types.append(RequestType.LOGLIKELIHOOD)
Expand All @@ -207,7 +293,7 @@ def construct_requests(
self, formatted_doc: Doc, context: str, document_id_seed: str, current_task_name: str
) -> List[Request]:
"""
Constructs a list of requests based on the given parameters.
Constructs a list of requests from the task based on the given parameters.
Args:
formatted_doc (Doc): The formatted document almost straight from the dataset.
Expand Down Expand Up @@ -282,7 +368,17 @@ def construct_requests(

return requests

def process_results(self, formatted_doc: Doc, results: list[ModelReturn]):
def process_results(self, formatted_doc: Doc, results: list[ModelReturn]) -> dict[str, float]:
"""
Processes the results of the task. and stores them in the output dict.
Args:
formatted_doc (Doc): The formatted document of the task.
results (list[ModelReturn]): The results of the task, returned by the model class after evaluation.
Returns:
dict[str, float]: The output dictionary containing the results of the task.
"""
# Metrics management is done in metrics.__init__
outputs = {}
if self.has_metric_category[MetricCategory.TARGET_PERPLEXITY]:
Expand Down Expand Up @@ -319,6 +415,10 @@ def process_results(self, formatted_doc: Doc, results: list[ModelReturn]):
return outputs

def aggregation(self):
"""
Return a dict with metric name and its aggregation function for all
metrics
"""
return Metrics.corpus_level_fns()

@staticmethod
Expand Down Expand Up @@ -349,6 +449,10 @@ def load_datasets(tasks: list["LightevalTask"], dataset_loading_processes: int =


def download_dataset_worker(args):
"""
Worker function to download a dataset from the HuggingFace Hub.
Used for parallel dataset loading.
"""
dataset_path, dataset_config_name = args
dataset = load_dataset(
path=dataset_path,
Expand All @@ -370,22 +474,27 @@ def create_requests_from_tasks( # noqa: C901
use_chat_template: bool,
) -> Tuple[dict[RequestType, list[Request]], dict[TaskExampleId, Doc]]:
"""
Takes a task dict and a fewshot dict and returns a dict of requests, a dict of docs, and a dict of requests origins.
The construction of prompts and thus the managing of few shots is done here.
Takes a task dict and a fewshot dict and returns a dict of requests, a dict
of docs, and a dict of requests origins. The construction of prompts and
thus the managing of few shots is done here.
Args:
task_dict (_type_): _description_
fewshot_dict (_type_): _description_
num_fewshot_seeds (_type_): _description_
lm (_type_): _description_
max_samples (_type_): _description_
evaluation_tracker (_type_): _description_
task_dict (dict[str, LightevalTask]): A dictionary of tasks.
fewshot_dict (dict[str, list[Tuple[int, bool]]]): A dictionary of few
shot examples.
num_fewshot_seeds (int): The number of few shot seeds.
lm (BaseModel): The language model.
max_samples (int): The maximum number of samples.
evaluation_tracker (EvaluationTracker): The evaluation tracker.
use_chat_template (bool): Whether to use the chat template.
Raises:
RuntimeError: _description_
NotImplementedError: If the request type is not implemented for the
task.
Returns:
_type_: _description_
Tuple[dict[RequestType, list[Request]], dict[TaskExampleId, Doc]]: A
tuple containing the requests and the documents.
"""
docs: dict[TaskExampleId, Doc] = {}
requests: dict[RequestType, list[Request]] = collections.defaultdict(list)
Expand Down

0 comments on commit 4176f1e

Please sign in to comment.