Commit: add code to load glue benchmark for seq2seq generation
Showing 6 changed files with 553 additions and 1 deletion.
fusion_bench/tasks/flan_t5_text_generation/datasets_preprocess.py (71 additions, 0 deletions)
@@ -0,0 +1,71 @@
"""
This script preprocesses any NLP dataset into a text-to-text format.
"""

import json
import os
from pathlib import Path
from typing import Any, Callable, Dict, Union

from transformers import AutoTokenizer


def preprocess(
    tokenizer: AutoTokenizer,
    input_text: str,
    target_text: str,
    tokenizer_kwargs: Dict[str, Any] = None,
):
    """
    Standard preprocess function for a dataset.
    Tokenizes the input and target text and returns a dictionary of model inputs.

    Args:
        tokenizer: An instance of a tokenizer class used to preprocess text data.
        input_text (str): The input text to be tokenized.
        target_text (str, optional): The target text to be tokenized. If None, no labels are returned.
        tokenizer_kwargs (Dict[str, Any], optional): Extra keyword arguments passed to the tokenizer.

    Returns:
        A dictionary of model inputs containing the tokenized inputs and, if target text is given,
        a `labels` tensor with padding positions masked out.
    """
    if tokenizer_kwargs is None:
        tokenizer_kwargs = {}
    model_inputs = tokenizer(input_text, **tokenizer_kwargs)
    if target_text is not None:
        labels = tokenizer(target_text, **tokenizer_kwargs)
        labels = labels["input_ids"]
        # Mask padding positions so they are ignored by the cross-entropy loss.
        # Note: this boolean indexing assumes the tokenizer returns tensors
        # (e.g. tokenizer_kwargs includes return_tensors="pt").
        labels[labels == tokenizer.pad_token_id] = -100
        model_inputs["labels"] = labels
    return model_inputs


class DatasetPreprocessor:
    def __init__(
        self,
        tokenizer: AutoTokenizer,
        tokenizer_kwargs: Dict[str, Any] = None,
        template: Union[str, Path, Dict] = None,
    ):
        """
        Initializes a DatasetPreprocessor with a tokenizer and an optional prompt template.

        Args:
            tokenizer: An instance of a tokenizer class used to preprocess text data.
            tokenizer_kwargs (Dict[str, Any], optional): Extra keyword arguments passed to the tokenizer.
            template: A prompt template, given either as a path to a JSON file or as a dictionary.
        """
        super().__init__()
        self.tokenizer = tokenizer
        self.tokenizer_kwargs = tokenizer_kwargs
        if template is not None:
            if isinstance(template, (str, Path)):
                assert os.path.exists(
                    template
                ), f"Template file not found at {template}"
                with open(template, "r") as f:
                    self.template = json.load(f)
            elif isinstance(template, dict):
                self.template = template
            else:
                raise ValueError(
                    "Template must be a path to a json file or a dictionary"
                )
        else:
            # No template provided; subclasses may assign one later.
            self.template = None
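
The GLUE-specific preprocessors used by the loader below (the `glue_processors` registry) are among the commit's other files but are not shown in this excerpt. As a rough sketch of how a concrete subclass might plug into `DatasetPreprocessor` and `preprocess`, assuming a single-sentence task and template keys named `input_text`/`target_text` (all names here are illustrative, not the commit's actual code):

# Illustrative sketch only, not part of this diff: a hypothetical preprocessor
# for a single-sentence GLUE task. Template keys and column names are assumptions.
class ExampleSentencePreprocessor(DatasetPreprocessor):
    def __call__(self, examples):
        # Render each raw example into prompt/target strings via the template.
        input_text = [
            self.template["input_text"].format(sentence=sentence)
            for sentence in examples["sentence"]
        ]
        target_text = [str(label) for label in examples["label"]]
        # Tokenize both sides with the shared `preprocess` helper above.
        return preprocess(
            self.tokenizer,
            input_text,
            target_text,
            tokenizer_kwargs=self.tokenizer_kwargs,
        )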
fusion_bench/tasks/flan_t5_text_generation/glue_load_dataset.py (51 additions, 0 deletions)
@@ -0,0 +1,51 @@
import logging
import os
from typing import Optional

from datasets import load_dataset, load_from_disk

from fusion_bench.utils import timeit_context

from .glue_preprocessors import glue_processors
from .glue_prompt_templates import glue_prompt_templates

log = logging.getLogger(__name__)


def _load_glue_dataset(name, tokenizer):
    dataset = load_dataset("glue", name)
    preprocessor = glue_processors[name](
        template=glue_prompt_templates[name],
        tokenizer=tokenizer,
        tokenizer_kwargs={
            "padding": "max_length",
            "truncation": True,
            "return_tensors": "pt",
        },
    )
    # Map every split into tokenized text-to-text examples, dropping the raw columns.
    dataset = dataset.map(
        preprocessor,
        batched=True,
        remove_columns=dataset["train"].column_names,
        num_proc=1,
    )
    return dataset


def load_glue_dataset(name, tokenizer, cache_dir: Optional[str]):
    """Load a GLUE task as a tokenized text-to-text dataset, caching the processed result on disk when `cache_dir` is given."""
    with timeit_context(f"Loading {name} dataset"):
        if cache_dir is not None:
            if not os.path.exists(cache_dir):
                os.makedirs(cache_dir)
            cache_path = os.path.join(
                cache_dir, "flan-t5", f"_load_{name}_dataset_cached"
            )
            if os.path.exists(cache_path):
                return load_from_disk(cache_path)
            else:
                dataset = _load_glue_dataset(name, tokenizer)
                log.info(f"Saving {name} dataset to {cache_path}")
                dataset.save_to_disk(cache_path)
                return dataset
        else:
            return _load_glue_dataset(name, tokenizer)
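
For reference, a minimal usage sketch of `load_glue_dataset`; the checkpoint name, task name, and cache directory are placeholders, and the task must be one registered in `glue_processors`:

# Usage sketch (placeholder names, not part of the commit): tokenize a GLUE
# task for Flan-T5 and cache the processed dataset on disk.
from transformers import AutoTokenizer

from fusion_bench.tasks.flan_t5_text_generation.glue_load_dataset import (
    load_glue_dataset,
)

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
glue_datasets = load_glue_dataset("rte", tokenizer, cache_dir="outputs/cache")

# Each split now carries tokenized input_ids / attention_mask / labels columns
# ready for seq2seq fine-tuning or evaluation.
print(glue_datasets["train"].column_names)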