Support for nanotron (#11)
Support for Nanotron models

---

Co-authored-by: Nathan Habib <[email protected]>
Co-authored-by: [email protected] <[email protected]>
Co-authored-by: Clémentine Fourrier <[email protected]>
4 people authored Feb 7, 2024
1 parent 1e837a9 commit 8aaf51c
Showing 16 changed files with 487 additions and 471 deletions.
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -37,4 +37,5 @@ repos:
rev: 'v0.1.6'
hooks:
- id: ruff
args: ['--fix']
- id: ruff-format
8 changes: 7 additions & 1 deletion README.md
@@ -11,7 +11,7 @@ It is still an early, internal version - it should be nice to use but don't expe
In case of problems or question, feel free to open an issue!

## How to install and use
### Requirements
### Installation
0) Create your virtual environment using virtualenv or conda, depending on your preferences. We require Python 3.10.

1) Clone the repository with `git clone`, then `cd lighteval-harness` and run `pip install -e .`. Once the dependencies are installed, `cd src`.
@@ -22,6 +22,12 @@ Optional:

2) Add your user token to the environment variable `HUGGING_FACE_HUB_TOKEN` if you want to push your results to the hub

To set up and run linting:
```bash
pre-commit install
pre-commit run --config .pre-commit-config.yaml --all-files
```


### Usage
- Launching on CPU
3 changes: 1 addition & 2 deletions pyproject.toml
@@ -82,8 +82,7 @@ optimum = ["optimum==1.12.0"]
quantization = ["bitsandbytes>=0.41.0", "auto-gptq>=0.4.2"]
adapters = ["peft==0.3.0"]
nanotron = [
"nanotron@git+https://github.com/huggingface/nanotron@8c1a49588d0745a6404644a86547c2dd6a63640e",
"brrr@git+https://github.com/huggingface/brrr@e8a503e2ec08b34eed7522d331aec3bee8cdd29b",
"nanotron@git+https://github.com/huggingface/nanotron",
"tensorboardX"
]

92 changes: 92 additions & 0 deletions run_evals_accelerate.py
@@ -0,0 +1,92 @@
import argparse

from lighteval.main_accelerate import CACHE_DIR, main


def get_parser():
parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group(required=True)
task_type_group = parser.add_mutually_exclusive_group(required=True)

# Model type 1) Base model
weight_type_group = parser.add_mutually_exclusive_group()
weight_type_group.add_argument(
"--delta_weights",
action="store_true",
default=False,
help="set to True of your model should be merged with a base model, also need to provide the base model name",
)
weight_type_group.add_argument(
"--adapter_weights",
action="store_true",
default=False,
help="set to True of your model has been trained with peft, also need to provide the base model name",
)
parser.add_argument(
"--base_model", type=str, default=None, help="name of the base model to be used for delta or adapter weights"
)

task_type_group.add_argument("--model_args")
parser.add_argument("--model_dtype", type=str, default=None)
parser.add_argument(
"--multichoice_continuations_start_space",
action="store_true",
help="Whether to force multiple choice continuations to start with a space",
)
parser.add_argument(
"--no_multichoice_continuations_start_space",
action="store_true",
help="Whether to force multiple choice continuations to not start with a space",
)
parser.add_argument("--use_chat_template", default=False, action="store_true")
# Model type 2) TGI
task_type_group.add_argument("--inference_server_address", type=str)
parser.add_argument("--inference_server_auth", type=str, default=None)
# Model type 3) Inference endpoints
task_type_group.add_argument("--endpoint_model_name", type=str)
parser.add_argument("--accelerator", type=str, default=None)
parser.add_argument("--vendor", type=str, default=None)
parser.add_argument("--region", type=str, default=None)
parser.add_argument("--instance_size", type=str, default=None)
parser.add_argument("--instance_type", type=str, default=None)
parser.add_argument("--reuse_existing", default=False, action="store_true")
# Debug
parser.add_argument("--max_samples", type=int, default=None)
parser.add_argument("--job_id", type=str, help="Optional Job ID for future reference", default="")
# Saving
parser.add_argument("--push_results_to_hub", default=False, action="store_true")
parser.add_argument("--save_details", action="store_true")
parser.add_argument("--push_details_to_hub", default=False, action="store_true")
parser.add_argument(
"--public_run", default=False, action="store_true", help="Push results and details to a public repo"
)
parser.add_argument("--cache_dir", type=str, default=CACHE_DIR)
parser.add_argument(
"--results_org",
type=str,
help="Hub organisation where you want to store the results. Your current token must have write access to it",
)
# Common parameters
parser.add_argument("--output_dir", required=True)
parser.add_argument("--override_batch_size", type=int, default=-1)
parser.add_argument("--dataset_loading_processes", type=int, default=1)
parser.add_argument(
"--custom_tasks_file",
type=str,
default=None,
help="Path to a file with custom tasks (a TASK list of dict and potentially prompt formating functions)",
)
group.add_argument(
"--tasks",
type=str,
default=None,
help="Id of a task, e.g. 'original|mmlu:abstract_algebra|5' or path to a texte file with a list of tasks",
)
parser.add_argument("--num_fewshot_seeds", type=int, default=1, help="Number of trials the few shots")
return parser


if __name__ == "__main__":
parser = get_parser()
args, unknowns = parser.parse_known_args()
main(args)
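For reference, a minimal usage sketch of the parser above (illustration only; it assumes the repository root is on `sys.path` so that `run_evals_accelerate` is importable, and uses placeholder model/task values). Exactly one backend flag (`--model_args`, `--inference_server_address` or `--endpoint_model_name`) must be given, together with `--tasks` and `--output_dir`:

```python
# Minimal sketch: build the parser and parse a sample argument list.
# Assumes the repository root is on sys.path; values are placeholders.
from run_evals_accelerate import get_parser

parser = get_parser()
args, unknown = parser.parse_known_args(
    [
        "--model_args", "pretrained=gpt2",
        "--tasks", "original|mmlu:abstract_algebra|5",
        "--output_dir", "./evals",
        "--override_batch_size", "1",
    ]
)
print(args.model_args, args.tasks, args.output_dir)
```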
33 changes: 33 additions & 0 deletions run_evals_nanotron.py
@@ -0,0 +1,33 @@
# flake8: noqa: C901
import argparse

from lighteval.main_nanotron import main


def get_parser():
parser = argparse.ArgumentParser()
parser.add_argument(
"--checkpoint-config-path",
type=str,
required=True,
help="Path to the brr checkpoint YAML or python config file, potentially on S3",
)
parser.add_argument(
"--lighteval-override",
type=str,
help="Path to an optional YAML or python Lighteval config to override part of the checkpoint Lighteval config",
)
parser.add_argument(
"--cache-dir",
type=str,
default="",
help="Cache directory",
)

return parser


if __name__ == "__main__":
parser = get_parser()
args, unknowns = parser.parse_known_args()
main(args.checkpoint_config_path, args.lighteval_override, args.cache_dir)
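A similar sketch for the nanotron entry point (the checkpoint path is a placeholder; `main()` itself needs a real nanotron checkpoint to do anything useful):

```python
# Minimal sketch: parse the nanotron launcher arguments.
# The checkpoint path below is a placeholder.
from run_evals_nanotron import get_parser

parser = get_parser()
args, _ = parser.parse_known_args(
    ["--checkpoint-config-path", "checkpoints/10000/config.yaml"]
)
# --lighteval-override defaults to None and --cache-dir to ""
print(args.checkpoint_config_path, args.lighteval_override, args.cache_dir)
```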
31 changes: 31 additions & 0 deletions src/lighteval/data.py
@@ -198,6 +198,37 @@ def _sorting_criteria(self, request: GreedyUntilRequest | GreedyUntilWithLogitsR
return -(len(toks) + gen_length)


class GenerativeTaskDatasetNanotron(DynamicBatchDataset):
def __getitem__(self, index) -> Request:
"""
Get an item from the dataset depending on the split we are currently in.
For instance, if we are in split 0, we will get the item at index 0, if
we are in split 1, we will get the item at index self.split_size, etc.
Used for dynamic batching.
Args:
index (int): The index of the item.
Returns:
Any: The item at the specified index.
"""
return index, self.sorted_data[index + self.split_start]

def _sorting_criteria(self, request) -> int:
"""
Collate function for generating batches.
Args:
x (Any): The input data.
Returns:
Any: The collated data.
"""
toks = request.tokenized_context
gen_length = request.generation_size
return -(len(toks) + gen_length)
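
The sorting criterion above orders requests by the negative of (prompt length + generation budget), so the longest requests are scheduled first and dynamic batching can size its batches for the worst case early on. A small self-contained illustration (using a stand-in request type, not the real request classes):

```python
# Illustration only: sort stand-in requests the same way _sorting_criteria does.
from dataclasses import dataclass


@dataclass
class FakeRequest:  # stand-in for a GreedyUntilRequest, for illustration
    tokenized_context: list
    generation_size: int


requests = [
    FakeRequest(list(range(10)), 32),
    FakeRequest(list(range(500)), 128),
    FakeRequest(list(range(80)), 64),
]
ordered = sorted(requests, key=lambda r: -(len(r.tokenized_context) + r.generation_size))
print([len(r.tokenized_context) for r in ordered])  # [500, 80, 10]
```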


class GenDistributedSampler(DistributedSampler):
"""A distributed sampler that copy the last element only when drop_last is False so we keep a small padding in the batches
as our samples are sorted by length.
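As a rough illustration of the padding behaviour the docstring describes (not the actual sampler implementation): when drop_last is False, the last index is repeated until every rank receives the same number of samples.

```python
# Illustration only: pad the index list so its length is divisible by the
# number of replicas, repeating the last (shortest) sample.
indices = list(range(10))   # dataset indices, already sorted by length
num_replicas = 4
remainder = len(indices) % num_replicas
if remainder:
    indices += [indices[-1]] * (num_replicas - remainder)
print(indices)  # [0, 1, ..., 9, 9, 9] -> length 12, divisible by 4
```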
4 changes: 2 additions & 2 deletions src/lighteval/evaluator.py
@@ -5,6 +5,8 @@
import copy
from typing import Dict, Union

from pytablewriter import LatexTableWriter, MarkdownTableWriter

from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.logging.hierarchical_logger import hlog
from lighteval.models.base_model import BaseModel
@@ -99,8 +101,6 @@ def evaluate( # noqa: C901

def make_results_table(result_dict):
"""Generate table of results."""
from pytablewriter import LatexTableWriter, MarkdownTableWriter

md_writer = MarkdownTableWriter()
latex_writer = LatexTableWriter()
md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
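`make_results_table` relies on the `pytablewriter` writers now imported at module level; a minimal sketch of that API, with made-up values for illustration:

```python
# Illustration only: render a results table with pytablewriter.
# The row values below are placeholders, not real evaluation results.
from pytablewriter import MarkdownTableWriter

md_writer = MarkdownTableWriter()
md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
md_writer.value_matrix = [
    ["original|mmlu:abstract_algebra", 0, "acc", 0.25, "±", 0.01],
]
print(md_writer.dumps())  # prints a Markdown-formatted table
```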
150 changes: 74 additions & 76 deletions src/lighteval/logging/evaluation_tracker.py
@@ -18,13 +18,11 @@
TaskConfigLogger,
VersionsLogger,
)
from lighteval.utils import is_nanotron_available
from lighteval.utils import is_nanotron_available, obj_to_markdown


if is_nanotron_available():
from brrr.config import BrrrConfig
from brrr.experiment_loggers import obj_to_markdown
from nanotron.config import get_config_from_dict
from nanotron.config import Config, get_config_from_dict


class EnhancedJSONEncoder(json.JSONEncoder):
@@ -104,81 +102,81 @@ def save(
"""
hlog("Saving experiment tracker")
try:
# try:
date_id = datetime.now().isoformat().replace(":", "-")

output_dir_results = Path(output_dir) / "results" / self.general_config_logger.model_name
output_dir_details = Path(output_dir) / "details" / self.general_config_logger.model_name
output_dir_details_sub_folder = output_dir_details / date_id
output_dir_results.mkdir(parents=True, exist_ok=True)
output_dir_details_sub_folder.mkdir(parents=True, exist_ok=True)

output_results_file = output_dir_results / f"results_{date_id}.json"
output_results_in_details_file = output_dir_details / f"results_{date_id}.json"

hlog(f"Saving results to {output_results_file} and {output_results_in_details_file}")

to_dump = {
    "config_general": asdict(self.general_config_logger),
    "results": self.metrics_logger.metric_aggregated,
    "versions": self.versions_logger.versions,
    "config_tasks": self.task_config_logger.tasks_configs,
    "summary_tasks": self.details_logger.compiled_details,
    "summary_general": asdict(self.details_logger.compiled_details_over_all_tasks),
}
dumped = json.dumps(to_dump, cls=EnhancedJSONEncoder, indent=2)

with open(output_results_file, "w") as f:
    f.write(dumped)

with open(output_results_in_details_file, "w") as f:
    f.write(dumped)

for task_name, task_details in self.details_logger.details.items():
    output_file_details = output_dir_details_sub_folder / f"details_{task_name}_{date_id}.parquet"
    # Create a dataset from the dictionary
    try:
        dataset = Dataset.from_list([asdict(detail) for detail in task_details])
    except Exception:
        # We force cast to str to avoid formatting problems for nested objects
        dataset = Dataset.from_list(
            [{k: str(v) for k, v in asdict(detail).items()} for detail in task_details]
        )

    # We don't keep 'id' around if it's there
    column_names = dataset.column_names
    if "id" in dataset.column_names:
        column_names = [t for t in dataset.column_names if t != "id"]

    # Sort column names to make it easier later
    dataset = dataset.select_columns(sorted(column_names))
    # Save the dataset to a Parquet file
    dataset.to_parquet(output_file_details.as_posix())

if push_results_to_hub:
    self.api.upload_folder(
        repo_id=self.hub_results_repo if public else self.hub_private_results_repo,
        folder_path=output_dir_results,
        path_in_repo=self.general_config_logger.model_name,
        repo_type="dataset",
        commit_message=f"Updating model {self.general_config_logger.model_name}",
    )

if push_details_to_hub:
    self.details_to_hub(
        model_name=self.general_config_logger.model_name,
        results_file_path=output_results_in_details_file,
        details_folder_path=output_dir_details_sub_folder,
        push_as_public=public,
    )

if push_results_to_tensorboard:
    self.push_results_to_tensorboard(
        results=self.metrics_logger.metric_aggregated, details=self.details_logger.details
    )
except Exception as e:
    hlog("WARNING: Could not save results")
    hlog(repr(e))
# except Exception as e:
#     hlog("WARNING: Could not save results")
#     hlog(repr(e))

def generate_final_dict(self) -> dict:
"""Aggregates and returns all the logger's experiment information in a dictionary.
@@ -487,7 +485,7 @@ def push_results_to_tensorboard( # noqa: C901
if not is_nanotron_available():
hlog_warn("You cannot push results to tensorboard with having nanotron installed. Skipping")
return
config: BrrrConfig = get_config_from_dict(self.general_config_logger.config, config_class=BrrrConfig)
config: Config = get_config_from_dict(self.general_config_logger.config, config_class=Config)
lighteval_config = config.lighteval
try:
global_step = config.general.step
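The `save` method above serializes the aggregated results with `json.dumps(to_dump, cls=EnhancedJSONEncoder, indent=2)`. The encoder's body is not part of this diff; the sketch below shows one way such an encoder can behave (an assumption for illustration, not the actual implementation):

```python
# Illustration only: a JSON encoder that handles dataclasses and falls back
# to str, the kind of behaviour the results dump above relies on.
import json
from dataclasses import asdict, is_dataclass


class EnhancedJSONEncoderSketch(json.JSONEncoder):
    """Sketch; the real EnhancedJSONEncoder in this file may differ."""

    def default(self, o):
        if is_dataclass(o):
            return asdict(o)
        return str(o)


print(json.dumps({"results": object()}, cls=EnhancedJSONEncoderSketch, indent=2))
```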