Merge pull request #4 from zenml-io/feature/OSSK-521-peft-template
PEFT template
avishniakov authored May 23, 2024
2 parents 2c67d94 + 1156f4b commit 0dc4bfd
Showing 79 changed files with 955 additions and 11,357 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -160,3 +160,5 @@ cython_debug/
#.idea/

.ruff_cache/
.local/
.vscode/
53 changes: 30 additions & 23 deletions copier.yaml
@@ -48,39 +48,49 @@ email:
product_name:
type: str
help: The technical name of the data product you are building
default: llm_lora
default: llm-peft
validator: >-
{% if not (product_name | regex_search('^[a-z][a-z0-9\_]*$')) %}
The product name can only contain alphanumeric characters and underscores and must start with a character.
{% if not (product_name | regex_search('^[a-z][-a-z0-9]*$')) %}
The product name can only contain lowercase alphanumeric characters and dashes and must start with a letter.
{% endif %}
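
The new validator only admits lowercase, dash-separated names that start with a letter. A minimal Python sketch of the equivalent check (illustrative only, not part of the template):

```python
import re

# Mirrors the template's new validator: a lowercase letter first,
# then lowercase letters, digits, or dashes.
PRODUCT_NAME_RE = re.compile(r"^[a-z][-a-z0-9]*$")

for name in ["llm-peft", "llm_lora", "7b-model"]:
    verdict = "valid" if PRODUCT_NAME_RE.match(name) else "invalid"
    print(f"{name!r}: {verdict}")  # llm-peft is valid; the others are not
```
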
model_repository:
type: str
help: |
Huggingface repository of the model to finetune. Check out the Lit-GPT docs for the currently available models
here https://github.com/Lightning-AI/litgpt?tab=readme-ov-file#-lit-gpt-1
default: mistralai/Mistral-7B-Instruct-v0.1
from_safetensors:
type: bool
help: |
Whether the Huggingface model repository stores the model weights as safetensors. Check out the Lit-GPT docs
to find the answer for your current selected model {{ model_repository }}:
https://github.com/Lightning-AI/litgpt?tab=readme-ov-file#-lit-gpt-1
default: false
Huggingface repository of the model to finetune.
default: microsoft/phi-2
steps_of_finetuning:
type: int
help: The number of steps for the finetuning job.
default: 300
cuda_version:
type: str
help: The available CUDA version (only relevant when using a remote orchestrator).
choices:
CUDA 11.8: cuda11.8
CUDA 12.1: cuda12.1
default: cuda11.8
huggingface_merged_model_repository:
system_prompt:
type: str
help: The huggingface repository to which the finetuned model should be pushed [Optional]
default: ""
huggingface_adapter_model_repository:
help: |
The system prompt to be used for the finetuning job.
default: |
Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']
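
Concretely, this prompt asks the model to compress a sentence into a single annotated function call. A hypothetical input/output pair in the style of the default `gem/viggo` dataset (invented here for illustration, not an actual dataset row):

```python
# Hypothetical example of the task the system prompt describes
# (not taken from the dataset).
target_sentence = (
    "BioShock is an excellent first-person shooter from 2007 "
    "that you can play on Steam."
)
meaning_representation = (
    "give_opinion(name[BioShock], release_year[2007], rating[excellent], "
    "genres[shooter], player_perspective[first person], available_on_steam[yes])"
)
```
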
dataset_name:
type: str
help: The huggingface repository to which the finetuned adapter should be pushed [Optional]
default: ""
help: |
The dataset to be used for the finetuning job.
default: gem/viggo
step_operator:
type: str
help: The operator to be used for the finetuning and evaluation jobs.
default: gcp_a100
bf16:
type: bool
help: |
Whether to use bf16 for the finetuning job.
default: true
zenml_server_url:
type: str
help: The URL of the ZenML server [Optional]
@@ -128,10 +138,7 @@ _tasks:
{%- if zenml_server_url %}
echo " zenml connect --url {{ zenml_server_url }}"
{%- endif %}
{%- if (huggingface_merged_model_repository or huggingface_adapter_model_repository) %}
echo " zenml secret create huggingface_credentials --token=<HF_TOKEN>"
{%- endif %}
echo " python run.py --finetuning-pipeline --config finetune-alpaca.yaml"
echo " python run.py --config orchestrator_finetune.yaml"
echo "Next, you should take a look at the '{{ _copier_conf.dst_path }}/README.md' file in the generated project."
echo "Happy coding!"
Binary file modified template/.assets/model.png
Binary file added template/.assets/pipeline.png
8 changes: 2 additions & 6 deletions template/.dockerignore
@@ -1,9 +1,5 @@
*
!/materializers/**
!/pipelines/**
!/steps/**
!/materializers/**
!/evaluate/**
!/finetune/**
!/generate/**
!/lit_gpt/**
!/scripts/**
!/utils/**
105 changes: 32 additions & 73 deletions template/README.md
@@ -1,6 +1,6 @@
# ☮️ Fine-tuning open source LLMs using MLOps pipelines
# ☮️ Fine-tuning open source LLMs using MLOps pipelines with PEFT

Welcome to your newly generated "{{project_name}}" project! This is
Welcome to your newly generated "ZenML LLM PEFT Finetuning project" project! This is
a great way to get hands-on with ZenML using a production-like template.
The project contains a collection of ZenML steps, pipelines and other artifacts
and useful resources that can serve as a solid starting point for finetuning open-source LLMs using ZenML.
@@ -15,13 +15,13 @@ Using these pipelines, we can run the data-preparation and model finetuning with
<br/>
</div>

## :earth_americas: Inspiration and Credit
## 🌎 Inspiration and Credit

This project heavily relies on the [Lit-GPT project](https://github.com/Lightning-AI/litgpt) of the amazing people at Lightning AI. We used [this blogpost](https://lightning.ai/pages/community/lora-insights/#toc14) to get started with LoRA and QLoRA and modified the commands they recommend to make them work using ZenML.
This project heavily relies on the [PEFT project](https://huggingface.co/docs/peft/en/index) by the amazing people at Hugging Face and the [`{{ model_repository }}`](https://huggingface.co/{{ model_repository }}) model from the amazing people at {{ model_repository.split("/") | first }}.

## 🏃 How to run

In this project we provide a few predefined configuration files for finetuning models on the [Alpaca](https://huggingface.co/datasets/tatsu-lab/alpaca) dataset. Before we're able to run any pipeline, we need to set up our environment as follows:
In this project, we provide a predefined configuration file to finetune models on the [{{ dataset_name }}](https://huggingface.co/datasets/{{ dataset_name }}) dataset. Before we're able to run any pipeline, we need to set up our environment as follows:

```bash
# Set up a Python virtual environment, if you haven't already
@@ -30,60 +30,32 @@ source .venv/bin/activate

# Install requirements
pip install -r requirements.txt

{%- if zenml_server_url %}
# Connect to your remote ZenML server
zenml connect --url {{ zenml_server_url }}
{%- endif %}
{%- if (huggingface_merged_model_repository or huggingface_adapter_model_repository) %}
# Create a secret with your Huggingface token to push the finetuned model/adapter
zenml secret create huggingface_credentials --token=<HF_TOKEN>
{%- endif %}
```
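
Inside a step, a token stored this way can be read back through the ZenML client. A minimal sketch, assuming the secret name and key used above:

```python
from zenml.client import Client

# Fetch the Huggingface token from the `huggingface_credentials`
# secret created above; the key `token` matches the CLI call.
hf_token = Client().get_secret("huggingface_credentials").secret_values["token"]
```
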
### Combined feature engineering and finetuning pipeline
The easiest way to get started with just a single command is to run the finetuning pipeline with the `finetune-alpaca.yaml` configuration file, which will do both feature engineering and finetuning:
```shell
python run.py --finetuning-pipeline --config finetune-alpaca.yaml
```
When running the pipeline like this, the trained adapter will be stored in the ZenML artifact store. You can optionally upload the adapter, the merged model or both by specifying the `adapter_output_repo` and `merged_output_repo` parameters in the configuration file.
### Evaluation pipeline
Before running this pipeline, you will need to fill in the `adapter_repo` in the `eval.yaml` configuration file. This should point to a huggingface repository that contains the finetuned adapter you got by running the finetuning pipeline.
```shell
python run.py --eval-pipeline --config eval.yaml
```

### Merging pipeline
### 👷 Combined feature engineering and finetuning pipeline

In case you have trained an adapter using the finetuning pipeline, you can merge it with the base model by filling in the `adapter_repo` and `output_repo` parameters in the `merge.yaml` file, and then running:
The easiest way to get started with just a single command is to run the finetuning pipeline with the `orchestrator_finetune.yaml` configuration file, which will do data preparation, model finetuning, evaluation with [Rouge](https://huggingface.co/spaces/evaluate-metric/rouge) and promotion:

```shell
python run.py --merge-pipeline --config merge.yaml
python run.py --config orchestrator_finetune.yaml
```

### Feature Engineering followed by Finetuning
If you want to finetune your model on a different dataset, you can do so by running the feature engineering pipeline followed by the finetuning pipeline. To define your dataset, take a look at the `scripts/prepare_*` scripts and set the dataset name in the `feature-alpaca.yaml` config file.
When running the pipeline like this, the trained model will be stored in the ZenML artifact store.

```shell
python run.py --feature-pipeline --config feature-alpaca.yaml
python run.py --finetuning-pipeline --config finetune-from-dataset.yaml
```
<div align="center">
<br/>
<a href="https://cloud.zenml.io">
<img alt="Model version metadata" src=".assets/pipeline.png">
</a>
<br/>
</div>
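
Because the trained model lands in the artifact store, it can be pulled back later with the ZenML Python client. A minimal sketch; the pipeline and step names below are assumptions, so check the generated project for the real ones:

```python
from zenml.client import Client

# Load the finetuned model from the latest run of the training
# pipeline. `llm_peft_full_finetune` and `finetune` are assumed
# names -- adjust them to match the generated project.
run = Client().get_pipeline("llm_peft_full_finetune").last_run
model = run.steps["finetune"].output.load()
```
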

## ☁️ Running with a remote stack
## ☁️ Running with a step operator in the stack

To finetune an LLM on remote infrastructure, you can either use a remote orchestrator or a remote step operator. Follow these steps to set up a complete remote stack:
- Register the [orchestrator](https://docs.zenml.io/stacks-and-components/component-guide/orchestrators) (or [step operator](https://docs.zenml.io/stacks-and-components/component-guide/step-operators)) and make sure to configure it so that the finetuning step has access to a GPU with at least 24GB of VRAM. Check out our docs for more [details](https://docs.zenml.io/stacks-and-components/component-guide).
- To access GPUs with this amount of VRAM, you might need to increase your GPU quota ([AWS](https://docs.aws.amazon.com/servicequotas/latest/userguide/request-quota-increase.html), [GCP](https://console.cloud.google.com/iam-admin/quotas), [Azure](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-manage-quotas?view=azureml-api-2#request-quota-and-limit-increases)).
- The GPU instance that your finetuning will be running on will have CUDA drivers of a specific version installed. If that CUDA version is not compatible with the one provided by the default Docker image of the finetuning pipeline, you will need to modify it in the configuration file. See [here](https://hub.docker.com/r/pytorch/pytorch/tags) for a list of available PyTorch images; a sketch of how to pin a different base image follows the stack registration command below.
- If you're running out of memory, you can experiment with quantized LoRA (QLoRA) by setting a different value for the `quantize` parameter in the configuration, or reduce the `global_batch_size`/`micro_batch_size`.
- Register a remote [artifact store](https://docs.zenml.io/stacks-and-components/component-guide/artifact-stores) and [container registry](https://docs.zenml.io/stacks-and-components/component-guide/container-registries).
- Register a stack with all these components
```shell
@@ -93,43 +65,30 @@
[-s <STEP_OPERATOR_NAME>]
```
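
If the default image's CUDA version clashes with the drivers on your GPU instance (see the note above), one way to pin a compatible base image is through ZenML's `DockerSettings`. A minimal sketch; the image tag is an assumption, so pick one that matches your drivers:

```python
from zenml import pipeline
from zenml.config import DockerSettings

# Pin a PyTorch base image whose CUDA version matches the GPU
# instance's drivers; the tag below is only an example.
docker_settings = DockerSettings(
    parent_image="pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime",
    requirements="requirements.txt",
)

@pipeline(settings={"docker": docker_settings})
def llm_peft_full_finetune():
    ...
```
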

## 💾 Running with custom data
To finetune a model with your custom data, you will need to convert it to a CSV file with the columns described
[here](https://github.com/Lightning-AI/litgpt/blob/main/tutorials/prepare_dataset.md#preparing-custom-datasets-from-a-csv-file).
Next, update the `configs/feature-custom.yaml` file and set the value of the `csv_path` parameter to that CSV file.
With all that in place, you can now run the feature engineering pipeline to convert your CSV into the correct format for training and then run the finetuning pipeline as follows:
```shell
python run.py --feature-pipeline --config feature-custom.yaml
python run.py --finetuning-pipeline --config finetune-from-dataset.yaml
```
## 📜 Project Structure

The project loosely follows [the recommended ZenML project structure](https://docs.zenml.io/user-guide/starter-guide/follow-best-practices):

```
.
├── configs # pipeline configuration files
│ ├── eval.yaml # configuration for the evaluation pipeline
│ ├── feature-alpaca.yaml # configuration for the feature engineering pipeline
│ ├── feature-custom.yaml # configuration for the feature engineering pipeline
│ ├── finetune-alpaca.yaml # configuration for the finetuning pipeline
│ ├── finetune-from-dataset.yaml # configuration for the finetuning pipeline
│ └── merge.yaml # configuration for the merging pipeline
│ ├── orchestrator_finetune.yaml # default local or remote orchestrator
│ └── remote_finetune.yaml # default step operator configuration
├── materializers
│ └── directory_materializer.py # custom materializer to push whole directories to the artifact store and back
├── pipelines # `zenml.pipeline` implementations
│ ├── evaluate.py # Evaluation pipeline
│ ├── feature_engineering.py # Feature engineering pipeline
│ ├── finetuning.py # Finetuning pipeline
│ └── merge.py # Merging pipeline
│ └── train.py # Finetuning and evaluation pipeline
├── steps # logically grouped `zenml.steps` implementations
│ ├── evaluate.py # evaluate model performance
│ ├── feature_engineering.py # preprocess data
│ ├── finetune.py # finetune a model
│ ├── merge.py # merge model and adapter
│ ├── params.py # shared parameters for steps
│ └── utils.py # utility functions
│ ├── evaluate_model.py # evaluate base and finetuned models using Rouge metrics
│ ├── finetune.py # finetune the base model
│ ├── prepare_datasets.py # load and tokenize dataset
│ └── promote.py # promote good models to target environment
├── utils # utility functions
│ ├── callbacks.py # custom callbacks
│ ├── cuda.py # helpers for CUDA
│ ├── loaders.py # loaders for models and data
│ ├── logging.py # logging helpers
│ └── tokenizer.py # load and tokenize
├── .dockerignore
├── README.md # this file
├── requirements.txt # extra Python dependencies
25 changes: 0 additions & 25 deletions template/configs/eval.yaml

This file was deleted.

16 changes: 0 additions & 16 deletions template/configs/feature-alpaca.yaml

This file was deleted.

19 changes: 0 additions & 19 deletions template/configs/feature-custom.yaml

This file was deleted.

43 changes: 0 additions & 43 deletions template/configs/finetune-alpaca.yaml

This file was deleted.
