diff --git a/tutorials/finetune-gemma-2b-on-l4/Dockerfile b/tutorials/finetune-gemma-2b-on-l4/Dockerfile new file mode 100644 index 000000000..9fb2761d8 --- /dev/null +++ b/tutorials/finetune-gemma-2b-on-l4/Dockerfile @@ -0,0 +1,13 @@ +FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04 + +RUN apt-get update && \ + apt-get -y --no-install-recommends install python3-dev gcc python3-pip git && \ + rm -rf /var/lib/apt/lists/* + +RUN pip3 install --no-cache-dir accelerate bitsandbytes datasets transformers peft trl torch + +COPY finetune.py /finetune.py + +ENV PYTHONUNBUFFERED 1 + +CMD python3 /finetune.py --device cuda diff --git a/tutorials/finetune-gemma-2b-on-l4/finetune-gemma-on-gke.ipynb b/tutorials/finetune-gemma-2b-on-l4/finetune-gemma-on-gke.ipynb new file mode 100644 index 000000000..26cbc8c04 --- /dev/null +++ b/tutorials/finetune-gemma-2b-on-l4/finetune-gemma-on-gke.ipynb @@ -0,0 +1,943 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "executionInfo": { + "elapsed": 275, + "status": "ok", + "timestamp": 1710450166603, + "user": { + "displayName": "", + "userId": "" + }, + "user_tz": 0 + }, + "id": "7d9bbf86da5e" + }, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Run" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "99c1c3fc2ca5" + }, + "source": [ + "# Finetune Gemma to GKE using GPU" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3de7470326a2" + }, + "source": [ + "## Overview\n", + "\n", + "This notebook demonstrates downloading and fine tuning Gemma, open models from Google DeepMind using Pytorch and Hugging Face Libraries In this notebook we will finetune and publish Gemma model on Hugging Face. In this guide we specifically use L4 GPUs but this guide should also work for A100(40 GB), A100(80 GB), H100(80 GB) GPUs.\n", + "\n", + "\n", + "### Objective\n", + "\n", + "Finetune and Publish Gemma with Transformers and Lora on GPUs.\n", + "\n", + "### GPUs\n", + "\n", + "GPUs let you accelerate specific workloads running on your nodes such as machine learning and data processing. 
GKE provides a range of machine type options for node configuration, including machine types with NVIDIA H100, L4, and A100 GPUs.\n", + "\n", + "Before you use GPUs in GKE, we recommend that you complete the following learning path:\n", + "\n", + "Learn about [current GPU version availability](https://cloud.google.com/compute/docs/gpus)\n", + "\n", + "Learn about [GPUs in GKE](https://cloud.google.com/kubernetes-engine/docs/concepts/gpus)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "264c07757582" + }, + "source": [ + "## Before you begin" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "35Dvbzb0hH3-" + }, + "source": [ + "### Configure Environment" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6c460088b873" + }, + "source": [ + "Set the following variables for the experiment environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "executionInfo": { + "elapsed": 276, + "status": "ok", + "timestamp": 1710459545977, + "user": { + "displayName": "", + "userId": "" + }, + "user_tz": 0 + }, + "id": "855d6b96f291" + }, + "outputs": [], + "source": [ + "# The HuggingFace token used to download models.\n", + "# Make sure Token has Write Permission\n", + "HF_TOKEN = \"\" # @param {type:\"string\"}\n", + "\n", + "# The size of the model to launch\n", + "MODEL_SIZE = \"2b\" # @param [\"2b\", \"7b\"]\n", + "\n", + "# Cloud project id.\n", + "PROJECT_ID = \"\" # @param {type:\"string\"}\n", + "\n", + "# Region for launching clusters.\n", + "REGION = \"us-central1\" # @param {type:\"string\"}\n", + "\n", + "# The cluster name to create\n", + "CLUSTER_NAME = \"gke-gemma-cluster\" # @param {type:\"string\"}\n", + "\n", + "# The number of GPUs to run\n", + "GPU_COUNT = 8" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "klPAnx16cVd7" + }, + "outputs": [], + "source": [ + "! gcloud auth login\n", + "! gcloud config set project \"$PROJECT_ID\"\n", + "! gcloud services enable container.googleapis.com\n", + "\n", + "# Add kubectl to the set of available tools.\n", + "! mkdir -p /tools/google-cloud-sdk/.install\n", + "! gcloud components install kubectl --quiet" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e828eb320337" + }, + "source": [ + "### Create a GKE cluster and a node pool" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PKhdKv1vK9Lg" + }, + "source": [ + "GKE creates the following resources for the model based on the MODEL_SIZE environment variable set above.\n", + "\n", + "- Autopilot cluster\n", + "\n", + "If you already have a cluster, you can skip to `Use an existing GKE cluster` instead." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "12cd25839741" + }, + "outputs": [], + "source": [ + "! 
gcloud container clusters create-auto {CLUSTER_NAME} \\\n", + " --project={PROJECT_ID} \\\n", + " --region={REGION} \\\n", + " --release-channel=rapid \\\n", + " --cluster-version=1.29" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6ydvYk7FLJz_" + }, + "source": [ + "### Use an existing GKE cluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 1488, + "status": "ok", + "timestamp": 1710451982779, + "user": { + "displayName": "", + "userId": "" + }, + "user_tz": 0 + }, + "id": "DmpNpYF-LRut", + "outputId": "84dbdcb1-2daa-4be8-cb26-18bab4848d85" + }, + "outputs": [], + "source": [ + "! gcloud container clusters get-credentials {CLUSTER_NAME} --location {REGION}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2cc825514deb" + }, + "source": [ + "### Create Kubernetes secret for Hugging Face credentials" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rgZfNOSyOY7_" + }, + "source": [ + "Create a Kubernetes Secret that contains the Hugging Face token." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 1057, + "status": "ok", + "timestamp": 1710459552596, + "user": { + "displayName": "", + "userId": "" + }, + "user_tz": 0 + }, + "id": "b42bd4fa2b2d", + "outputId": "7b55174a-02db-41c6-ff77-00dba16d20df" + }, + "outputs": [], + "source": [ + "! kubectl create secret generic hf-secret \\\n", + "--from-literal=hf_api_token={HF_TOKEN} \\\n", + "--dry-run=client -o yaml | kubectl apply -f -" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gOADJkZAmt08" + }, + "source": [ + "## The Dataset\n", + "We use Lora to quickly finetune Gemma with `b-mc2/sql-create-context` dataset.\n", + "\n", + "This dataset has the following structure.\n", + "\n", + "| Answer | Question | Context |\n", + "|-----------------------------------------------------|-------------------------------------------------------------------------------|-------------------------------------------------------------------|\n", + "| SELECT COUNT(*) FROM head WHERE age > 56 | How many heads of the departments are older than 56 ? | CREATE TABLE head (age INTEGER) |\n", + "| SELECT name, born_state, age FROM head ORDER BY age | List the name, born state and age of the heads of departments ordered by age. | CREATE TABLE head (name VARCHAR, born_state VARCHAR, age VARCHAR) |\n", + "\n", + "We will finetune `google/gemma-2b` model to get SQL queries based on questions and context." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5JrfaiA5tAtW" + }, + "source": [ + "## Finetuning Gemma on GKE using GPU with Pytorch\n", + "\n", + "In this demo we will use Pytorch and Huggingface libraries to finetune Gemma. 
We use the `finetune.py` file.\n", + "\n", + "```python\n", + "import os\n", + "import torch\n", + "from datasets import load_dataset, Dataset\n", + "from transformers import (\n", + " AutoModelForCausalLM,\n", + " AutoTokenizer,\n", + " BitsAndBytesConfig,\n", + " HfArgumentParser,\n", + " TrainingArguments,\n", + " pipeline,\n", + " logging,\n", + ")\n", + "from peft import LoraConfig, PeftModel\n", + "\n", + "from trl import SFTTrainer\n", + "\n", + "# The model that you want to train from the Hugging Face hub\n", + "model_name = os.getenv(\"MODEL_NAME\", \"google/gemma-2b\")\n", + "\n", + "# The instruction dataset to use\n", + "dataset_name = \"b-mc2/sql-create-context\"\n", + "\n", + "# Fine-tuned model name\n", + "new_model = os.getenv(\"NEW_MODEL\", \"gemma-2b-sql\")\n", + "\n", + "################################################################################\n", + "# QLoRA parameters\n", + "################################################################################\n", + "\n", + "# LoRA attention dimension\n", + "lora_r = int(os.getenv(\"LORA_R\", \"4\"))\n", + "\n", + "# Alpha parameter for LoRA scaling\n", + "lora_alpha = int(os.getenv(\"LORA_ALPHA\", \"8\"))\n", + "\n", + "# Dropout probability for LoRA layers\n", + "lora_dropout = 0.1\n", + "\n", + "################################################################################\n", + "# bitsandbytes parameters\n", + "################################################################################\n", + "\n", + "# Activate 4-bit precision base model loading\n", + "use_4bit = True\n", + "\n", + "# Compute dtype for 4-bit base models\n", + "bnb_4bit_compute_dtype = \"float16\"\n", + "\n", + "# Quantization type (fp4 or nf4)\n", + "bnb_4bit_quant_type = \"nf4\"\n", + "\n", + "# Activate nested quantization for 4-bit base models (double quantization)\n", + "use_nested_quant = False\n", + "\n", + "################################################################################\n", + "# TrainingArguments parameters\n", + "################################################################################\n", + "\n", + "# Output directory where the model predictions and checkpoints will be stored\n", + "output_dir = \"./results\"\n", + "\n", + "# Number of training epochs\n", + "num_train_epochs = 1\n", + "\n", + "# Enable fp16/bf16 training (set bf16 to True with an A100)\n", + "fp16 = True\n", + "bf16 = False\n", + "\n", + "# Batch size per GPU for training\n", + "per_device_train_batch_size = int(os.getenv(\"TRAIN_BATCH_SIZE\", \"1\"))\n", + "\n", + "# Batch size per GPU for evaluation\n", + "per_device_eval_batch_size = int(os.getenv(\"EVAL_BATCH_SIZE\", \"2\"))\n", + "\n", + "# Number of update steps to accumulate the gradients for\n", + "gradient_accumulation_steps = int(os.getenv(\"GRADIENT_ACCUMULATION_STEPS\", \"1\"))\n", + "\n", + "# Enable gradient checkpointing\n", + "gradient_checkpointing = True\n", + "\n", + "# Maximum gradient normal (gradient clipping)\n", + "max_grad_norm = 0.3\n", + "\n", + "# Initial learning rate (AdamW optimizer)\n", + "learning_rate = 2e-4\n", + "\n", + "# Weight decay to apply to all layers except bias/LayerNorm weights\n", + "weight_decay = 0.001\n", + "\n", + "# Optimizer to use\n", + "optim = \"paged_adamw_32bit\"\n", + "\n", + "# Learning rate schedule\n", + "lr_scheduler_type = \"cosine\"\n", + "\n", + "# Number of training steps (overrides num_train_epochs)\n", + "max_steps = -1\n", + "\n", + "# Ratio of steps for a linear warmup (from 0 to learning rate)\n", + "warmup_ratio = 
0.03\n", + "\n", + "# Group sequences into batches with same length\n", + "# Saves memory and speeds up training considerably\n", + "group_by_length = True\n", + "\n", + "# Save checkpoint every X updates steps\n", + "save_steps = 0\n", + "\n", + "# Log every X updates steps\n", + "logging_steps = int(os.getenv(\"LOGGING_STEPS\", \"50\"))\n", + "\n", + "################################################################################\n", + "# SFT parameters\n", + "################################################################################\n", + "\n", + "# Maximum sequence length to use\n", + "max_seq_length = int(os.getenv(\"MAX_SEQ_LENGTH\", \"512\"))\n", + "\n", + "# Pack multiple short examples in the same input sequence to increase efficiency\n", + "packing = False\n", + "\n", + "# Load the entire model on the GPU 0\n", + "device_map = {'':torch.cuda.current_device()}\n", + "\n", + "# Set limit to a positive number\n", + "limit = int(os.getenv(\"DATASET_LIMIT\", \"5000\"))\n", + "\n", + "dataset = load_dataset(dataset_name, split=\"train\")\n", + "if limit != -1:\n", + " dataset = dataset.shuffle(seed=42).select(range(limit))\n", + "\n", + "\n", + "def transform(data):\n", + " question = data['question']\n", + " context = data['context']\n", + " answer = data['answer']\n", + " template = \"Question: {question}\\nContext: {context}\\nAnswer: {answer}\"\n", + " return {'text': template.format(question=question, context=context, answer=answer)}\n", + "\n", + "\n", + "transformed = dataset.map(transform)\n", + "\n", + "# Load tokenizer and model with QLoRA configuration\n", + "compute_dtype = getattr(torch, bnb_4bit_compute_dtype)\n", + "\n", + "bnb_config = BitsAndBytesConfig(\n", + " load_in_4bit=use_4bit,\n", + " bnb_4bit_quant_type=bnb_4bit_quant_type,\n", + " bnb_4bit_compute_dtype=compute_dtype,\n", + " bnb_4bit_use_double_quant=use_nested_quant,\n", + ")\n", + "\n", + "# Check GPU compatibility with bfloat16\n", + "if compute_dtype == torch.float16 and use_4bit:\n", + " major, _ = torch.cuda.get_device_capability()\n", + " if major >= 8:\n", + " print(\"=\" * 80)\n", + " print(\"Your GPU supports bfloat16\")\n", + " print(\"=\" * 80)\n", + "\n", + "# Load base model\n", + "# model = AutoModelForCausalLM.from_pretrained(\"google/gemma-7b\")\n", + "model = AutoModelForCausalLM.from_pretrained(\n", + " model_name,\n", + " quantization_config=bnb_config,\n", + " device_map=device_map,\n", + " torch_dtype=torch.float16,\n", + ")\n", + "model.config.use_cache = False\n", + "model.config.pretraining_tp = 1\n", + "\n", + "# Load LLaMA tokenizer\n", + "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n", + "tokenizer.pad_token = tokenizer.eos_token\n", + "tokenizer.padding_side = \"right\" # Fix weird overflow issue with fp16 training\n", + "\n", + "# Load LoRA configuration\n", + "peft_config = LoraConfig(\n", + " lora_alpha=lora_alpha,\n", + " lora_dropout=lora_dropout,\n", + " r=lora_r,\n", + " bias=\"none\",\n", + " task_type=\"CAUSAL_LM\",\n", + " target_modules=[\"q_proj\", \"v_proj\"]\n", + ")\n", + "\n", + "# Set training parameters\n", + "training_arguments = TrainingArguments(\n", + " output_dir=output_dir,\n", + " num_train_epochs=num_train_epochs,\n", + " per_device_train_batch_size=per_device_train_batch_size,\n", + " gradient_accumulation_steps=gradient_accumulation_steps,\n", + " optim=optim,\n", + " save_steps=save_steps,\n", + " logging_steps=logging_steps,\n", + " learning_rate=learning_rate,\n", + " weight_decay=weight_decay,\n", + " 
fp16=fp16,\n", + " bf16=bf16,\n", + " max_grad_norm=max_grad_norm,\n", + " max_steps=max_steps,\n", + " warmup_ratio=warmup_ratio,\n", + " group_by_length=group_by_length,\n", + " lr_scheduler_type=lr_scheduler_type,\n", + ")\n", + "\n", + "trainer = SFTTrainer(\n", + " model=model,\n", + " train_dataset=transformed,\n", + " peft_config=peft_config,\n", + " dataset_text_field=\"text\",\n", + " max_seq_length=max_seq_length,\n", + " tokenizer=tokenizer,\n", + " args=training_arguments,\n", + " packing=packing,\n", + ")\n", + "\n", + "trainer.train()\n", + "\n", + "trainer.model.save_pretrained(new_model)\n", + "\n", + "# Reload model in FP16 and merge it with LoRA weights\n", + "base_model = AutoModelForCausalLM.from_pretrained(\n", + " model_name,\n", + " low_cpu_mem_usage=True,\n", + " return_dict=True,\n", + " torch_dtype=torch.float16,\n", + " device_map=device_map,\n", + ")\n", + "model = PeftModel.from_pretrained(base_model, new_model)\n", + "model = model.merge_and_unload()\n", + "\n", + "\n", + "\n", + "model.push_to_hub(new_model, check_pr=True)\n", + "\n", + "tokenizer.push_to_hub(new_model, check_pr=True)\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8XceN-WKxRR-" + }, + "source": [ + "## Create a Container Manifest with Dockerfile\n", + "\n", + "Use the following `Dockerfile` to create a container image.\n", + "\n", + "```bash\n", + "FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04\n", + "\n", + "RUN apt-get update && \\\n", + " apt-get -y --no-install-recommends install python3-dev gcc python3-pip git && \\\n", + " rm -rf /var/lib/apt/lists/*\n", + "\n", + "RUN pip3 install --no-cache-dir accelerate bitsandbytes datasets transformers peft trl torch\n", + "\n", + "COPY finetune.py /finetune.py\n", + "\n", + "ENV PYTHONUNBUFFERED 1\n", + "\n", + "CMD python3 /finetune.py --device cuda\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "executionInfo": { + "elapsed": 273, + "status": "ok", + "timestamp": 1710457873562, + "user": { + "displayName": "", + "userId": "" + }, + "user_tz": 0 + }, + "id": "pCwBFXuTxaeU" + }, + "outputs": [], + "source": [ + "DOCKERFILE = \"\"\"\n", + "FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04\n", + "\n", + "RUN apt-get update && \\\n", + " apt-get -y --no-install-recommends install python3-dev gcc python3-pip git && \\\n", + " rm -rf /var/lib/apt/lists/*\n", + "\n", + "RUN pip3 install --no-cache-dir accelerate bitsandbytes datasets transformers peft trl torch\n", + "\n", + "COPY finetune.py /finetune.py\n", + "\n", + "ENV PYTHONUNBUFFERED 1\n", + "\n", + "CMD python3 /finetune.py --device cuda\n", + "\"\"\"\n", + "\n", + "with open(\"Dockerfile\", \"w\") as f:\n", + " f.write(DOCKERFILE)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZdqyOdL4tlOP" + }, + "source": [ + "### Containerize the Code with Docker and Cloud Build\n", + "\n", + "Using Cloud Build and the following Dockerfile we build and push the image in Artifact Registry Docker Repository." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dKk9B_8cwNrL" + }, + "outputs": [], + "source": [ + "# Create a Artifact Registry Repo\n", + "! 
gcloud artifacts repositories create gemma \\\n", + " --project={PROJECT_ID} \\\n", + " --repository-format=docker \\\n", + " --location=us \\\n", + " --description=\"Gemma Repo\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 636524, + "status": "ok", + "timestamp": 1710455573555, + "user": { + "displayName": "", + "userId": "" + }, + "user_tz": 0 + }, + "id": "mYo1j99-zXWe", + "outputId": "e6fefc69-27d5-4b98-fa4e-cc5584be97d4" + }, + "outputs": [], + "source": [ + "# Build and push the image using Cloud Build\n", + "! gcloud builds submit \\\n", + " --tag us-docker.pkg.dev/{PROJECT_ID}/gemma/finetune-gemma-gpu:1.0.0 ." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0c250872074f" + }, + "source": [ + "## Run Finetune Job on GKE Autopilot\n", + "\n", + "Use the YAML to run Gemma Finetune on GKE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "executionInfo": { + "elapsed": 255, + "status": "ok", + "timestamp": 1710457951607, + "user": { + "displayName": "", + "userId": "" + }, + "user_tz": 0 + }, + "id": "6psJZY_zUDgj" + }, + "outputs": [], + "source": [ + "K8S_JOB_YAML = f\"\"\"\n", + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License.\n", + "\n", + "apiVersion: batch/v1\n", + "kind: Job\n", + "metadata:\n", + " name: finetune-job\n", + " namespace: default\n", + "spec:\n", + " backoffLimit: 2\n", + " template:\n", + " metadata:\n", + " annotations:\n", + " kubectl.kubernetes.io/default-container: finetuner\n", + " spec:\n", + " terminationGracePeriodSeconds: 600\n", + " containers:\n", + " - name: finetuner\n", + " image: \n", + " resources:\n", + " limits:\n", + " nvidia.com/gpu: 8\n", + " env:\n", + " - name: MODEL_NAME\n", + " value: \"google/gemma-2b\"\n", + " - name: NEW_MODEL\n", + " value: \"gemma-2b-sql-kubecon-eu-2024\"\n", + " - name: LORA_R\n", + " value: \"8\"\n", + " - name: LORA_ALPHA\n", + " value: \"16\"\n", + " - name: TRAIN_BATCH_SIZE\n", + " value: \"1\"\n", + " - name: EVAL_BATCH_SIZE\n", + " value: \"2\"\n", + " - name: GRADIENT_ACCUMULATION_STEPS\n", + " value: \"2\"\n", + " - name: DATASET_LIMIT\n", + " value: \"1000\"\n", + " - name: MAX_SEQ_LENGTH\n", + " value: \"512\"\n", + " - name: LOGGING_STEPS\n", + " value: \"5\"\n", + " - name: HF_TOKEN\n", + " valueFrom:\n", + " secretKeyRef:\n", + " name: hf-secret\n", + " key: hf_api_token\n", + " volumeMounts:\n", + " - mountPath: /dev/shm\n", + " name: dshm\n", + " volumes:\n", + " - name: dshm\n", + " emptyDir:\n", + " medium: Memory\n", + " nodeSelector:\n", + " cloud.google.com/gke-accelerator: nvidia-l4\n", + " restartPolicy: OnFailure\n", + "\"\"\"\n", + "\n", + "with open(\"finetune.yaml\", \"w\") as f:\n", + " f.write(K8S_JOB_YAML)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": 
"https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 536, + "status": "ok", + "timestamp": 1710459879982, + "user": { + "displayName": "", + "userId": "" + }, + "user_tz": 0 + }, + "id": "yTmQqcRj9kz_", + "outputId": "5da8309a-56f0-493b-ce6e-1b786bfdf97f" + }, + "outputs": [], + "source": [ + "!kubectl apply -f finetune.yaml" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GYMesXi7WqCu" + }, + "source": [ + "#### Waiting for the container to create" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NKwbzKXuWvoL" + }, + "source": [ + "Use the command below to check on the status of the container." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 288136, + "status": "ok", + "timestamp": 1710460170411, + "user": { + "displayName": "", + "userId": "" + }, + "user_tz": 0 + }, + "id": "PXbPCrWtWqbk", + "outputId": "855c8166-1436-4529-e061-39c6d55d8eea" + }, + "outputs": [], + "source": [ + "! kubectl get po -l job-name=finetune-job -w" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MzINwFr_WVAB" + }, + "source": [ + "### View the logs from the running Job\n", + "\n", + "This will download the needed artifacts and run the finetuning code, this process will take close to 30 minutes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 32451, + "status": "ok", + "timestamp": 1710460809272, + "user": { + "displayName": "", + "userId": "" + }, + "user_tz": 0 + }, + "id": "gAkXSoy9Ufuo", + "outputId": "c41f3666-92a3-4627-96a9-9f4e15bc33a7" + }, + "outputs": [], + "source": [ + "! kubectl logs -f job/finetune-job" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VosC4Fbb01S9" + }, + "source": [ + "## Find the model on Huggingface\n", + "\n", + "If the Job ran successfully you can now go find the model on your Huggingface profile." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "af21a3cff1e0" + }, + "source": [ + "## Clean up resources" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "911406c1561e" + }, + "outputs": [], + "source": [ + "! kubectl delete job finetune-job" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mH9DlYk3IqA5" + }, + "outputs": [], + "source": [ + "! kubectl delete secrets hf-secret" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2acSuqPeNjJJ" + }, + "outputs": [], + "source": [ + "! 
gcloud container clusters delete {CLUSTER_NAME} \\\n", + " --region={REGION} \\\n", + " --quiet" + ] + } + ], + "metadata": { + "colab": { + "name": "model_garden_gemma_deployment_on_gke.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/tutorials/finetune-gemma-2b-on-l4/finetune.py b/tutorials/finetune-gemma-2b-on-l4/finetune.py new file mode 100644 index 000000000..8bb7b4832 --- /dev/null +++ b/tutorials/finetune-gemma-2b-on-l4/finetune.py @@ -0,0 +1,234 @@ +import os +import torch +from datasets import load_dataset, Dataset +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + BitsAndBytesConfig, + TrainingArguments, +) +from peft import LoraConfig, PeftModel + +from trl import SFTTrainer + +# The model that you want to train from the Hugging Face hub +model_name = os.getenv("MODEL_NAME", "google/gemma-2b") + +# The instruction dataset to use +dataset_name = "b-mc2/sql-create-context" + +# Fine-tuned model name +new_model = os.getenv("NEW_MODEL", "gemma-2b-sql") + +################################################################################ +# QLoRA parameters +################################################################################ + +# LoRA attention dimension +lora_r = int(os.getenv("LORA_R", "4")) + +# Alpha parameter for LoRA scaling +lora_alpha = int(os.getenv("LORA_ALPHA", "8")) + +# Dropout probability for LoRA layers +lora_dropout = 0.1 + +################################################################################ +# bitsandbytes parameters +################################################################################ + +# Activate 4-bit precision base model loading +use_4bit = True + +# Compute dtype for 4-bit base models +bnb_4bit_compute_dtype = "float16" + +# Quantization type (fp4 or nf4) +bnb_4bit_quant_type = "nf4" + +# Activate nested quantization for 4-bit base models (double quantization) +use_nested_quant = False + +################################################################################ +# TrainingArguments parameters +################################################################################ + +# Output directory where the model predictions and checkpoints will be stored +output_dir = "./results" + +# Number of training epochs +num_train_epochs = 1 + +# Enable fp16/bf16 training (set bf16 to True with an A100) +fp16 = True +bf16 = False + +# Batch size per GPU for training +per_device_train_batch_size = int(os.getenv("TRAIN_BATCH_SIZE", "1")) + +# Batch size per GPU for evaluation +per_device_eval_batch_size = int(os.getenv("EVAL_BATCH_SIZE", "2")) + +# Number of update steps to accumulate the gradients for +gradient_accumulation_steps = int(os.getenv("GRADIENT_ACCUMULATION_STEPS", "1")) + +# Enable gradient checkpointing +gradient_checkpointing = True + +# Maximum gradient normal (gradient clipping) +max_grad_norm = 0.3 + +# Initial learning rate (AdamW optimizer) +learning_rate = 2e-4 + +# Weight decay to apply to all layers except bias/LayerNorm weights +weight_decay = 0.001 + +# Optimizer to use +optim = "paged_adamw_32bit" + +# Learning rate schedule +lr_scheduler_type = "cosine" + +# Number of training steps (overrides num_train_epochs) +max_steps = -1 + +# Ratio of steps for a linear warmup (from 0 to learning rate) +warmup_ratio = 0.03 + +# Group sequences into batches with same length +# Saves memory and speeds up training considerably +group_by_length = True + +# Save checkpoint every X updates 
steps +save_steps = 0 + +# Log every X updates steps +logging_steps = int(os.getenv("LOGGING_STEPS", "50")) + +################################################################################ +# SFT parameters +################################################################################ + +# Maximum sequence length to use +max_seq_length = int(os.getenv("MAX_SEQ_LENGTH", "512")) + +# Pack multiple short examples in the same input sequence to increase efficiency +packing = False + +# Load the entire model on the GPU 0 +device_map = {'':torch.cuda.current_device()} + +# Set limit to a positive number +limit = int(os.getenv("DATASET_LIMIT", "5000")) + +dataset = load_dataset(dataset_name, split="train") +if limit != -1: + dataset = dataset.shuffle(seed=42).select(range(limit)) + + +def transform(data): + question = data['question'] + context = data['context'] + answer = data['answer'] + template = "Question: {question}\nContext: {context}\nAnswer: {answer}" + return {'text': template.format(question=question, context=context, answer=answer)} + + +transformed = dataset.map(transform) + +# Load tokenizer and model with QLoRA configuration +compute_dtype = getattr(torch, bnb_4bit_compute_dtype) + +bnb_config = BitsAndBytesConfig( + load_in_4bit=use_4bit, + bnb_4bit_quant_type=bnb_4bit_quant_type, + bnb_4bit_compute_dtype=compute_dtype, + bnb_4bit_use_double_quant=use_nested_quant, +) + +# Check GPU compatibility with bfloat16 +if compute_dtype == torch.float16 and use_4bit: + major, _ = torch.cuda.get_device_capability() + if major >= 8: + print("=" * 80) + print("Your GPU supports bfloat16") + print("=" * 80) + +# Load base model +# model = AutoModelForCausalLM.from_pretrained("google/gemma-7b") +model = AutoModelForCausalLM.from_pretrained( + model_name, + quantization_config=bnb_config, + device_map=device_map, + torch_dtype=torch.float16, +) +model.config.use_cache = False +model.config.pretraining_tp = 1 + +# Load LLaMA tokenizer +tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) +tokenizer.pad_token = tokenizer.eos_token +tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training + +# Load LoRA configuration +peft_config = LoraConfig( + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + r=lora_r, + bias="none", + task_type="CAUSAL_LM", + target_modules=["q_proj", "v_proj"] +) + +# Set training parameters +training_arguments = TrainingArguments( + output_dir=output_dir, + num_train_epochs=num_train_epochs, + per_device_train_batch_size=per_device_train_batch_size, + gradient_accumulation_steps=gradient_accumulation_steps, + optim=optim, + save_steps=save_steps, + logging_steps=logging_steps, + learning_rate=learning_rate, + weight_decay=weight_decay, + fp16=fp16, + bf16=bf16, + max_grad_norm=max_grad_norm, + max_steps=max_steps, + warmup_ratio=warmup_ratio, + group_by_length=group_by_length, + lr_scheduler_type=lr_scheduler_type, +) + +trainer = SFTTrainer( + model=model, + train_dataset=transformed, + peft_config=peft_config, + dataset_text_field="text", + max_seq_length=max_seq_length, + tokenizer=tokenizer, + args=training_arguments, + packing=packing, +) + +trainer.train() + +trainer.model.save_pretrained(new_model) + +# Reload model in FP16 and merge it with LoRA weights +base_model = AutoModelForCausalLM.from_pretrained( + model_name, + low_cpu_mem_usage=True, + return_dict=True, + torch_dtype=torch.float16, + device_map=device_map, +) +model = PeftModel.from_pretrained(base_model, new_model) +model = model.merge_and_unload() + + 
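+# push_to_hub picks up the Hugging Face token from the HF_TOKEN environment
+# variable (injected from the Kubernetes Secret in finetune.yaml); the token
+# needs write permission.
+# Optional, hypothetical smoke test of the merged model before uploading
+# (uses the same Question/Context/Answer template as training):
+#   prompt = "Question: How many heads are older than 56?\nContext: CREATE TABLE head (age INTEGER)\nAnswer:"
+#   inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+#   print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0], skip_special_tokens=True))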
+ +model.push_to_hub(new_model, check_pr=True) + +tokenizer.push_to_hub(new_model, check_pr=True) diff --git a/tutorials/finetune-gemma-2b-on-l4/finetune.yaml b/tutorials/finetune-gemma-2b-on-l4/finetune.yaml new file mode 100644 index 000000000..91247b1da --- /dev/null +++ b/tutorials/finetune-gemma-2b-on-l4/finetune.yaml @@ -0,0 +1,69 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: batch/v1 +kind: Job +metadata: + name: finetune-job + namespace: default +spec: + backoffLimit: 2 + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: finetuner + spec: + terminationGracePeriodSeconds: 600 + containers: + - name: finetuner + image: + resources: + limits: + nvidia.com/gpu: "8" + env: + - name: MODEL_NAME + value: "google/gemma-2b" + - name: NEW_MODEL + value: "" + - name: LORA_R + value: "8" + - name: LORA_ALPHA + value: "16" + - name: TRAIN_BATCH_SIZE + value: "1" + - name: EVAL_BATCH_SIZE + value: "2" + - name: GRADIENT_ACCUMULATION_STEPS + value: "2" + - name: DATASET_LIMIT + value: "1000" + - name: MAX_SEQ_LENGTH + value: "512" + - name: LOGGING_STEPS + value: "5" + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + volumeMounts: + - mountPath: /dev/shm + name: dshm + volumes: + - name: dshm + emptyDir: + medium: Memory + nodeSelector: + cloud.google.com/gke-accelerator: nvidia-l4 + restartPolicy: OnFailure diff --git a/tutorials/finetune-gemma-7b-on-tpu/Dockerfile b/tutorials/finetune-gemma-7b-on-tpu/Dockerfile new file mode 100644 index 000000000..a5b12af1b --- /dev/null +++ b/tutorials/finetune-gemma-7b-on-tpu/Dockerfile @@ -0,0 +1,9 @@ +FROM us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_20240229 + +RUN pip install -U git+https://github.com/huggingface/transformers.git +RUN pip install -U git+https://github.com/huggingface/trl.git +RUN pip install -U datasets peft + +COPY . . 
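+
+# PJRT_DEVICE=TPU and XLA_USE_SPMD=1 are not baked into the image; the
+# Kubernetes Job manifest (finetune.yaml) injects them at container runtime.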
+ +CMD python fsdp.py diff --git a/tutorials/finetune-gemma-7b-on-tpu/finetune-gemma-on-gke-using-TPU.ipynb b/tutorials/finetune-gemma-7b-on-tpu/finetune-gemma-on-gke-using-TPU.ipynb new file mode 100644 index 000000000..c72b6706c --- /dev/null +++ b/tutorials/finetune-gemma-7b-on-tpu/finetune-gemma-on-gke-using-TPU.ipynb @@ -0,0 +1,819 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7d9bbf86da5e" + }, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Run" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "99c1c3fc2ca5" + }, + "source": [ + "# Finetune Gemma on GKE using TPU" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3de7470326a2" + }, + "source": [ + "## Overview\n", + "\n", + "This notebook demonstrates downloading and fine tuning Gemma, open models from Google DeepMind using Pytorch and Hugging Face Libraries. In this notebook we will finetune and publish Gemma model on Hugging Face. In this guide we specifically use TPU V4 but this guide should also work for any TPU version with enough memory.\n", + "\n", + "\n", + "### Objective\n", + "\n", + "Finetune and Publish Gemma with Transformers and Lora on TPUs.\n", + "\n", + "### TPUs\n", + "\n", + "Tensor Processing Units (TPUs) are Google's custom-developed application-specific integrated circuits (ASICs) used to accelerate machine learning workloads.\n", + "\n", + "Before you use TPUs in GKE, we recommend that you complete the following learning path:\n", + "\n", + "Learn about [TPUs in GKE](https://cloud.google.com/tpu/docs/tpus-in-gke)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "264c07757582" + }, + "source": [ + "## Before you begin" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "35Dvbzb0hH3-" + }, + "source": [ + "### Configure Environment" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6c460088b873" + }, + "source": [ + "Set the following variables for the experiment environment." 
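+    ,
+    "\n\nNote: the defaults below assume a zone that offers TPU v4 pod slices (for example `us-central2-a`); adjust `REGION` and `LOCATION` if you use a different TPU location."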
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "855d6b96f291" + }, + "outputs": [], + "source": [ + "# The HuggingFace token used to download models.\n", + "# Make sure Token has Write Permission\n", + "HF_TOKEN = \"\" # @param {type:\"string\"}\n", + "\n", + "# The size of the model to launch\n", + "MODEL_SIZE = \"7b\" # @param [\"2b\", \"7b\"]\n", + "\n", + "# Cloud project id.\n", + "PROJECT_ID = \"\" # @param {type:\"string\"}\n", + "\n", + "# Region for launching clusters.\n", + "REGION = \"us-central2\" # @param {type:\"string\"}\n", + "\n", + "LOCATION = \"us-central2-a\" # @param {type:\"string\"}\n", + "\n", + "# The cluster name to create\n", + "CLUSTER_NAME = \"keras\" # @param {type:\"string\"}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 136218, + "status": "ok", + "timestamp": 1710694911968, + "user": { + "displayName": "", + "userId": "" + }, + "user_tz": -60 + }, + "id": "klPAnx16cVd7", + "outputId": "55548855-83ec-4f14-aabc-de0d5cf40b4d" + }, + "outputs": [], + "source": [ + "! gcloud auth login\n", + "! gcloud config set project \"$PROJECT_ID\"\n", + "! gcloud services enable container.googleapis.com\n", + "# If using in public colab need to login with gcloud auth login\n", + "\n", + "# Add kubectl to the set of available tools.\n", + "! mkdir -p /tools/google-cloud-sdk/.install\n", + "! gcloud components install kubectl --quiet" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e828eb320337" + }, + "source": [ + "### Create a GKE cluster and a node pool" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PKhdKv1vK9Lg" + }, + "source": [ + "GKE creates the following resources for the model based on the MODEL_SIZE environment variable set above.\n", + "\n", + "- Standard cluster\n", + "\n", + "---\n", + "\n", + "\n", + "\n", + "If you already have a cluster, you can skip to `Use an existing GKE cluster` instead." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "12cd25839741" + }, + "outputs": [], + "source": [ + "! gcloud container --project {CLUSTER_NAME} clusters create {CLUSTER_NAME} \\\n", + " --cluster-version \"1.29.2-gke.1217000\" \\\n", + " --release-channel \"rapid\" \\\n", + " --machine-type \"n1-standard-4\" \\\n", + " --num-nodes \"1\" \\\n", + " --node-locations {LOCATION}\n", + "\n", + "! gcloud container --project {CLUSTER_NAME} node-pools create \"tpu\" \\\n", + " --cluster {CLUSTER_NAME} \\\n", + " --node-version \"1.29.2-gke.1217000\" \\\n", + " --machine-type \"ct4p-hightpu-4t\" \\\n", + " --num-nodes \"4\" \\\n", + " --placement-type=COMPACT \\\n", + " --tpu-topology=2x2x4" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6ydvYk7FLJz_" + }, + "source": [ + "### Use an existing GKE cluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 1572, + "status": "ok", + "timestamp": 1710695045224, + "user": { + "displayName": "", + "userId": "" + }, + "user_tz": -60 + }, + "id": "DmpNpYF-LRut", + "outputId": "c8aa9c79-fc87-479a-814a-b79704c99684" + }, + "outputs": [], + "source": [ + "! 
gcloud container clusters \\\n", + " get-credentials {CLUSTER_NAME} \\\n", + " --location {LOCATION}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2cc825514deb" + }, + "source": [ + "### Create Kubernetes secret for Hugging Face credentials" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rgZfNOSyOY7_" + }, + "source": [ + "Create a Kubernetes Secret that contains the Hugging Face token." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 4478, + "status": "ok", + "timestamp": 1710695064954, + "user": { + "displayName": "", + "userId": "" + }, + "user_tz": -60 + }, + "id": "b42bd4fa2b2d", + "outputId": "63b871d5-606b-4ee1-9cae-5a254c62f61f" + }, + "outputs": [], + "source": [ + "! kubectl create secret generic hf-secret \\\n", + "--from-literal=hf_api_token={HF_TOKEN} \\\n", + "--dry-run=client -o yaml | kubectl apply -f -" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gOADJkZAmt08" + }, + "source": [ + "## The Dataset\n", + "We use Lora to quickly finetune Gemma with `b-mc2/sql-create-context` dataset.\n", + "\n", + "This dataset has the following structure.\n", + "\n", + "| Answer | Question | Context |\n", + "|-----------------------------------------------------|-------------------------------------------------------------------------------|-------------------------------------------------------------------|\n", + "| SELECT COUNT(*) FROM head WHERE age > 56 | How many heads of the departments are older than 56 ? | CREATE TABLE head (age INTEGER) |\n", + "| SELECT name, born_state, age FROM head ORDER BY age | List the name, born state and age of the heads of departments ordered by age. | CREATE TABLE head (name VARCHAR, born_state VARCHAR, age VARCHAR) |\n", + "\n", + "We will finetune `google/gemma-7b` model to get SQL queries based on questions and context." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5JrfaiA5tAtW" + }, + "source": [ + "## Finetuning Gemma on GKE using GPU with Pytorch\n", + "\n", + "In this demo we will use Pytorch-XLA and Huggingface libraries to finetune Gemma. 
Save the following code in a file named `fsdp.py`\n", + "\n", + "```python\n", + "# Make sure to run the script with the following envs:\n", + "# PJRT_DEVICE=TPU XLA_USE_SPMD=1\n", + "import os\n", + "import torch\n", + "import torch_xla\n", + "\n", + "import torch_xla.core.xla_model as xm\n", + "\n", + "from datasets import load_dataset\n", + "from peft import LoraConfig, PeftModel\n", + "from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments\n", + "from trl import SFTTrainer\n", + "\n", + "import transformers\n", + "\n", + "print(\"TORCH: \", torch.__version__)\n", + "print(\"TRANSFORMERS: \", transformers.__version__)\n", + "\n", + "# Set up TPU device.\n", + "device = xm.xla_device()\n", + "model_id = os.getenv(\"MODEL_ID\",\"google/gemma-7b\")\n", + "new_model_id = os.getenv(\"NEW_MODEL_ID\",\"gemma-7b-sql-context\")\n", + "\n", + "job_index = os.getenv(\"JOB_COMPLETION_INDEX\")\n", + "\n", + "print(\"### LOAD TOKENIZER ###\")\n", + "# Load the pretrained model and tokenizer.\n", + "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", + "tokenizer.pad_token = tokenizer.eos_token\n", + "tokenizer.padding_side = \"right\" # Fix weird overflow issue with fp16 training\n", + "\n", + "\n", + "print(\"### LOAD MODEL ###\")\n", + "model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)\n", + "\n", + "print(model)\n", + "\n", + "# Set up PEFT LoRA for fine-tuning.\n", + "lora_config = LoraConfig(\n", + " r=8,\n", + " lora_alpha = 16,\n", + " lora_dropout = 0.1,\n", + " bias=\"none\",\n", + " target_modules=[\"q_proj\", \"v_proj\"],\n", + " task_type=\"CAUSAL_LM\",\n", + ")\n", + "\n", + "print(\"### LOAD DATASET ###\")\n", + "\n", + "limit = int(os.getenv(\"LIMIT\", \"5000\"))\n", + "\n", + "dataset_name = \"b-mc2/sql-create-context\"\n", + "# Load the dataset and format it for training.\n", + "dataset = load_dataset(dataset_name, split=\"train\")\n", + "dataset = dataset.shuffle(seed=42).select(range(limit))\n", + "\n", + "def transform(data):\n", + " question = data['question']\n", + " context = data['context']\n", + " answer = data['answer']\n", + " template = \"Question: {question}\\nContext: {context}\\nAnswer: {answer}\"\n", + " return {'text': template.format(question=question, context=context, answer=answer)}\n", + "\n", + "print(\"### TRANSFORM DATASET ###\")\n", + "dataset = dataset.map(transform)\n", + "\n", + "\n", + "max_seq_length = 512\n", + "\n", + "# Set up the FSDP config. 
To enable FSDP via SPMD, set xla_fsdp_v2 to True.\n", + "fsdp_config = {\"fsdp_transformer_layer_cls_to_wrap\": [\n", + " \"GemmaDecoderLayer\"\n", + " ],\n", + " \"xla\": True,\n", + " \"xla_fsdp_v2\": True,\n", + " \"xla_fsdp_grad_ckpt\": True}\n", + "\n", + "print(\"### CREATE SFTTRAINER###\")\n", + "# Finally, set up the trainer and train the model.\n", + "trainer = SFTTrainer(\n", + " model=model,\n", + " train_dataset=dataset,\n", + " args=TrainingArguments(\n", + " per_device_train_batch_size=64, # This is actually the global batch size for SPMD.\n", + " num_train_epochs=1,\n", + " max_steps=-1,\n", + " output_dir=\"./output\",\n", + " optim=\"adafactor\",\n", + " logging_steps=1,\n", + " dataloader_drop_last = True, # Required for SPMD.\n", + " fsdp=\"full_shard\",\n", + " fsdp_config=fsdp_config,\n", + " ),\n", + " peft_config=lora_config,\n", + " dataset_text_field=\"text\",\n", + " max_seq_length=max_seq_length,\n", + " packing=True,\n", + ")\n", + "\n", + "\n", + "print(\"### STARTING TRAINING ###\")\n", + "trainer.train()\n", + "print(\"### TRAINING ENDED ###\")\n", + "\n", + "\n", + "print(\"JOB INDEX: \", job_index)\n", + "\n", + "print(\"### COMBINE AND MODEL WEIGHT ###\")\n", + "trainer.save_model(new_model_id)\n", + "# Reload model in FP16 and merge it with LoRA weights\n", + "base_model = AutoModelForCausalLM.from_pretrained(\n", + " model_id,\n", + " low_cpu_mem_usage=True,\n", + " return_dict=True,\n", + " torch_dtype=torch.bfloat16,\n", + ")\n", + "\n", + "model = PeftModel.from_pretrained(base_model, new_model_id)\n", + "model = model.merge_and_unload()\n", + "\n", + "print(\"### DONE MERGING ###\")\n", + "\n", + "if job_index == \"0\":\n", + " print(\"### UPLOAD MODEL TO HUGGING FACE ###\")\n", + " # model.config.to_json_file(\"adapter_config.json\")\n", + " print(model)\n", + " os.listdir(new_model_id)\n", + " model.push_to_hub(repo_id=new_model_id)\n", + " tokenizer.push_to_hub(repo_id=new_model_id)\n", + "else:\n", + " print(\"Model will be uploaded by job 0\")\n", + "\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8XceN-WKxRR-" + }, + "source": [ + "## Create a Container Manifest with Dockerfile\n", + "\n", + "Use the following `Dockerfile` to create a container image.\n", + "\n", + "```bash\n", + "FROM us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_20240229\n", + "\n", + "RUN pip install -U git+https://github.com/huggingface/transformers.git\n", + "RUN pip install -U git+https://github.com/huggingface/trl.git\n", + "RUN pip install -U datasets peft\n", + "\n", + "COPY . .\n", + "\n", + "CMD python fsdp.py\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "executionInfo": { + "elapsed": 482, + "status": "ok", + "timestamp": 1710701636584, + "user": { + "displayName": "", + "userId": "" + }, + "user_tz": -60 + }, + "id": "pCwBFXuTxaeU" + }, + "outputs": [], + "source": [ + "DOCKERFILE = \"\"\"\n", + "FROM us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_20240229\n", + "\n", + "RUN pip install -U git+https://github.com/huggingface/transformers.git\n", + "RUN pip install -U git+https://github.com/huggingface/trl.git\n", + "RUN pip install -U datasets peft\n", + "\n", + "COPY . 
.\n", + "\n", + "CMD python fsdp.py\n", + "\"\"\"\n", + "\n", + "with open(\"Dockerfile\", \"w\") as f:\n", + " f.write(DOCKERFILE)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZdqyOdL4tlOP" + }, + "source": [ + "### Containerize the Code with Docker and Cloud Build\n", + "\n", + "Using Cloud Build and the following Dockerfile we build and push the image in Artifact Registry Docker Repository." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dKk9B_8cwNrL" + }, + "outputs": [], + "source": [ + "# Create a Artifact Registry Repo\n", + "! gcloud artifacts repositories create gemma \\\n", + " --project={PROJECT_ID} \\\n", + " --repository-format=docker \\\n", + " --location=us \\\n", + " --description=\"Gemma Repo\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 172237, + "status": "ok", + "timestamp": 1710698643369, + "user": { + "displayName": "", + "userId": "" + }, + "user_tz": -60 + }, + "id": "mYo1j99-zXWe", + "outputId": "2f71febf-d982-494f-d5bb-e8262c827798" + }, + "outputs": [], + "source": [ + "# Build and push the image using Cloud Build\n", + "! gcloud builds submit \\\n", + " --tag us-docker.pkg.dev/{PROJECT_ID}/gemma/finetune-gemma-tpu:1.0.1 ." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0c250872074f" + }, + "source": [ + "## Run Finetune Job on GKE\n", + "\n", + "Use the YAML to run Gemma Finetune on GKE. Notice we have a job with a headless service. This is because we have a TPU V4 2x2x4 slice which is 4 Nodes with 4 TPU devices each connected via high speed interconnect. We create a indexed job and headless service to give each instance of the job to have be able to communicate with each other via network." 
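+    ,
+    "\n\nWith `completionMode: Indexed`, each replica reads its rank from the `JOB_COMPLETION_INDEX` environment variable (fsdp.py only uploads the merged model from rank 0), and the headless Service gives the Pods stable hostnames such as `tpu-job-0.headless-svc` so the slice replicas can reach each other."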
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6psJZY_zUDgj" + }, + "outputs": [], + "source": [ + "K8S_JOB_YAML = f\"\"\"\n", + "apiVersion: v1\n", + "kind: Service\n", + "metadata:\n", + " name: headless-svc\n", + "spec:\n", + " clusterIP: None\n", + " selector:\n", + " job-name: tpu-job\n", + "---\n", + "apiVersion: batch/v1\n", + "kind: Job\n", + "metadata:\n", + " name: tpu-job\n", + "spec:\n", + " backoffLimit: 0\n", + " completions: 4\n", + " parallelism: 4\n", + " completionMode: Indexed\n", + " template:\n", + " spec:\n", + " subdomain: headless-svc\n", + " restartPolicy: Never\n", + " nodeSelector:\n", + " cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice\n", + " cloud.google.com/gke-tpu-topology: 2x2x4\n", + " containers:\n", + " - name: tpu-job\n", + " image: us-docker.pkg.dev/{PROJECT_ID}/gemma/finetune-gemma-tpu:1.0.1\n", + " ports:\n", + " - containerPort: 8471 # Default port using which TPU VMs communicate\n", + " - containerPort: 8431 # Port to export TPU runtime metrics, if supported.\n", + " securityContext:\n", + " privileged: true\n", + " resources:\n", + " requests:\n", + " google.com/tpu: 4\n", + " limits:\n", + " google.com/tpu: 4\n", + " env:\n", + " - name: PJRT_DEVICE\n", + " value: \"TPU\"\n", + " - name: XLA_USE_SPMD\n", + " value: \"1\"\n", + " - name: XLA_USE_BF16\n", + " value: \"1\"\n", + " - name: HF_TOKEN\n", + " valueFrom:\n", + " secretKeyRef:\n", + " name: hf-secret\n", + " key: hf_api_token\n", + " - name: NEW_MODEL_ID\n", + " value: gemma-7b-sql-kubecon-eu-2024\n", + " - name: LIMIT\n", + " value: \"10000\"\n", + " volumeMounts:\n", + " - mountPath: /dev/shm\n", + " name: dshm\n", + " volumes:\n", + " - name: dshm\n", + " emptyDir:\n", + " medium: Memory\n", + "\"\"\"\n", + "\n", + "with open(\"finetune.yaml\", \"w\") as f:\n", + " f.write(K8S_JOB_YAML)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 866, + "status": "ok", + "timestamp": 1710698770574, + "user": { + "displayName": "", + "userId": "" + }, + "user_tz": -60 + }, + "id": "yTmQqcRj9kz_", + "outputId": "ff1fc799-8a02-4614-aaa1-32ebdab5eb1d" + }, + "outputs": [], + "source": [ + "!kubectl apply -f finetune.yaml" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GYMesXi7WqCu" + }, + "source": [ + "#### Waiting for the container to create" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NKwbzKXuWvoL" + }, + "source": [ + "Use the command below to check on the status of the container." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 587, + "status": "ok", + "timestamp": 1710699723828, + "user": { + "displayName": "", + "userId": "" + }, + "user_tz": -60 + }, + "id": "PXbPCrWtWqbk", + "outputId": "f0747d91-d25c-4183-c51a-a9da91c723e0" + }, + "outputs": [], + "source": [ + "! kubectl get po -l job-name=tpu-job" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MzINwFr_WVAB" + }, + "source": [ + "### View the logs from the running Job\n", + "\n", + "This will download the needed artifacts and run the finetuning code, this process will take close to 30 minutes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gAkXSoy9Ufuo" + }, + "outputs": [], + "source": [ + "! 
kubectl logs -f job/tpu-job" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VosC4Fbb01S9" + }, + "source": [ + "## Find the model on Huggingface\n", + "\n", + "If the Job ran successfully you can now go find the model on your Huggingface profile." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "af21a3cff1e0" + }, + "source": [ + "## Clean up resources" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 1528, + "status": "ok", + "timestamp": 1710700348433, + "user": { + "displayName": "", + "userId": "" + }, + "user_tz": -60 + }, + "id": "911406c1561e", + "outputId": "8eb81d2a-7808-4c36-cde5-4511759e23fb" + }, + "outputs": [], + "source": [ + "! kubectl delete job tpu-job\n", + "! kubectl delete svc headless-svc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mH9DlYk3IqA5" + }, + "outputs": [], + "source": [ + "! kubectl delete secrets hf-secret" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2acSuqPeNjJJ" + }, + "outputs": [], + "source": [ + "! gcloud container clusters delete {CLUSTER_NAME} \\\n", + " --region={LOCATION} \\\n", + " --quiet" + ] + } + ], + "metadata": { + "colab": { + "name": "model_garden_gemma_deployment_on_gke.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/tutorials/finetune-gemma-7b-on-tpu/finetune.yaml b/tutorials/finetune-gemma-7b-on-tpu/finetune.yaml new file mode 100644 index 000000000..b3603980e --- /dev/null +++ b/tutorials/finetune-gemma-7b-on-tpu/finetune.yaml @@ -0,0 +1,61 @@ +apiVersion: v1 +kind: Service +metadata: + name: headless-svc +spec: + clusterIP: None + selector: + job-name: tpu-job +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: tpu-job +spec: + backoffLimit: 0 + completions: 4 + parallelism: 4 + completionMode: Indexed + template: + spec: + subdomain: headless-svc + restartPolicy: Never + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice + cloud.google.com/gke-tpu-topology: 2x2x4 + containers: + - name: tpu-job + image: + ports: + - containerPort: 8471 # Default port using which TPU VMs communicate + - containerPort: 8431 # Port to export TPU runtime metrics, if supported. 
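+        # Privileged mode plus the google.com/tpu requests below expose the
+        # node's 4 TPU chips to the container; with parallelism: 4 the Job
+        # spans the full 2x2x4 v4 slice created above.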
+ securityContext: + privileged: true + resources: + requests: + google.com/tpu: "4" + limits: + google.com/tpu: "4" + env: + - name: PJRT_DEVICE + value: "TPU" + - name: XLA_USE_SPMD + value: "1" + - name: XLA_USE_BF16 + value: "1" + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: NEW_MODEL_ID + value: + - name: LIMIT + value: "10000" + volumeMounts: + - mountPath: /dev/shm + name: dshm + volumes: + - name: dshm + emptyDir: + medium: Memory diff --git a/tutorials/finetune-gemma-7b-on-tpu/fsdp.py b/tutorials/finetune-gemma-7b-on-tpu/fsdp.py new file mode 100644 index 000000000..4aec14a5b --- /dev/null +++ b/tutorials/finetune-gemma-7b-on-tpu/fsdp.py @@ -0,0 +1,131 @@ +# Make sure to run the script with the following envs: +# PJRT_DEVICE=TPU XLA_USE_SPMD=1 +import os +import torch +import torch_xla + +import torch_xla.core.xla_model as xm + +from datasets import load_dataset +from peft import LoraConfig, PeftModel +from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments +from trl import SFTTrainer + +import transformers + +print("TORCH: ", torch.__version__) +print("TRANSFORMERS: ", transformers.__version__) + +# Set up TPU device. +device = xm.xla_device() +model_id = os.getenv("MODEL_ID","google/gemma-7b") +new_model_id = os.getenv("NEW_MODEL_ID","gemma-7b-sql-context") + +job_index = os.getenv("JOB_COMPLETION_INDEX") + +print("### LOAD TOKENIZER ###") +# Load the pretrained model and tokenizer. +tokenizer = AutoTokenizer.from_pretrained(model_id) +tokenizer.pad_token = tokenizer.eos_token +tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training + + +print("### LOAD MODEL ###") +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16) + +print(model) + +# Set up PEFT LoRA for fine-tuning. +lora_config = LoraConfig( + r=8, + lora_alpha = 16, + lora_dropout = 0.1, + bias="none", + target_modules=["q_proj", "v_proj"], + task_type="CAUSAL_LM", +) + +print("### LOAD DATASET ###") + +limit = int(os.getenv("LIMIT", "5000")) + +dataset_name = "b-mc2/sql-create-context" +# Load the dataset and format it for training. +dataset = load_dataset(dataset_name, split="train") +dataset = dataset.shuffle(seed=42).select(range(limit)) + +def transform(data): + question = data['question'] + context = data['context'] + answer = data['answer'] + template = "Question: {question}\nContext: {context}\nAnswer: {answer}" + return {'text': template.format(question=question, context=context, answer=answer)} + +print("### TRANSFORM DATASET ###") +dataset = dataset.map(transform) + + +max_seq_length = 512 + +# Set up the FSDP config. To enable FSDP via SPMD, set xla_fsdp_v2 to True. +fsdp_config = {"fsdp_transformer_layer_cls_to_wrap": [ + "GemmaDecoderLayer" + ], + "xla": True, + "xla_fsdp_v2": True, + "xla_fsdp_grad_ckpt": True} + +print("### CREATE SFTTRAINER###") +# Finally, set up the trainer and train the model. +trainer = SFTTrainer( + model=model, + train_dataset=dataset, + args=TrainingArguments( + per_device_train_batch_size=64, # This is actually the global batch size for SPMD. + num_train_epochs=1, + max_steps=-1, + output_dir="./output", + optim="adafactor", + logging_steps=1, + dataloader_drop_last = True, # Required for SPMD. 
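+        # "full_shard" together with the xla/xla_fsdp_v2 flags in fsdp_config
+        # above routes sharding through torch_xla's SPMD-based FSDP.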
+ fsdp="full_shard", + fsdp_config=fsdp_config, + ), + peft_config=lora_config, + dataset_text_field="text", + max_seq_length=max_seq_length, + packing=True, +) + + +print("### STARTING TRAINING ###") +trainer.train() +print("### TRAINING ENDED ###") + + +print("JOB INDEX: ", job_index) + +print("### COMBINE AND MODEL WEIGHT ###") +trainer.save_model(new_model_id) +# Reload model in FP16 and merge it with LoRA weights +base_model = AutoModelForCausalLM.from_pretrained( + model_id, + low_cpu_mem_usage=True, + return_dict=True, + torch_dtype=torch.bfloat16, +) + +model = PeftModel.from_pretrained(base_model, new_model_id) +model = model.merge_and_unload() + +print("### DONE MERGING ###") + +if job_index == "0": + print("### UPLOAD MODEL TO HUGGING FACE ###") + # model.config.to_json_file("adapter_config.json") + print(model) + os.listdir(new_model_id) + model.push_to_hub(repo_id=new_model_id) + tokenizer.push_to_hub(repo_id=new_model_id) +else: + print("Model will be uploaded by job 0")
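+
+# Once all replicas complete, the merged model and tokenizer should appear under
+# your Hugging Face account as NEW_MODEL_ID (uploaded by the rank-0 replica) and
+# can be reloaded with AutoModelForCausalLM.from_pretrained("<your-user>/<new-model-id>").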