diff --git a/finetune.ipynb b/finetune.ipynb index 086ffd7..8b39919 100644 --- a/finetune.ipynb +++ b/finetune.ipynb @@ -50,7 +50,25 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Data needs to be in `jsonl` format with each line containing a json encoded string containing two keys `prompt` and `completion`\n", + "We support two different data formats:\n", + "\n", + "### `Chat`\n", + "\n", + "Data needs to be in `jsonl` format with each line containing a whole conversation in OpenAI chat format i.e. each line contains a key called `messages`. Each `messages` key contains a list of messages, where each message is a dictionary with `role` and `content` keys. The `role` key can be either `user`, `assistant` or `system` and the `content` key contains the message content.\n", + "\n", + "```jsonl\n", + "{\"messages\": [{\"role\": \"system\", \"content\": \"Marv is a factual chatbot that is also sarcastic.\"}, {\"role\": \"user\", \"content\": \"What's the capital of France?\"}, {\"role\": \"assistant\", \"content\": \"Paris\"}, {\"role\": \"user\", \"content\": \"Can you be more sarcastic?\"}, {\"role\": \"assistant\", \"content\": \"Paris, as if everyone doesn't know that already.\"}]}\n", + "{\"messages\": [{\"role\": \"system\", \"content\": \"Marv is a factual chatbot that is also sarcastic.\"}, {\"role\": \"user\", \"content\": \"Who wrote 'Romeo and Juliet'?\"}, {\"role\": \"assistant\", \"content\": \"William Shakespeare\"}, {\"role\": \"user\", \"content\": \"Can you be more sarcastic?\"}, {\"role\": \"assistant\", \"content\": \"Oh, just some guy named William Shakespeare. 
Ever heard of him?\"}]}\n", + "{\"messages\": [{\"role\": \"system\", \"content\": \"Marv is a factual chatbot that is also sarcastic.\"}, {\"role\": \"user\", \"content\": \"How far is the Moon from Earth?\"}, {\"role\": \"assistant\", \"content\": \"384,400 kilometers\"}, {\"role\": \"user\", \"content\": \"Can you be more sarcastic?\"}, {\"role\": \"assistant\", \"content\": \"Around 384,400 kilometers. Give or take a few, like that really matters.\"}]}\n", + "...\n", + "```\n", + "\n", + "\n", + "Reference: https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset\n", + "\n", + "\n", + "### `Completion`\n", + "Data needs to be in `jsonl` format with each line containing a JSON object with two keys, `prompt` and `completion`.\n", "\n", "```jsonl\n", "{\"prompt\": \"What is 2 + 2?\", \"completion\": \"The answer to 2 + 2 is 4\"}\n", @@ -59,7 +77,11 @@ "...\n", "```\n", "\n", - "Once you have your data on `.jsonl` files, you can upload them to the file tree on the left and change the `train_data` and `eval_data` variables in the `Data Parameters` section\n", + "Reference: https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset\n", + "\n", + "### Uploading data to the notebook\n", + "\n", + "Once you have your data in `.jsonl` files, you can upload them to the file tree on the left and change the `train_data_uri` and `eval_data_uri` variables in the `Data Parameters` section\n", "\n", "![Upload Data](./assets/upload-data.png)\n", "\n", @@ -97,33 +119,34 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Note: Only first 100 data points will be used. This is okay for quick testing. 
To use all data points please set `max_num_samples` to 0\n" - ] - } - ], + "outputs": [], "source": [ + "from typing import List, Dict, Optional, Any\n", + "from data_utils import DatasetType\n", + "\n", + "# Type of dataset - Either `completion` or `chat`\n", + "dataset_type = DatasetType.completion\n", + "\n", "# URI to training data. Can be a file on disk or an mlfoundry artifact fqn\n", - "train_data = \"./standford_alpaca_train_49k.jsonl\"\n", + "train_data_uri: str = \"./standford_alpaca_train_49k.jsonl\"\n", "\n", "# URI to evaluation data. Can be a file on disk or an mlfoundry artifact fqn. \n", - "# Set to \"NA\" if you want to split from train data\n", - "eval_data = \"./standford_alpaca_test_2k.jsonl\"\n", + "# Set to `None` if you want to split from train data\n", + "eval_data_uri: Optional[str] = \"./standford_alpaca_test_2k.jsonl\"\n", "\n", - "# When eval_data is set to \"NA\", use this portion of the train_data to use as eval\n", + "# When `eval_data_uri` is set to `None`, use this portion of the training data as eval\n", "eval_size = 0.1\n", "\n", - "# How many samples to use for training. 0 means all data. Useful to test quickly\n", - "max_num_samples = 0\n", + "# If your dataset is small (< 10 examples), set this to False\n", + "sample_packing = True\n", "\n", - "if max_num_samples != 0:\n", - " print(f\"Note: Only first {max_num_samples} data points will be used. This is okay for quick testing. To use all data points please set `max_num_samples` to 0\")" + "# Maximum number of training steps. `None` means no limit (use all data). Useful to test quickly\n", + "max_steps: Optional[int] = None\n", + "\n", + "if max_steps is not None:\n", + " print(f\"Note: max_steps is set, this might not use the entire training data. This is okay for quick testing. 
To use all data points please set `max_steps` to `None`\")" ] }, { @@ -158,9 +181,6 @@ "metadata": {}, "outputs": [], "source": [ - "import os\n", - "import torch\n", - "\n", "# Huggingface hub model id to finetune e.g. \"stas/tiny-random-llama-2\"\n", "# If you created this notebook instance from Truefoundry's Model Catalogue, the model id will be set in `launch_parameters`\n", "model_id = launch_parameters.model_id\n", @@ -182,21 +202,19 @@ "metadata": {}, "outputs": [], "source": [ - "# Enable LoRa with Quantization\n", - "use_qlora = True\n", + "adapter = \"qlora\"\n", "\n", - "# If you want to disable quantization, set `use_qlora` to False and set `use_lora` to True\n", - "use_lora = False\n", - "\n", - "# qlora r. Increasing this will increase GPU memory requirement and training time but can give better results\n", + "# lora r. Increasing this will increase GPU memory requirement and training time but can give better results\n", "lora_r = 32\n", "\n", - "# qlora alpha\n", + "# lora alpha\n", "lora_alpha = max(16, 2 * lora_r)\n", "\n", + "# Whether to apply Lora to all linear layers\n", + "lora_target_linear = True\n", "\n", - "if use_qlora and use_lora:\n", - " raise ValueError(\"Both `use_qlora` and `use_lora` cannot be True at the same time!\")" + "# The names of the modules to apply Lora to. These will be added to modules found by `lora_target_linear` if that is enabled\n", + "lora_target_modules: Optional[List[str]] = None" ] }, { @@ -213,7 +231,7 @@ "outputs": [], "source": [ "# Where to dump checkpoints and model\n", - "output_dir = \"./model\"\n", + "output_dir = \"./outputs\"\n", "\n", "# If to delete `output_dir` before starting\n", "cleanup_output_dir_on_start = False\n", @@ -221,23 +239,23 @@ "# Max Sequence Length. 
\n", "# Increasing this will allow longer sequences but will significantly increase GPU memory requirement and training time.\n", "# This cannot be greater than model's max sequence length\n", - "max_length = launch_parameters.max_length\n", + "max_sequence_length = launch_parameters.max_length\n", "\n", - "# Max batch size per GPU. \n", + "# Batch size per GPU. \n", "# Increasing this will increase GPU memory requirement and training time\n", - "per_device_train_batch_size = launch_parameters.batch_size\n", + "micro_batch_size = launch_parameters.batch_size\n", "\n", "# Learning rate\n", "learning_rate = 0.00003\n", "\n", "# How many epochs to run training for\n", - "num_train_epochs = 10\n", + "num_epochs = 10\n", "\n", "# How often to evaluate. Value less than 1 denotes every X% of total run\n", - "eval_steps = 0.05\n", + "eval_steps = 0.1\n", "\n", "# How often to save checkpoints. Value less than 1 denotes every X% of total run\n", - "save_steps = 0.05" + "save_steps = 0.1" ] }, { @@ -253,10 +271,11 @@ "metadata": {}, "outputs": [], "source": [ + "import os\n", "from mlfoundry_utils import generate_run_name, get_or_create_run\n", "\n", "# Enable reporting metrics to mlfoundry\n", - "mlfoundry_enable_reporting = True\n", + "mlfoundry_enable_reporting = False\n", "\n", "# Which ML Repo to log metrics and checkpoints to. 
\n", "# You can create new ML Repos from the https://.truefoundry.cloud/mlfoundry page\n", @@ -267,7 +286,7 @@ "mlfoundry_log_checkpoints = True\n", "\n", "# Run to which metrics and checkpoints will be logged\n", - "mlfoundry_run_name = generate_run_name(model_id)\n", + "mlfoundry_run_name = generate_run_name(model_id, seed=os.getpid())\n", "\n", "# If to upload checkpoints to ML Repo when they are saved\n", "mlfoundry_checkpoint_artifact_name = f\"ckpt-{mlfoundry_run_name}\"\n", @@ -295,7 +314,7 @@ " from urllib.parse import urljoin\n", " from tensorboard import notebook\n", "\n", - " tb_logs = os.path.join(\".\", \"tensorboard_logs\")\n", + " tb_logs = os.path.join(os.path.abspath(output_dir), \"model\", \"runs\")\n", " os.makedirs(tb_logs, exist_ok=True)\n", " os.environ[\"TENSORBOARD_PROXY_URL\"] = urljoin(os.getenv(\"NB_PREFIX\", \"/\"), \"proxy/%PORT%/\")\n", " notebook.start(f\"--logdir {tb_logs} --reload_interval 30.0 --reload_multifile True\")\n", @@ -317,55 +336,56 @@ "metadata": {}, "outputs": [], "source": [ + "import os\n", + "import torch\n", + "\n", "# Mixed Precision Training. 
We automatically select the precision based on GPU capability\n", "mixed_precision = \"bf16\" if torch.cuda.is_bf16_supported() else \"fp16\"\n", - "bf16 = (mixed_precision == \"bf16\")\n", - "fp16 = (mixed_precision == \"fp16\")\n", "\n", "COMMAND = f\"\"\"\n", "accelerate launch \\\n", "--mixed_precision {mixed_precision} \\\n", "--use_deepspeed \\\n", "train.py \\\n", + "config-base.yaml \\\n", "--deepspeed ./deepspeed_configs/3_ds_z2_config.json \\\n", - "--bf16 {bf16} \\\n", - "--fp16 {fp16} \\\n", - "--model_id {model_id} \\\n", + "--flash_attention True \\\n", + "--gradient_checkpointing True \\\n", + "--base_model {model_id} \\\n", "--output_dir {output_dir} \\\n", - "--train_data {train_data} \\\n", - "--eval_data {eval_data} \\\n", - "--eval_size {eval_size} \\\n", - "--max_num_samples {max_num_samples} \\\n", - "--train_on_prompt False \\\n", - "--max_length {max_length} \\\n", - "--use_qlora {use_qlora} \\\n", - "--use_lora {use_lora} \\\n", - "--qlora_bit_length 4 \\\n", - "--lora_target_modules auto \\\n", + "--dataset_type {dataset_type} \\\n", + "--train_data_uri {train_data_uri} \\\n", + "--val_data_uri {eval_data_uri} \\\n", + "--val_set_size {eval_size} \\\n", + "--max_steps {max_steps} \\\n", + "--sequence_len {max_sequence_length} \\\n", + "--train_on_inputs False \\\n", + "--sample_packing {sample_packing} \\\n", + "--pad_to_sequence_len True \\\n", + "--num_epochs {num_epochs} \\\n", + "--micro_batch_size {micro_batch_size} \\\n", + "--learning_rate {learning_rate} \\\n", + "--warmup_ratio 0.1 \\\n", + "--gradient_accumulation_steps 4 \\\n", + "--early_stopping_patience 10 \\\n", + "--adapter {adapter} \\\n", + "--lora_target_linear {lora_target_linear} \\\n", + "--lora_target_modules {lora_target_modules} \\\n", "--lora_r {lora_r} \\\n", "--lora_alpha {lora_alpha} \\\n", "--lora_dropout 0.05 \\\n", - "--lora_bias none \\\n", - "--num_train_epochs {num_train_epochs} \\\n", - "--early_stopping_patience 10 \\\n", - "--early_stopping_threshold 0.0 
\\\n", - "--auto_find_batch_size false \\\n", - "--per_device_train_batch_size {per_device_train_batch_size} \\\n", - "--per_device_eval_batch_size {per_device_train_batch_size} \\\n", - "--gradient_accumulation_steps 4 \\\n", - "--learning_rate {learning_rate} \\\n", - "--logging_strategy steps \\\n", "--logging_steps 5 \\\n", "--evaluation_strategy steps \\\n", "--eval_steps {eval_steps} \\\n", "--save_strategy steps \\\n", "--save_steps {save_steps} \\\n", + "--seed 42 \\\n", "--mlfoundry_enable_reporting {mlfoundry_enable_reporting} \\\n", "--mlfoundry_ml_repo {mlfoundry_ml_repo} \\\n", "--mlfoundry_run_name {mlfoundry_run_name} \\\n", "--mlfoundry_checkpoint_artifact_name {mlfoundry_checkpoint_artifact_name} \\\n", "--mlfoundry_log_checkpoints {mlfoundry_log_checkpoints} \\\n", - "--cleanup_output_dir_on_start False \\\n", + "--cleanup_output_dir_on_start {cleanup_output_dir_on_start} \\\n", "--resume_from_checkpoint True \\\n", "| tee train.log\n", "\"\"\"\n", @@ -382,7 +402,7 @@ }, "outputs": [], "source": [ - "!{COMMAND} " + "!{COMMAND}" ] } ], diff --git a/mlfoundry_utils.py b/mlfoundry_utils.py index 6ff9a39..5aafc69 100644 --- a/mlfoundry_utils.py +++ b/mlfoundry_utils.py @@ -184,12 +184,12 @@ def sanitize_name(value): return re.sub(rf"[{re.escape(string.punctuation)}]+", "-", value.encode("ascii", "ignore").decode("utf-8")) -def generate_run_name(model_id): +def generate_run_name(model_id, seed: Optional[int] = None): *_, model_name = model_id.split("/", 1) sanitized_model_name = sanitize_name(model_name) alphabet = string.ascii_lowercase + string.digits - random.choices(alphabet, k=8) - random_id = "".join(random.choices(alphabet, k=6)) + rng = random.Random(seed) if seed is not None else random + random_id = "".join(rng.choices(alphabet, k=6)) run_name = f"ft-{sanitized_model_name}-{random_id}" return run_name diff --git a/utils.py b/utils.py index c2dfa15..ed6c819 100644 --- a/utils.py +++ b/utils.py @@ -110,7 +110,7 @@ class Config: extra = "ignore" 
model_id: str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" - max_length: Optional[int] = None + max_length: Optional[int] = 2048 batch_size: int = 1