From cf6920f26dc9613e91d80290c6d31c321354da24 Mon Sep 17 00:00:00 2001 From: phoebeklett Date: Tue, 6 Feb 2024 16:52:12 -0500 Subject: [PATCH 01/12] Add first laplace example files for lora --- examples/laplace-lora/load.py | 84 +++++ examples/laplace-lora/yelp_laplace_lora.ipynb | 301 ++++++++++++++++++ 2 files changed, 385 insertions(+) create mode 100644 examples/laplace-lora/load.py create mode 100644 examples/laplace-lora/yelp_laplace_lora.ipynb diff --git a/examples/laplace-lora/load.py b/examples/laplace-lora/load.py new file mode 100644 index 00000000..13c352be --- /dev/null +++ b/examples/laplace-lora/load.py @@ -0,0 +1,84 @@ +from functools import partial +from datasets import load_dataset +from optree import tree_map, tree_reduce +import torch +from torch.utils.data import DataLoader +from torch.distributions import Categorical +from transformers import AutoTokenizer, AutoModelForCausalLM + +from uqlib import model_to_function + + +# From https://huggingface.co/docs/transformers/training#train-in-native-pytorch + + +def load_dataloaders(small=False, batch_size=8): + dataset = load_dataset("yelp_review_full") + + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") + tokenizer.pad_token = tokenizer.eos_token + + def tokenize_function(examples): + return tokenizer(examples["text"], padding="max_length", max_length=50, truncation=True) + + tokenized_datasets = dataset.map(tokenize_function, batched=True) + tokenized_datasets['train'] = tokenized_datasets['train'].add_column('labels', tokenized_datasets['train']['input_ids']) + tokenized_datasets['test'] = tokenized_datasets['test'].add_column('labels', tokenized_datasets['test']['input_ids']) + + tokenized_datasets = tokenized_datasets.remove_columns(["text", "label"]) + tokenized_datasets.set_format("torch") + + if small: + train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) + eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) + else: + train_dataset = tokenized_datasets["train"] + eval_dataset = tokenized_datasets["test"] + + train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size) + eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size) + + return train_dataloader, eval_dataloader + + +def load_model( + prior_sd=1.0, + num_data=None, + per_sample=False, +): + model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-2-7b-hf", + ) + + model_func = model_to_function(model) + + def categorical_log_likelihood(labels, logits): + return Categorical(logits=logits, validate_args=False).log_prob(labels) + + def univariate_normal_log_prob(x, mean, sd): + return -0.5 * ((x - mean) / sd) ** 2 + + def normal_log_prior(p) -> float: + per_group_vals = tree_map( + lambda p: univariate_normal_log_prob(p, 0, prior_sd).sum(), p + ) + return tree_reduce(torch.add, per_group_vals) + + def param_to_log_posterior_per_sample(p, batch, num_data) -> torch.tensor: + output = model_func(p, **batch) + return ( + categorical_log_likelihood(batch["labels"], output.logits) + ) + normal_log_prior(p) / num_data, output + + if per_sample: + param_to_log_posterior = param_to_log_posterior_per_sample + else: + + def param_to_log_posterior(p, batch, num_data) -> float: + log_probs, aux = param_to_log_posterior_per_sample(p, batch, num_data) + return log_probs.mean(), aux + + if num_data is not None: + param_to_log_posterior = partial(param_to_log_posterior, num_data=num_data) + + return model, param_to_log_posterior diff --git 
a/examples/laplace-lora/yelp_laplace_lora.ipynb b/examples/laplace-lora/yelp_laplace_lora.ipynb new file mode 100644 index 00000000..ed890217 --- /dev/null +++ b/examples/laplace-lora/yelp_laplace_lora.ipynb @@ -0,0 +1,301 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Post-hoc Laplace approx to LoRA parameters at model checkpoints theta_MAP obtained from standard fine-tuning. \n", + "\n", + "Done below: 1 checkpoint of the Laplace approx to the posterior of the fine-tuned parameters\n", + "To do: Change fine tuning to LoRA fine tuning, loop and eval on benchmark using both analytical and empirical methods" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from tqdm.auto import tqdm\n", + "from torch.optim import AdamW\n", + "from transformers import get_scheduler\n", + "from optree import tree_map_, tree_map\n", + "import pickle\n", + "import matplotlib.pyplot as plt\n", + "\n", + "import uqlib\n", + "\n", + "from load import load_dataloaders, load_model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load data\n", + "train_dataloader, eval_dataloader = load_dataloaders(small=True, batch_size=32)\n", + "num_data = len(train_dataloader.dataset)\n", + "print(\"Training data size: \", num_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load model (with standard Gaussian prior)\n", + "model, param_to_log_posterior = load_model(num_data=num_data, prior_sd=1e3)\n", + "\n", + "# Turn off Dropout\n", + "model.eval()\n", + "\n", + "# Load to GPU\n", + "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", + "model.to(device);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Only train the last layer\n", + "for name, param in model.named_parameters():\n", + " if 'models.layers.31.self_attn' not in name:\n", + " param.requires_grad = False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Extract only the parameters to be trained\n", + "sub_params, sub_param_to_log_posterior = uqlib.extract_requires_grad_and_func(dict(model.named_parameters()), param_to_log_posterior)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Store initial values of sub_params to check against later\n", + "init_sub_params = tree_map(lambda x: x.detach().clone(), sub_params)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Train (as usual, using native PyTorch) for MAP\n", + "optimizer = AdamW(sub_params.values(), lr=1e-3, maximize=True)\n", + "\n", + "num_epochs = 30\n", + "num_training_steps = num_epochs * len(train_dataloader)\n", + "lr_scheduler = get_scheduler(\n", + " name=\"linear\",\n", + " optimizer=optimizer,\n", + " num_warmup_steps=0,\n", + " num_training_steps=num_training_steps,\n", + ")\n", + "\n", + "\n", + "progress_bar = tqdm(range(num_training_steps))\n", + "\n", + "log_posts = []\n", + "\n", + "# model.train()\n", + "for epoch in range(num_epochs):\n", + " for batch in train_dataloader: \n", + " batch = {k: v.to(device) for k, v in batch.items()}\n", + "\n", + " log_post, out = sub_param_to_log_posterior(sub_params, batch)\n", + "\n", + 
" log_post.backward()\n", + " log_posts.append(log_post.item())\n", + " \n", + " print(log_posts[-1], end='\\r')\n", + "\n", + " optimizer.step()\n", + " lr_scheduler.step()\n", + " optimizer.zero_grad()\n", + " progress_bar.update(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Plot convergence\n", + "plt.plot(log_posts);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize trained sub_params vs their initial values\n", + "final_sub_params = tree_map(lambda p: p.detach().clone(), dict(model.named_parameters()))\n", + "\n", + "init_untrained_params = torch.cat([v.flatten() for k, v in init_sub_params.items() if 'bert' not in k])\n", + "final_untrained_params = torch.cat([v.flatten() for k, v in final_sub_params.items() if 'bert' not in k])\n", + "\n", + "plt.hist(init_untrained_params.cpu().numpy(), bins=100, alpha=0.5, label='Init', density=True)\n", + "plt.hist(final_untrained_params.cpu().numpy(), bins=100, alpha=0.5, label='Final', density=True)\n", + "plt.legend();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Jacobian requires more memory, so we'll use a smaller batch size for the Laplace approximation\n", + "laplace_train_dataloader, _ = load_dataloaders(small=True, batch_size=8)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Use uqlib for diagonal Fisher information covariance matrix\n", + "laplace_approx_transform = uqlib.laplace.diag_fisher.build(sub_param_to_log_posterior)\n", + "laplace_state = laplace_approx_transform.init(sub_params)\n", + "\n", + "for batch in tqdm(laplace_train_dataloader):\n", + " batch = {k: v.to(device) for k, v in batch.items()}\n", + " laplace_state = laplace_approx_transform.update(\n", + " laplace_state, batch\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save state\n", + "laplace_state = tree_map_(lambda x: x.detach().cpu(), laplace_state)\n", + "pickle.dump(laplace_state, open(\"yelp_laplace_state.pkl\", \"wb\"))\n", + "\n", + "# laplace_state = pickle.load(open(\"yelp_laplace_state.pkl\", \"rb\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize the standard deviations of the Laplace approximation\n", + "prec_diag = torch.cat([v.detach().cpu().flatten() for v in laplace_state.prec_diag.values()]).numpy()\n", + "sd_diag = prec_diag ** -0.5\n", + "\n", + "plt.hist(sd_diag, bins=100, density=True);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = load_dataset(\"yelp_review_full\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoTokenizer\n", + "tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Llama-2-7b-hf\")\n", + "\n", + "def tokenize_function(examples):\n", + " return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokenized_datasets = 
dataset.map(tokenize_function, batched=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokenized_datasets['train'].add_column('targets', tokenized_datasets['train']['input_ids'], inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokenized_datasets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From ef574c94972acbdcec261573b5036612aa1eb38f Mon Sep 17 00:00:00 2001 From: phoebeklett Date: Wed, 7 Feb 2024 14:48:30 -0500 Subject: [PATCH 02/12] Add LoRA to training loop, use Guanco dataset --- examples/laplace-lora/load.py | 78 +++++++++++--- examples/laplace-lora/yelp_laplace_lora.ipynb | 100 +++++------------- 2 files changed, 88 insertions(+), 90 deletions(-) diff --git a/examples/laplace-lora/load.py b/examples/laplace-lora/load.py index 13c352be..7fd27aaf 100644 --- a/examples/laplace-lora/load.py +++ b/examples/laplace-lora/load.py @@ -1,10 +1,14 @@ from functools import partial +from itertools import groupby +import numpy as np +import regex as re from datasets import load_dataset from optree import tree_map, tree_reduce import torch +from torch.nn import CrossEntropyLoss from torch.utils.data import DataLoader -from torch.distributions import Categorical from transformers import AutoTokenizer, AutoModelForCausalLM +from peft import LoraConfig, TaskType, get_peft_model from uqlib import model_to_function @@ -13,27 +17,25 @@ def load_dataloaders(small=False, batch_size=8): - dataset = load_dataset("yelp_review_full") + dataset = load_dataset("timdettmers/openassistant-guanaco") tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") tokenizer.pad_token = tokenizer.eos_token def tokenize_function(examples): - return tokenizer(examples["text"], padding="max_length", max_length=50, truncation=True) + return tokenizer( + examples["text"], padding="max_length", max_length=100, truncation=True + ) tokenized_datasets = dataset.map(tokenize_function, batched=True) - tokenized_datasets['train'] = tokenized_datasets['train'].add_column('labels', tokenized_datasets['train']['input_ids']) - tokenized_datasets['test'] = tokenized_datasets['test'].add_column('labels', tokenized_datasets['test']['input_ids']) - - tokenized_datasets = tokenized_datasets.remove_columns(["text", "label"]) tokenized_datasets.set_format("torch") + train_dataset = tokenized_datasets["train"] + eval_dataset = tokenized_datasets["test"] + if small: - train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) - eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) - else: - train_dataset = tokenized_datasets["train"] - eval_dataset = tokenized_datasets["test"] + train_dataset = train_dataset.shuffle(seed=42).select(range(1000)) + eval_dataset = eval_dataset.shuffle(seed=42).select(range(1000)) train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size) eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size) 
@@ -45,15 +47,63 @@ def load_model( prior_sd=1.0, num_data=None, per_sample=False, + target_modules=None, + r=8, + alpha=32, + dropout=0.1, + verbose=False, ): model = AutoModelForCausalLM.from_pretrained( "meta-llama/Llama-2-7b-hf", ) + # only adapt W_q, W_v, W_o + # regex may not work for all models + modules = [ + re.sub("^(model\\.)*|(\\.weight)*$", "", name) + for name, _ in model.named_parameters() + if any(sub in name for sub in ["self_attn.q", "self_attn.v", "self_attn.o"]) + ] + # only adapt last layer + if target_modules == "last_layer": + modules = [ + ( + name, + np.array([int(sub) for sub in name.split(".") if sub.isdigit()]).item(), + ) + for name in modules + ] + modules = [ + [name for name, layer in list(group)] + for _, group in groupby( + sorted(modules, key=lambda x: x[-1]), key=lambda x: x[-1] + ) + ][-1] + + peft_config = LoraConfig( + task_type=TaskType.SEQ_2_SEQ_LM, + target_modules=modules, + r=r, + lora_alpha=alpha, + lora_dropout=dropout, + ) + model = get_peft_model(model, peft_config) + if verbose: + model.print_trainable_parameters() model_func = model_to_function(model) def categorical_log_likelihood(labels, logits): - return Categorical(logits=logits, validate_args=False).log_prob(labels) + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, model.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + return loss def univariate_normal_log_prob(x, mean, sd): return -0.5 * ((x - mean) / sd) ** 2 @@ -67,7 +117,7 @@ def normal_log_prior(p) -> float: def param_to_log_posterior_per_sample(p, batch, num_data) -> torch.tensor: output = model_func(p, **batch) return ( - categorical_log_likelihood(batch["labels"], output.logits) + categorical_log_likelihood(batch["input_ids"], output.logits) ) + normal_log_prior(p) / num_data, output if per_sample: diff --git a/examples/laplace-lora/yelp_laplace_lora.ipynb b/examples/laplace-lora/yelp_laplace_lora.ipynb index ed890217..ffe24eb1 100644 --- a/examples/laplace-lora/yelp_laplace_lora.ipynb +++ b/examples/laplace-lora/yelp_laplace_lora.ipynb @@ -12,9 +12,19 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/homebrew/anaconda3/envs/uq/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "[2024-02-07 13:24:33,751] torch.distributed.elastic.multiprocessing.redirects: [WARNING] NOTE: Redirects are currently not supported in Windows or MacOs.\n" + ] + } + ], "source": [ "import torch\n", "from tqdm.auto import tqdm\n", @@ -43,9 +53,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|██████████| 2/2 [00:25<00:00, 12.59s/it]\n" + ] + } + ], "source": [ "# Load model (with standard Gaussian prior)\n", "model, param_to_log_posterior = load_model(num_data=num_data, prior_sd=1e3)\n", @@ -60,19 +78,7 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Only train the last layer\n", - "for name, param in model.named_parameters():\n", - " if 'models.layers.31.self_attn' not in name:\n", - " param.requires_grad = False" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -211,64 +217,6 @@ "plt.hist(sd_diag, bins=100, density=True);" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from datasets import load_dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dataset = load_dataset(\"yelp_review_full\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from transformers import AutoTokenizer\n", - "tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Llama-2-7b-hf\")\n", - "\n", - "def tokenize_function(examples):\n", - " return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tokenized_datasets = dataset.map(tokenize_function, batched=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tokenized_datasets['train'].add_column('targets', tokenized_datasets['train']['input_ids'], inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tokenized_datasets" - ] - }, { "cell_type": "code", "execution_count": null, From 02f3d73b33a04ea7215e8c39e6d81d98a4e64086 Mon Sep 17 00:00:00 2001 From: paperspace Date: Wed, 7 Feb 2024 23:27:32 +0000 Subject: [PATCH 03/12] Update lr, vis weight changes --- examples/laplace-lora/load.py | 10 +-- examples/laplace-lora/yelp_laplace_lora.ipynb | 72 +++++++++---------- 2 files changed, 38 insertions(+), 44 deletions(-) diff --git a/examples/laplace-lora/load.py b/examples/laplace-lora/load.py index 7fd27aaf..f1d5b9dc 100644 --- a/examples/laplace-lora/load.py +++ b/examples/laplace-lora/load.py @@ -28,14 +28,15 @@ def tokenize_function(examples): ) tokenized_datasets = dataset.map(tokenize_function, batched=True) + tokenized_datasets = tokenized_datasets.remove_columns(["text"]) tokenized_datasets.set_format("torch") train_dataset = tokenized_datasets["train"] eval_dataset = tokenized_datasets["test"] if small: - train_dataset = train_dataset.shuffle(seed=42).select(range(1000)) - eval_dataset = eval_dataset.shuffle(seed=42).select(range(1000)) + 
train_dataset = train_dataset.shuffle(seed=42).select(range(100)) + eval_dataset = eval_dataset.shuffle(seed=42).select(range(100)) train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size) eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size) @@ -80,7 +81,7 @@ def load_model( ][-1] peft_config = LoraConfig( - task_type=TaskType.SEQ_2_SEQ_LM, + task_type=TaskType.CAUSAL_LM, target_modules=modules, r=r, lora_alpha=alpha, @@ -116,6 +117,7 @@ def normal_log_prior(p) -> float: def param_to_log_posterior_per_sample(p, batch, num_data) -> torch.tensor: output = model_func(p, **batch) + return ( categorical_log_likelihood(batch["input_ids"], output.logits) ) + normal_log_prior(p) / num_data, output @@ -131,4 +133,4 @@ def param_to_log_posterior(p, batch, num_data) -> float: if num_data is not None: param_to_log_posterior = partial(param_to_log_posterior, num_data=num_data) - return model, param_to_log_posterior + return model, param_to_log_posterior, modules diff --git a/examples/laplace-lora/yelp_laplace_lora.ipynb b/examples/laplace-lora/yelp_laplace_lora.ipynb index ffe24eb1..03fedd99 100644 --- a/examples/laplace-lora/yelp_laplace_lora.ipynb +++ b/examples/laplace-lora/yelp_laplace_lora.ipynb @@ -4,27 +4,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Post-hoc Laplace approx to LoRA parameters at model checkpoints theta_MAP obtained from standard fine-tuning. \n", - "\n", - "Done below: 1 checkpoint of the Laplace approx to the posterior of the fine-tuned parameters\n", - "To do: Change fine tuning to LoRA fine tuning, loop and eval on benchmark using both analytical and empirical methods" + "Post-hoc Laplace approx to LoRA parameters at model checkpoints theta_MAP obtained from standard fine-tuning." ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/homebrew/anaconda3/envs/uq/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n", - "[2024-02-07 13:24:33,751] torch.distributed.elastic.multiprocessing.redirects: [WARNING] NOTE: Redirects are currently not supported in Windows or MacOs.\n" - ] - } - ], + "outputs": [], "source": [ "import torch\n", "from tqdm.auto import tqdm\n", @@ -53,20 +40,12 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Loading checkpoint shards: 100%|██████████| 2/2 [00:25<00:00, 12.59s/it]\n" - ] - } - ], + "outputs": [], "source": [ "# Load model (with standard Gaussian prior)\n", - "model, param_to_log_posterior = load_model(num_data=num_data, prior_sd=1e3)\n", + "model, param_to_log_posterior, target_module_names = load_model(num_data=num_data, prior_sd=1e3, target_modules=\"last_layer\")\n", "\n", "# Turn off Dropout\n", "model.eval()\n", @@ -78,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -103,7 +82,7 @@ "outputs": [], "source": [ "# Train (as usual, using native PyTorch) for MAP\n", - "optimizer = AdamW(sub_params.values(), lr=1e-3, maximize=True)\n", + "optimizer = AdamW(sub_params.values(), lr=1e-5, maximize=True)\n", "\n", "num_epochs = 30\n", "num_training_steps = num_epochs * len(train_dataloader)\n", @@ -121,7 +100,7 @@ "\n", "# model.train()\n", "for epoch in range(num_epochs):\n", - " for batch in train_dataloader: \n", + " for batch in train_dataloader:\n", " batch = {k: v.to(device) for k, v in batch.items()}\n", "\n", " log_post, out = sub_param_to_log_posterior(sub_params, batch)\n", @@ -129,12 +108,11 @@ " log_post.backward()\n", " log_posts.append(log_post.item())\n", " \n", - " print(log_posts[-1], end='\\r')\n", - "\n", " optimizer.step()\n", " lr_scheduler.step()\n", " optimizer.zero_grad()\n", - " progress_bar.update(1)" + " progress_bar.update(1)\n", + " progress_bar.set_postfix(loss=log_posts[-1])" ] }, { @@ -154,13 +132,27 @@ "outputs": [], "source": [ "# Visualize trained sub_params vs their initial values\n", + "import regex as re\n", "final_sub_params = tree_map(lambda p: p.detach().clone(), dict(model.named_parameters()))\n", "\n", - "init_untrained_params = torch.cat([v.flatten() for k, v in init_sub_params.items() if 'bert' not in k])\n", - "final_untrained_params = torch.cat([v.flatten() for k, v in final_sub_params.items() if 'bert' not in k])\n", + "base = ()\n", + "final = ()\n", + "for weights_matrix in target_module_names:\n", + " W = [v for k, v in final_sub_params.items() if re.sub(\"^(base_model.model.model\\\\.)*|(\\\\.base_layer.weight)*$\", \"\", k) == weights_matrix][0]\n", + " A = [v for k, v in final_sub_params.items() if re.sub(\"^(base_model.model.model\\\\.)*|(\\\\.lora_A.default.weight)*$\", \"\", k) == weights_matrix][0]\n", + " B = [v for k, v in final_sub_params.items() if re.sub(\"^(base_model.model.model\\\\.)*|(\\\\.lora_B.default.weight)*$\", \"\", k) == weights_matrix][0]\n", + " \n", + " W_del = B @ A \n", + " W_new = W + W_del\n", + "\n", + " base += (W, )\n", + " final += (W_new,)\n", + "\n", + "base = torch.cat(base).flatten()\n", + "final = torch.cat(final).flatten()\n", "\n", - "plt.hist(init_untrained_params.cpu().numpy(), bins=100, alpha=0.5, label='Init', density=True)\n", - "plt.hist(final_untrained_params.cpu().numpy(), bins=100, alpha=0.5, label='Final', density=True)\n", + 
"plt.hist(base.cpu().numpy(), bins=100, alpha=0.5, label='Init', density=True)\n", + "plt.hist(final.cpu().numpy(), bins=100, alpha=0.5, label='Final', density=True)\n", "plt.legend();" ] }, @@ -199,9 +191,9 @@ "source": [ "# Save state\n", "laplace_state = tree_map_(lambda x: x.detach().cpu(), laplace_state)\n", - "pickle.dump(laplace_state, open(\"yelp_laplace_state.pkl\", \"wb\"))\n", + "pickle.dump(laplace_state, open(\"guanaco_laplace_state.pkl\", \"wb\"))\n", "\n", - "# laplace_state = pickle.load(open(\"yelp_laplace_state.pkl\", \"rb\"))" + "# laplace_state = pickle.load(open(\"guanaco_laplace_state.pkl\", \"rb\"))" ] }, { @@ -241,7 +233,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.7" + "version": "3.12.1" } }, "nbformat": 4, From 7476a824f69a12122527a948e345aabecb1d3220 Mon Sep 17 00:00:00 2001 From: phoebeklett Date: Thu, 8 Feb 2024 09:59:59 -0500 Subject: [PATCH 04/12] Neg loss, turn off dropout --- examples/laplace-lora/load.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/laplace-lora/load.py b/examples/laplace-lora/load.py index f1d5b9dc..0fc25e32 100644 --- a/examples/laplace-lora/load.py +++ b/examples/laplace-lora/load.py @@ -51,7 +51,7 @@ def load_model( target_modules=None, r=8, alpha=32, - dropout=0.1, + dropout=0.0, verbose=False, ): model = AutoModelForCausalLM.from_pretrained( @@ -104,6 +104,7 @@ def categorical_log_likelihood(labels, logits): # Enable model parallelism shift_labels = shift_labels.to(shift_logits.device) loss = loss_fct(shift_logits, shift_labels) + loss -= loss return loss def univariate_normal_log_prob(x, mean, sd): From d30ecc69cb051bab58a6a3004aa1e84676eb4ef6 Mon Sep 17 00:00:00 2001 From: paperspace Date: Thu, 8 Feb 2024 22:38:33 +0000 Subject: [PATCH 05/12] Add LaPlace LoRA training loop --- .gitignore | 5 +- examples/laplace-lora/load.py | 137 ---------- examples/laplace-lora/yelp_laplace_lora.ipynb | 241 ------------------ examples/lora_transformer.py | 122 +++++++++ experiments/base.py | 155 +++++++++++ experiments/laplace_lora/__init__.py | 5 + experiments/laplace_lora/configs/config.yml | 48 ++++ experiments/laplace_lora/dataset.py | 71 ++++++ experiments/laplace_lora/experiment.py | 104 ++++++++ experiments/main.py | 71 ++++++ experiments/utils/utils.py | 67 +++++ 11 files changed, 647 insertions(+), 379 deletions(-) delete mode 100644 examples/laplace-lora/load.py delete mode 100644 examples/laplace-lora/yelp_laplace_lora.ipynb create mode 100644 examples/lora_transformer.py create mode 100644 experiments/base.py create mode 100644 experiments/laplace_lora/__init__.py create mode 100644 experiments/laplace_lora/configs/config.yml create mode 100644 experiments/laplace_lora/dataset.py create mode 100644 experiments/laplace_lora/experiment.py create mode 100644 experiments/main.py diff --git a/.gitignore b/.gitignore index 9dd3e441..67e02322 100644 --- a/.gitignore +++ b/.gitignore @@ -102,4 +102,7 @@ ENV/ /site # mypy -.mypy_cache/ \ No newline at end of file +.mypy_cache/ + +# Experiment runs +experiments/runs/ \ No newline at end of file diff --git a/examples/laplace-lora/load.py b/examples/laplace-lora/load.py deleted file mode 100644 index 0fc25e32..00000000 --- a/examples/laplace-lora/load.py +++ /dev/null @@ -1,137 +0,0 @@ -from functools import partial -from itertools import groupby -import numpy as np -import regex as re -from datasets import load_dataset -from optree import tree_map, tree_reduce -import torch -from torch.nn import 
CrossEntropyLoss -from torch.utils.data import DataLoader -from transformers import AutoTokenizer, AutoModelForCausalLM -from peft import LoraConfig, TaskType, get_peft_model - -from uqlib import model_to_function - - -# From https://huggingface.co/docs/transformers/training#train-in-native-pytorch - - -def load_dataloaders(small=False, batch_size=8): - dataset = load_dataset("timdettmers/openassistant-guanaco") - - tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") - tokenizer.pad_token = tokenizer.eos_token - - def tokenize_function(examples): - return tokenizer( - examples["text"], padding="max_length", max_length=100, truncation=True - ) - - tokenized_datasets = dataset.map(tokenize_function, batched=True) - tokenized_datasets = tokenized_datasets.remove_columns(["text"]) - tokenized_datasets.set_format("torch") - - train_dataset = tokenized_datasets["train"] - eval_dataset = tokenized_datasets["test"] - - if small: - train_dataset = train_dataset.shuffle(seed=42).select(range(100)) - eval_dataset = eval_dataset.shuffle(seed=42).select(range(100)) - - train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size) - eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size) - - return train_dataloader, eval_dataloader - - -def load_model( - prior_sd=1.0, - num_data=None, - per_sample=False, - target_modules=None, - r=8, - alpha=32, - dropout=0.0, - verbose=False, -): - model = AutoModelForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", - ) - # only adapt W_q, W_v, W_o - # regex may not work for all models - modules = [ - re.sub("^(model\\.)*|(\\.weight)*$", "", name) - for name, _ in model.named_parameters() - if any(sub in name for sub in ["self_attn.q", "self_attn.v", "self_attn.o"]) - ] - # only adapt last layer - if target_modules == "last_layer": - modules = [ - ( - name, - np.array([int(sub) for sub in name.split(".") if sub.isdigit()]).item(), - ) - for name in modules - ] - modules = [ - [name for name, layer in list(group)] - for _, group in groupby( - sorted(modules, key=lambda x: x[-1]), key=lambda x: x[-1] - ) - ][-1] - - peft_config = LoraConfig( - task_type=TaskType.CAUSAL_LM, - target_modules=modules, - r=r, - lora_alpha=alpha, - lora_dropout=dropout, - ) - model = get_peft_model(model, peft_config) - if verbose: - model.print_trainable_parameters() - - model_func = model_to_function(model) - - def categorical_log_likelihood(labels, logits): - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, model.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - loss -= loss - return loss - - def univariate_normal_log_prob(x, mean, sd): - return -0.5 * ((x - mean) / sd) ** 2 - - def normal_log_prior(p) -> float: - per_group_vals = tree_map( - lambda p: univariate_normal_log_prob(p, 0, prior_sd).sum(), p - ) - return tree_reduce(torch.add, per_group_vals) - - def param_to_log_posterior_per_sample(p, batch, num_data) -> torch.tensor: - output = model_func(p, **batch) - - return ( - categorical_log_likelihood(batch["input_ids"], output.logits) - ) + normal_log_prior(p) / num_data, output - - if per_sample: - param_to_log_posterior = param_to_log_posterior_per_sample - else: - - def param_to_log_posterior(p, batch, num_data) -> 
float: - log_probs, aux = param_to_log_posterior_per_sample(p, batch, num_data) - return log_probs.mean(), aux - - if num_data is not None: - param_to_log_posterior = partial(param_to_log_posterior, num_data=num_data) - - return model, param_to_log_posterior, modules diff --git a/examples/laplace-lora/yelp_laplace_lora.ipynb b/examples/laplace-lora/yelp_laplace_lora.ipynb deleted file mode 100644 index 03fedd99..00000000 --- a/examples/laplace-lora/yelp_laplace_lora.ipynb +++ /dev/null @@ -1,241 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Post-hoc Laplace approx to LoRA parameters at model checkpoints theta_MAP obtained from standard fine-tuning." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "from tqdm.auto import tqdm\n", - "from torch.optim import AdamW\n", - "from transformers import get_scheduler\n", - "from optree import tree_map_, tree_map\n", - "import pickle\n", - "import matplotlib.pyplot as plt\n", - "\n", - "import uqlib\n", - "\n", - "from load import load_dataloaders, load_model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Load data\n", - "train_dataloader, eval_dataloader = load_dataloaders(small=True, batch_size=32)\n", - "num_data = len(train_dataloader.dataset)\n", - "print(\"Training data size: \", num_data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Load model (with standard Gaussian prior)\n", - "model, param_to_log_posterior, target_module_names = load_model(num_data=num_data, prior_sd=1e3, target_modules=\"last_layer\")\n", - "\n", - "# Turn off Dropout\n", - "model.eval()\n", - "\n", - "# Load to GPU\n", - "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", - "model.to(device);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Extract only the parameters to be trained\n", - "sub_params, sub_param_to_log_posterior = uqlib.extract_requires_grad_and_func(dict(model.named_parameters()), param_to_log_posterior)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Store initial values of sub_params to check against later\n", - "init_sub_params = tree_map(lambda x: x.detach().clone(), sub_params)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Train (as usual, using native PyTorch) for MAP\n", - "optimizer = AdamW(sub_params.values(), lr=1e-5, maximize=True)\n", - "\n", - "num_epochs = 30\n", - "num_training_steps = num_epochs * len(train_dataloader)\n", - "lr_scheduler = get_scheduler(\n", - " name=\"linear\",\n", - " optimizer=optimizer,\n", - " num_warmup_steps=0,\n", - " num_training_steps=num_training_steps,\n", - ")\n", - "\n", - "\n", - "progress_bar = tqdm(range(num_training_steps))\n", - "\n", - "log_posts = []\n", - "\n", - "# model.train()\n", - "for epoch in range(num_epochs):\n", - " for batch in train_dataloader:\n", - " batch = {k: v.to(device) for k, v in batch.items()}\n", - "\n", - " log_post, out = sub_param_to_log_posterior(sub_params, batch)\n", - "\n", - " log_post.backward()\n", - " log_posts.append(log_post.item())\n", - " \n", - " optimizer.step()\n", - " lr_scheduler.step()\n", - " optimizer.zero_grad()\n", - " 
progress_bar.update(1)\n", - " progress_bar.set_postfix(loss=log_posts[-1])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Plot convergence\n", - "plt.plot(log_posts);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Visualize trained sub_params vs their initial values\n", - "import regex as re\n", - "final_sub_params = tree_map(lambda p: p.detach().clone(), dict(model.named_parameters()))\n", - "\n", - "base = ()\n", - "final = ()\n", - "for weights_matrix in target_module_names:\n", - " W = [v for k, v in final_sub_params.items() if re.sub(\"^(base_model.model.model\\\\.)*|(\\\\.base_layer.weight)*$\", \"\", k) == weights_matrix][0]\n", - " A = [v for k, v in final_sub_params.items() if re.sub(\"^(base_model.model.model\\\\.)*|(\\\\.lora_A.default.weight)*$\", \"\", k) == weights_matrix][0]\n", - " B = [v for k, v in final_sub_params.items() if re.sub(\"^(base_model.model.model\\\\.)*|(\\\\.lora_B.default.weight)*$\", \"\", k) == weights_matrix][0]\n", - " \n", - " W_del = B @ A \n", - " W_new = W + W_del\n", - "\n", - " base += (W, )\n", - " final += (W_new,)\n", - "\n", - "base = torch.cat(base).flatten()\n", - "final = torch.cat(final).flatten()\n", - "\n", - "plt.hist(base.cpu().numpy(), bins=100, alpha=0.5, label='Init', density=True)\n", - "plt.hist(final.cpu().numpy(), bins=100, alpha=0.5, label='Final', density=True)\n", - "plt.legend();" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Jacobian requires more memory, so we'll use a smaller batch size for the Laplace approximation\n", - "laplace_train_dataloader, _ = load_dataloaders(small=True, batch_size=8)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Use uqlib for diagonal Fisher information covariance matrix\n", - "laplace_approx_transform = uqlib.laplace.diag_fisher.build(sub_param_to_log_posterior)\n", - "laplace_state = laplace_approx_transform.init(sub_params)\n", - "\n", - "for batch in tqdm(laplace_train_dataloader):\n", - " batch = {k: v.to(device) for k, v in batch.items()}\n", - " laplace_state = laplace_approx_transform.update(\n", - " laplace_state, batch\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Save state\n", - "laplace_state = tree_map_(lambda x: x.detach().cpu(), laplace_state)\n", - "pickle.dump(laplace_state, open(\"guanaco_laplace_state.pkl\", \"wb\"))\n", - "\n", - "# laplace_state = pickle.load(open(\"guanaco_laplace_state.pkl\", \"rb\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Visualize the standard deviations of the Laplace approximation\n", - "prec_diag = torch.cat([v.detach().cpu().flatten() for v in laplace_state.prec_diag.values()]).numpy()\n", - "sd_diag = prec_diag ** -0.5\n", - "\n", - "plt.hist(sd_diag, bins=100, density=True);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - 
"pygments_lexer": "ipython3", - "version": "3.12.1" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/lora_transformer.py b/examples/lora_transformer.py new file mode 100644 index 00000000..72c351ba --- /dev/null +++ b/examples/lora_transformer.py @@ -0,0 +1,122 @@ +import regex as re +import numpy as np +from itertools import groupby +from optree import tree_map, tree_reduce +import lightning as L +import torch +from torch.optim import AdamW +from transformers import AutoModelForCausalLM +from peft import LoraConfig, TaskType, get_peft_model +from ml_collections.config_dict import FrozenConfigDict + +import uqlib +from uqlib import model_to_function + + +class TransformerModule(L.LightningModule): + def __init__(self, config: FrozenConfigDict): + super().__init__() + self.automatic_optimization = False + + self.pretrained_model_name_or_path = config.pretrained_model_name_or_path + + self.prior_sd = config.prior_sd + self.per_sample = config.per_sample + + self.target_modules = config.lora_config.target_modules + self.r = config.lora_config.r + self.alpha = config.lora_config.alpha + self.dropout = config.lora_config.dropout + + model = AutoModelForCausalLM.from_pretrained( + self.pretrained_model_name_or_path + ).to(config.device) + # only adapt W_q, W_v, W_o + # regex may not work for all models + modules = [ + re.sub("^(model\\.)*|(\\.weight)*$", "", name) + for name, _ in model.named_parameters() + if any(sub in name for sub in ["self_attn.q", "self_attn.v", "self_attn.o"]) + ] + # only adapt last layer + if self.target_modules == "last_layer": + modules = [ + ( + name, + np.array( + [int(sub) for sub in name.split(".") if sub.isdigit()] + ).item(), + ) + for name in modules + ] + modules = [ + [name for name, layer in list(group)] + for _, group in groupby( + sorted(modules, key=lambda x: x[-1]), key=lambda x: x[-1] + ) + ][-1] + + peft_config = LoraConfig( + task_type=TaskType.CAUSAL_LM, + target_modules=modules, + r=self.r, + lora_alpha=self.alpha, + lora_dropout=self.dropout, + ) + + self.model = get_peft_model(model, peft_config) + self.model.print_trainable_parameters() + self.model_func = model_to_function(self.model) + + @staticmethod + def univariate_normal_log_prob(x, mean, sd): + return -0.5 * ((x - mean) / sd) ** 2 + + def normal_log_prior(self, p) -> float: + per_group_vals = tree_map( + lambda p: self.univariate_normal_log_prob(p, 0, self.prior_sd).sum(), p + ) + return tree_reduce(torch.add, per_group_vals) + + def param_to_log_posterior_per_sample(self, p, batch) -> torch.tensor: + output = self.model_func(p, labels=batch["input_ids"], **batch) + return -output.loss + self.normal_log_prior(p), output + + def def_param_to_log_posterior(self, **kwargs): + if self.per_sample: + param_to_log_posterior = self.param_to_log_posterior_per_sample + else: + + def param_to_log_posterior(p, batch) -> float: + log_probs, aux = self.param_to_log_posterior_per_sample(p, batch) + return log_probs.mean(), aux + + return param_to_log_posterior + + def configure_optimizers(self): + param_to_log_posterior = self.def_param_to_log_posterior() + + sub_params, sub_param_to_log_posterior = uqlib.extract_requires_grad_and_func( + dict(self.model.named_parameters()), param_to_log_posterior + ) + self.sub_params = sub_params + self.sub_param_to_log_posterior = sub_param_to_log_posterior + + optimizer = AdamW(sub_params.values(), lr=1e-5, maximize=True) + self.optimizer = optimizer + + return optimizer + + def training_step(self, batch, batch_idx): + batch = {k: 
v.to(self.model.device) for k, v in batch.items()} + + opt = self.optimizers() + opt.zero_grad() + + log_post, out = self.sub_param_to_log_posterior(self.sub_params, batch) + + log_post.backward() + self.log("log_post", log_post.item()) + opt.step() + + return torch.tensor(log_post.item()) diff --git a/experiments/base.py b/experiments/base.py new file mode 100644 index 00000000..972dc000 --- /dev/null +++ b/experiments/base.py @@ -0,0 +1,155 @@ +"""Base class for experiments""" + +import os +import pickle +import pandas +import torch +from typing import Optional, Union +from abc import ABC, abstractmethod +from ml_collections.config_dict import ConfigDict, FrozenConfigDict + + +class Dataset(ABC): + """ + Base class for all datasets + """ + + def __init__(self, config: FrozenConfigDict): + self.config = ConfigDict(config) # thaw config + + self.name = config.name + self.bsz = config.batch_size + + @property + @abstractmethod + def datasets( + self, + ) -> tuple[Union[torch.utils.data.Dataset, pandas.DataFrame]]: + """ + Store datasets, return train, test. + Can be torch dataset, pandas dataframe, etc. + """ + + @datasets.setter + @abstractmethod + def datasets( + self, + ): + """ + Set datasets, return train, test + """ + + +class TorchDataset(Dataset): + """ + Base class for torch datasets + """ + + def __init__(self, config: FrozenConfigDict): + super().__init__(config) + self.cache_dir = config.cache_dir + self.num_workers = config.num_workers + + @property + @abstractmethod + def dataloaders( + self, + ) -> tuple[torch.utils.data.DataLoader]: + """ + Store torch dataloaders, return train, test + """ + + @dataloaders.setter + @abstractmethod + def dataloaders( + self, + ): + """ + Set torch dataloaders, return train, test + """ + + +class Experiment(ABC): + """ + Base class for experiments + """ + + def __init__(self, config: FrozenConfigDict): + self.config = ConfigDict(config) # thaw config + self.experiment_log_dir = config.experiment_log_dir + + @property + @abstractmethod + def train_metrics( + self, + ) -> dict: + """ + Define train metrics + """ + + @train_metrics.setter + @abstractmethod + def train_metrics( + self, + ): + """ + Set train metrics + """ + + @property + @abstractmethod + def test_metrics( + self, + ) -> dict: + """ + Define test metrics + """ + + @test_metrics.setter + @abstractmethod + def test_metrics( + self, + ): + """ + Set test metrics + """ + + @abstractmethod + def train(self, dataset: Dataset) -> dict: + """ + Train model, return dictionary of train metrics. + """ + + @abstractmethod + def test(self, dataset: Dataset) -> dict: + """ + Test model, return dictionary of test metrics. 
+ """ + + @abstractmethod + def run_experiment(self, dataset: Dataset, resume: bool = None): + """ + Run experiment pipeline + """ + + def save_results( + self, results, metadata: str = None, checkpoint: Optional[int] = None + ): # To do: make more specific than pickle + """ + Save results as pickle file + """ + folder = "final" if not checkpoint else "checkpoints" + experiment_dir = os.path.join(self.experiment_log_dir, folder) + os.makedirs(experiment_dir, exist_ok=True) + result_file = os.path.join( + experiment_dir, f"results-{metadata}.pkl" if metadata else "results.pkl" + ) + with open(result_file, "wb") as f: + pickle.dump(results, f) + + def run(self, dataset: Dataset, resume: bool = None, **kwargs): + """ + Run experiment and save results + """ + results = self.run_experiment(dataset=dataset, resume=resume, **kwargs) + self.save_results(results) diff --git a/experiments/laplace_lora/__init__.py b/experiments/laplace_lora/__init__.py new file mode 100644 index 00000000..447b3d50 --- /dev/null +++ b/experiments/laplace_lora/__init__.py @@ -0,0 +1,5 @@ +"""Imports for the LoRA experiment.""" +from experiments.laplace_lora.experiment import LoRAExperiment +from experiments.laplace_lora.dataset import HuggingfaceDataset + +__all__ = ["LoRAExperiment", "HuggingfaceDataset"] diff --git a/experiments/laplace_lora/configs/config.yml b/experiments/laplace_lora/configs/config.yml new file mode 100644 index 00000000..e803102d --- /dev/null +++ b/experiments/laplace_lora/configs/config.yml @@ -0,0 +1,48 @@ +# File dirs +base_dir: &base_path "./experiments/" +logs_dir: &logs_path "./experiments/runs/laplace_lora/" +data_dir: &data_path "./experiments/laplace_lora/data/" + + + +# Model +model_config: &model_params + pretrained_model_name_or_path: "meta-llama/Llama-2-7b-hf" + prior_sd: 1.0 + num_data: 100 + per_sample: False + device: 'cuda:0' + + #LoRA + lora_config: &lora_params + target_modules: "last_layer" + r: 8 + alpha: 32 + dropout: 0.0 + +# Dataset +dataset_config: + name: "timdettmers/openassistant-guanaco" + cache_dir: *data_path + batch_size: 8 + small: True + num_workers: 0 + tokenizer_pretrained_model_name_or_path: "meta-llama/Llama-2-7b-hf" + max_length: 100 + truncation: True + inputs_key: "text" + +# Experiment +experiment_config: + experiment_name: "laplace_lora" + model_config: *model_params + train_metrics: ['training_loss'] + test_metrics: ['accuracy'] + devices: ['cuda:0'] + batch_frequency: 16 + + trainer_config: + max_epochs: 30 + # accumulate_grad_batches: 8 + accelerator: "gpu" + diff --git a/experiments/laplace_lora/dataset.py b/experiments/laplace_lora/dataset.py new file mode 100644 index 00000000..ae22b029 --- /dev/null +++ b/experiments/laplace_lora/dataset.py @@ -0,0 +1,71 @@ +"""HF Dataset""" +import torch +from ml_collections.config_dict import ConfigDict, FrozenConfigDict +from transformers import AutoTokenizer +from datasets import load_dataset + +from experiments.base import TorchDataset + + +class HuggingfaceDataset(TorchDataset): + """ + HF Dataset + """ + + def __init__(self, config: FrozenConfigDict): + super().__init__(config) + self.name = config.name + + self.tokenizer = AutoTokenizer.from_pretrained( + config["tokenizer_pretrained_model_name_or_path"] + ) + self.tokenizer.pad_token = self.tokenizer.eos_token + + self.datasets = self.config + self.dataloaders = self.config + + def tokenize_function(self, examples): + return self.tokenizer( + examples[self.config.inputs_key], + padding="max_length", + max_length=self.config.max_length, + 
truncation=self.config.truncation, + ) + + @property + def datasets(self): + return self._datasets + + @datasets.setter + def datasets(self, config: ConfigDict): + dataset = load_dataset(self.name) + + tokenized_datasets = dataset.map(self.tokenize_function, batched=True) + tokenized_datasets = tokenized_datasets.remove_columns([config.inputs_key]) + tokenized_datasets.set_format("torch") + + train_dataset = tokenized_datasets["train"] + eval_dataset = tokenized_datasets["test"] + + if self.config.small: + train_dataset = train_dataset.shuffle(seed=42).select(range(100)) + eval_dataset = eval_dataset.shuffle(seed=42).select(range(100)) + + self.trainset = train_dataset + self.testset = eval_dataset + self._datasets = self.trainset, self.testset + + @property + def dataloaders(self): + return self._dataloaders + + @dataloaders.setter + def dataloaders(self, config: ConfigDict): + self.train_dataloader = torch.utils.data.DataLoader( + self.trainset, shuffle=True, batch_size=self.config.batch_size + ) + self.test_dataloader = torch.utils.data.DataLoader( + self.testset, batch_size=self.config.batch_size + ) + + self._dataloaders = self.train_dataloader, self.test_dataloader diff --git a/experiments/laplace_lora/experiment.py b/experiments/laplace_lora/experiment.py new file mode 100644 index 00000000..0732a15a --- /dev/null +++ b/experiments/laplace_lora/experiment.py @@ -0,0 +1,104 @@ +import os +from examples.lora_transformer import TransformerModule +from lightning.pytorch.loggers import WandbLogger +from lightning.pytorch import Trainer +from lightning.pytorch.callbacks import TQDMProgressBar, ModelCheckpoint +from experiments.utils.utils import save_config +from ml_collections.config_dict import FrozenConfigDict +from experiments.base import Experiment +from experiments.laplace_lora.dataset import HuggingfaceDataset +import wandb + + +class LoRAExperiment(Experiment): + def __init__(self, config: FrozenConfigDict): + super().__init__(config) + self.test_metrics = config + self.train_metrics = config + self.devices = config["devices"] + + self.config_as_dict = self.config.to_dict() + self.wandb_logger = WandbLogger( + log_model="all", + project=config["experiment_name"], + save_dir=config["experiment_log_dir"], + ) + wandb.config = self.config_as_dict + self.config.wandb_id = self.wandb_logger._wandb_init["id"] + self.config.wandb_name = self.wandb_logger._wandb_init["name"] + save_config(self.config.to_dict(), config["experiment_log_dir"] + "/config.yml") + + self.model = TransformerModule(config.model_config) + + @property + def train_metrics(self): + return self._train_metrics + + @train_metrics.setter + def train_metrics(self, config): + metrics = {metric: [] for metric in config["train_metrics"]} + self._train_metrics = metrics + + @property + def test_metrics(self): + return self._test_metrics + + @test_metrics.setter + def test_metrics(self, config): + metrics = {metric: [] for metric in config["test_metrics"]} + self._test_metrics = metrics + + def train(self, dataset: HuggingfaceDataset, **kwargs): + callbacks = [ + TQDMProgressBar(refresh_rate=1), + ModelCheckpoint( + dirpath=f"{self.experiment_log_dir}/checkpoints/trainstep_checkpoints", + filename="{epoch:06}-{step:09}", + every_n_train_steps=self.config["batch_frequency"], + save_last=True, + verbose=True, + save_weights_only=True, + ), + ModelCheckpoint( + dirpath=f"{self.experiment_log_dir}/checkpoints", + filename="{epoch:06}", + verbose=True, + save_last=True, + save_on_train_epoch_end=True, + save_weights_only=False, + 
), + ] + trainer_kwargs = self.config_as_dict["trainer_config"] + trainer = Trainer( + **trainer_kwargs, callbacks=callbacks, logger=self.wandb_logger + ) + + resume = kwargs.get("resume", None) + train_dataset = dataset.train_dataloader + + try: + resume_ckpt = None + if resume is not None: + resume_ckpt = os.path.join(resume, "checkpoints", "last.ckpt") + trainer.fit(self.model, train_dataset, ckpt_path=resume_ckpt) + finally: + if trainer.global_rank == 0: + final_ckpt = os.path.join( + self.experiment_log_dir, "checkpoints", "last.ckpt" + ) + trainer.save_checkpoint(final_ckpt) + + def test(self, **kwargs): + """ + To implement + """ + pass + + def run_experiment( + self, dataset: HuggingfaceDataset, resume: bool = None, **kwargs + ): + """ + Run experiment + """ + results = self.train(dataset, resume=resume, **kwargs) + return results diff --git a/experiments/main.py b/experiments/main.py new file mode 100644 index 00000000..aab4732b --- /dev/null +++ b/experiments/main.py @@ -0,0 +1,71 @@ +"""Script for running experiments.""" + +import datetime +import glob +import os +from absl import app, flags +from experiments.utils.utils import ( + load_config, + save_config, + setup_log_dir, +) +from experiments.laplace_lora import LoRAExperiment, HuggingfaceDataset + +FLAGS = flags.FLAGS +flags.DEFINE_string("base", None, "Path to base config.") +flags.DEFINE_string("resume", None, "Path to resume training.") +flags.DEFINE_string("devices", None, "Devices to use.") +flags.DEFINE_boolean("verbose", False, "Whether to print non-flag arguments.") + + +def main(argv): + """ + Main function for running experiments. + """ + if FLAGS.verbose: + print("non-flag arguments:", argv) + + if FLAGS.resume is None: + assert ( + FLAGS.base is not None + ), "Configs not specified, specify at least resume or base" + config = load_config(FLAGS.base) + else: + assert os.path.exists( + FLAGS.resume + ), "Provided path to resume training does not exist" + config_paths = glob.glob(os.path.join(FLAGS.resume, "*.yaml")) + assert len(config_paths) == 1, "Too many possible configs to resume from" + config = load_config(config_paths[0]) + + timestamp = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + experiment_name = config.get("experiment_name", None) + + experiment_log_dir = setup_log_dir( + config.get("logs_dir", "logs"), + timestamp, + resume=FLAGS.resume, + experiment_name=experiment_name, + ) + if FLAGS.devices is not None: + devices_list = FLAGS.devices.split(",") + config["experiment_config"]["devices"] = devices_list + + if FLAGS.resume is None: + config["experiment_config"]["experiment_log_dir"] = experiment_log_dir + save_config( + config.to_dict(), f"{experiment_log_dir}/{os.path.basename(FLAGS.base)}" + ) + + experiment = LoRAExperiment( + config["experiment_config"] + ) ## This will CHANGE per experiment + dataset = HuggingfaceDataset( + config["dataset_config"] + ) ## This will CHANGE per experiment + + experiment.run(dataset=dataset, resume=FLAGS.resume) + + +if __name__ == "__main__": + app.run(main) diff --git a/experiments/utils/utils.py b/experiments/utils/utils.py index 99c09f42..b31a8866 100644 --- a/experiments/utils/utils.py +++ b/experiments/utils/utils.py @@ -1,6 +1,10 @@ +import os from typing import List import torch from torch import nn +from omegaconf import OmegaConf +from pytorch_lightning.utilities import rank_zero_only +from ml_collections.config_dict import FrozenConfigDict def parse_devices(devices): @@ -30,3 +34,66 @@ def load_optimizer_param_to_model(model: nn.Module, 
groups: List[List[torch.Tens for model_param, optimizer_param in zip(list(model.parameters()), optimizer_params): model_param.data = optimizer_param + + +REQUIRED_PARAMS = ["dataset_config", "experiment_config"] + + +def load_config(file: str) -> FrozenConfigDict: + """ + Load config file + """ + config = OmegaConf.load(file) + for param in REQUIRED_PARAMS: + assert param in config, f"Missing key {param} in config" + + config = FrozenConfigDict(config) + return config + + +@rank_zero_only +def save_config(conf: OmegaConf, fp: str): + """ + Save config file, only once + """ + OmegaConf.save(config=conf, f=fp) + + +@rank_zero_only +def create_log_dir(log_dir_name: str): + """ + Create log directory, only once + """ + if not os.path.exists(log_dir_name): + os.mkdir(log_dir_name) + + +def setup_log_dir( + log_dir_name: str, + timestamp: str, + resume: bool = False, + experiment_name: str = None, +) -> str: + """ + Setup log directory + """ + if resume: + return resume + + # Create parent log name + if not os.path.exists(log_dir_name): + os.mkdir(log_dir_name) + + # Create timestamp folder + log_dir_name = os.path.join(log_dir_name, timestamp) + + # Add experiment name if specified + if experiment_name is not None: + log_dir_name += f"_{experiment_name}" + + create_log_dir(log_dir_name) + + # Create checkpoints folder + create_log_dir(f"{log_dir_name}/checkpoints") + + return log_dir_name From 550abc33fa3567ea4be6c060a2add60950109443 Mon Sep 17 00:00:00 2001 From: paperspace Date: Fri, 9 Feb 2024 19:45:44 +0000 Subject: [PATCH 06/12] Add N back into updates --- examples/lora_transformer.py | 48 ++++++++------------- experiments/laplace_lora/configs/config.yml | 1 - 2 files changed, 18 insertions(+), 31 deletions(-) diff --git a/examples/lora_transformer.py b/examples/lora_transformer.py index 72c351ba..765e3057 100644 --- a/examples/lora_transformer.py +++ b/examples/lora_transformer.py @@ -1,6 +1,7 @@ import regex as re import numpy as np from itertools import groupby +from functools import partial from optree import tree_map, tree_reduce import lightning as L import torch @@ -19,9 +20,7 @@ def __init__(self, config: FrozenConfigDict): self.automatic_optimization = False self.pretrained_model_name_or_path = config.pretrained_model_name_or_path - self.prior_sd = config.prior_sd - self.per_sample = config.per_sample self.target_modules = config.lora_config.target_modules self.r = config.lora_config.r @@ -78,45 +77,34 @@ def normal_log_prior(self, p) -> float: ) return tree_reduce(torch.add, per_group_vals) - def param_to_log_posterior_per_sample(self, p, batch) -> torch.tensor: + def param_to_log_posterior(self, p, batch, num_data) -> torch.tensor: output = self.model_func(p, labels=batch["input_ids"], **batch) - return -output.loss + self.normal_log_prior(p), output - - def def_param_to_log_posterior(self, **kwargs): - if self.per_sample: - param_to_log_posterior = self.param_to_log_posterior_per_sample - else: - - def param_to_log_posterior(p, batch) -> float: - log_probs, aux = self.param_to_log_posterior_per_sample(p, batch) - return log_probs.mean(), aux + return (-output.loss) + self.normal_log_prior(p) / num_data, output - return param_to_log_posterior - - def configure_optimizers(self): - param_to_log_posterior = self.def_param_to_log_posterior() + def on_train_start(self) -> None: + param_to_log_posterior = partial( + self.param_to_log_posterior, + num_data=len(self.trainer.train_dataloader.dataset), + ) - sub_params, sub_param_to_log_posterior = uqlib.extract_requires_grad_and_func( + 
( + self.sub_params, + self.sub_param_to_log_posterior, + ) = uqlib.extract_requires_grad_and_func( dict(self.model.named_parameters()), param_to_log_posterior ) - self.sub_params = sub_params - self.sub_param_to_log_posterior = sub_param_to_log_posterior + self.opt = AdamW(self.sub_params.values(), lr=1e-5, maximize=True) - optimizer = AdamW(sub_params.values(), lr=1e-5, maximize=True) - self.optimizer = optimizer - - return optimizer + def configure_optimizers(self): + pass def training_step(self, batch, batch_idx): - batch = {k: v.to(self.model.device) for k, v in batch.items()} - - opt = self.optimizers() - opt.zero_grad() + self.opt.zero_grad() log_post, out = self.sub_param_to_log_posterior(self.sub_params, batch) - log_post.backward() + self.log("log_post", log_post.item()) - opt.step() + self.opt.step() return torch.tensor(log_post.item()) diff --git a/experiments/laplace_lora/configs/config.yml b/experiments/laplace_lora/configs/config.yml index e803102d..ffe230b4 100644 --- a/experiments/laplace_lora/configs/config.yml +++ b/experiments/laplace_lora/configs/config.yml @@ -10,7 +10,6 @@ model_config: &model_params pretrained_model_name_or_path: "meta-llama/Llama-2-7b-hf" prior_sd: 1.0 num_data: 100 - per_sample: False device: 'cuda:0' #LoRA From 5d34b14546d2a2519d5b0449cee63719f8e48e68 Mon Sep 17 00:00:00 2001 From: paperspace Date: Fri, 9 Feb 2024 20:35:35 +0000 Subject: [PATCH 07/12] Add final training params --- examples/lora_transformer.py | 4 +--- experiments/laplace_lora/configs/config.yml | 20 +++++++++----------- experiments/laplace_lora/dataset.py | 9 +++++++-- experiments/main.py | 7 +++++++ 4 files changed, 24 insertions(+), 16 deletions(-) diff --git a/examples/lora_transformer.py b/examples/lora_transformer.py index 765e3057..7577e54f 100644 --- a/examples/lora_transformer.py +++ b/examples/lora_transformer.py @@ -27,9 +27,7 @@ def __init__(self, config: FrozenConfigDict): self.alpha = config.lora_config.alpha self.dropout = config.lora_config.dropout - model = AutoModelForCausalLM.from_pretrained( - self.pretrained_model_name_or_path - ).to(config.device) + model = AutoModelForCausalLM.from_pretrained(self.pretrained_model_name_or_path) # only adapt W_q, W_v, W_o # regex may not work for all models modules = [ diff --git a/experiments/laplace_lora/configs/config.yml b/experiments/laplace_lora/configs/config.yml index ffe230b4..7f170c31 100644 --- a/experiments/laplace_lora/configs/config.yml +++ b/experiments/laplace_lora/configs/config.yml @@ -4,13 +4,11 @@ logs_dir: &logs_path "./experiments/runs/laplace_lora/" data_dir: &data_path "./experiments/laplace_lora/data/" - # Model model_config: &model_params pretrained_model_name_or_path: "meta-llama/Llama-2-7b-hf" prior_sd: 1.0 num_data: 100 - device: 'cuda:0' #LoRA lora_config: &lora_params @@ -23,11 +21,11 @@ model_config: &model_params dataset_config: name: "timdettmers/openassistant-guanaco" cache_dir: *data_path - batch_size: 8 - small: True - num_workers: 0 + batch_size: 4 + small: False + num_workers: 11 tokenizer_pretrained_model_name_or_path: "meta-llama/Llama-2-7b-hf" - max_length: 100 + max_length: 4096 truncation: True inputs_key: "text" @@ -35,13 +33,13 @@ dataset_config: experiment_config: experiment_name: "laplace_lora" model_config: *model_params - train_metrics: ['training_loss'] - test_metrics: ['accuracy'] - devices: ['cuda:0'] - batch_frequency: 16 + train_metrics: ['log_post'] + test_metrics: ['na'] + batch_frequency: 4 + seed: 2024 trainer_config: max_epochs: 30 - # accumulate_grad_batches: 8 
accelerator: "gpu" + log_every_n_steps: 10 diff --git a/experiments/laplace_lora/dataset.py b/experiments/laplace_lora/dataset.py index ae22b029..1bbb4ac3 100644 --- a/experiments/laplace_lora/dataset.py +++ b/experiments/laplace_lora/dataset.py @@ -62,10 +62,15 @@ def dataloaders(self): @dataloaders.setter def dataloaders(self, config: ConfigDict): self.train_dataloader = torch.utils.data.DataLoader( - self.trainset, shuffle=True, batch_size=self.config.batch_size + self.trainset, + shuffle=True, + batch_size=self.config.batch_size, + num_workers=self.config.num_workers, ) self.test_dataloader = torch.utils.data.DataLoader( - self.testset, batch_size=self.config.batch_size + self.testset, + batch_size=self.config.batch_size, + num_workers=self.config.num_workers, ) self._dataloaders = self.train_dataloader, self.test_dataloader diff --git a/experiments/main.py b/experiments/main.py index aab4732b..e186a8a9 100644 --- a/experiments/main.py +++ b/experiments/main.py @@ -2,6 +2,7 @@ import datetime import glob +import torch import os from absl import app, flags from experiments.utils.utils import ( @@ -17,6 +18,8 @@ flags.DEFINE_string("devices", None, "Devices to use.") flags.DEFINE_boolean("verbose", False, "Whether to print non-flag arguments.") +os.environ["TOKENIZERS_PARALLELISM"] = "false" + def main(argv): """ @@ -51,12 +54,16 @@ def main(argv): devices_list = FLAGS.devices.split(",") config["experiment_config"]["devices"] = devices_list + torch.set_float32_matmul_precision("medium") + if FLAGS.resume is None: config["experiment_config"]["experiment_log_dir"] = experiment_log_dir save_config( config.to_dict(), f"{experiment_log_dir}/{os.path.basename(FLAGS.base)}" ) + torch.manual_seed(config["experiment_config"]["seed"]) + experiment = LoRAExperiment( config["experiment_config"] ) ## This will CHANGE per experiment From 88545a41e78df2db459fb56d8b3eb4d76b4f68a4 Mon Sep 17 00:00:00 2001 From: paperspace Date: Mon, 12 Feb 2024 22:37:34 +0000 Subject: [PATCH 08/12] Reorg code in favor of more simple scripts --- experiments/base.py | 155 ------------------ experiments/laplace_lora/__init__.py | 6 +- experiments/laplace_lora/configs/config.yml | 45 ----- experiments/laplace_lora/dataset.py | 76 --------- experiments/laplace_lora/experiment.py | 104 ------------ .../laplace_lora}/lora_transformer.py | 3 +- experiments/main.py | 78 --------- experiments/run_laplace_lora.py | 113 +++++++++++++ experiments/utils/__init__.py | 2 +- experiments/utils/configs/laplace_lora.yaml | 29 ++++ experiments/utils/utils.py | 2 +- 11 files changed, 147 insertions(+), 466 deletions(-) delete mode 100644 experiments/base.py delete mode 100644 experiments/laplace_lora/configs/config.yml delete mode 100644 experiments/laplace_lora/dataset.py delete mode 100644 experiments/laplace_lora/experiment.py rename {examples => experiments/laplace_lora}/lora_transformer.py (97%) delete mode 100644 experiments/main.py create mode 100644 experiments/run_laplace_lora.py create mode 100644 experiments/utils/configs/laplace_lora.yaml diff --git a/experiments/base.py b/experiments/base.py deleted file mode 100644 index 972dc000..00000000 --- a/experiments/base.py +++ /dev/null @@ -1,155 +0,0 @@ -"""Base class for experiments""" - -import os -import pickle -import pandas -import torch -from typing import Optional, Union -from abc import ABC, abstractmethod -from ml_collections.config_dict import ConfigDict, FrozenConfigDict - - -class Dataset(ABC): - """ - Base class for all datasets - """ - - def __init__(self, config: 
FrozenConfigDict): - self.config = ConfigDict(config) # thaw config - - self.name = config.name - self.bsz = config.batch_size - - @property - @abstractmethod - def datasets( - self, - ) -> tuple[Union[torch.utils.data.Dataset, pandas.DataFrame]]: - """ - Store datasets, return train, test. - Can be torch dataset, pandas dataframe, etc. - """ - - @datasets.setter - @abstractmethod - def datasets( - self, - ): - """ - Set datasets, return train, test - """ - - -class TorchDataset(Dataset): - """ - Base class for torch datasets - """ - - def __init__(self, config: FrozenConfigDict): - super().__init__(config) - self.cache_dir = config.cache_dir - self.num_workers = config.num_workers - - @property - @abstractmethod - def dataloaders( - self, - ) -> tuple[torch.utils.data.DataLoader]: - """ - Store torch dataloaders, return train, test - """ - - @dataloaders.setter - @abstractmethod - def dataloaders( - self, - ): - """ - Set torch dataloaders, return train, test - """ - - -class Experiment(ABC): - """ - Base class for experiments - """ - - def __init__(self, config: FrozenConfigDict): - self.config = ConfigDict(config) # thaw config - self.experiment_log_dir = config.experiment_log_dir - - @property - @abstractmethod - def train_metrics( - self, - ) -> dict: - """ - Define train metrics - """ - - @train_metrics.setter - @abstractmethod - def train_metrics( - self, - ): - """ - Set train metrics - """ - - @property - @abstractmethod - def test_metrics( - self, - ) -> dict: - """ - Define test metrics - """ - - @test_metrics.setter - @abstractmethod - def test_metrics( - self, - ): - """ - Set test metrics - """ - - @abstractmethod - def train(self, dataset: Dataset) -> dict: - """ - Train model, return dictionary of train metrics. - """ - - @abstractmethod - def test(self, dataset: Dataset) -> dict: - """ - Test model, return dictionary of test metrics. 
- """ - - @abstractmethod - def run_experiment(self, dataset: Dataset, resume: bool = None): - """ - Run experiment pipeline - """ - - def save_results( - self, results, metadata: str = None, checkpoint: Optional[int] = None - ): # To do: make more specific than pickle - """ - Save results as pickle file - """ - folder = "final" if not checkpoint else "checkpoints" - experiment_dir = os.path.join(self.experiment_log_dir, folder) - os.makedirs(experiment_dir, exist_ok=True) - result_file = os.path.join( - experiment_dir, f"results-{metadata}.pkl" if metadata else "results.pkl" - ) - with open(result_file, "wb") as f: - pickle.dump(results, f) - - def run(self, dataset: Dataset, resume: bool = None, **kwargs): - """ - Run experiment and save results - """ - results = self.run_experiment(dataset=dataset, resume=resume, **kwargs) - self.save_results(results) diff --git a/experiments/laplace_lora/__init__.py b/experiments/laplace_lora/__init__.py index 447b3d50..13ea9aab 100644 --- a/experiments/laplace_lora/__init__.py +++ b/experiments/laplace_lora/__init__.py @@ -1,5 +1 @@ -"""Imports for the LoRA experiment.""" -from experiments.laplace_lora.experiment import LoRAExperiment -from experiments.laplace_lora.dataset import HuggingfaceDataset - -__all__ = ["LoRAExperiment", "HuggingfaceDataset"] +from experiments.laplace_lora.lora_transformer import TransformerModule diff --git a/experiments/laplace_lora/configs/config.yml b/experiments/laplace_lora/configs/config.yml deleted file mode 100644 index 7f170c31..00000000 --- a/experiments/laplace_lora/configs/config.yml +++ /dev/null @@ -1,45 +0,0 @@ -# File dirs -base_dir: &base_path "./experiments/" -logs_dir: &logs_path "./experiments/runs/laplace_lora/" -data_dir: &data_path "./experiments/laplace_lora/data/" - - -# Model -model_config: &model_params - pretrained_model_name_or_path: "meta-llama/Llama-2-7b-hf" - prior_sd: 1.0 - num_data: 100 - - #LoRA - lora_config: &lora_params - target_modules: "last_layer" - r: 8 - alpha: 32 - dropout: 0.0 - -# Dataset -dataset_config: - name: "timdettmers/openassistant-guanaco" - cache_dir: *data_path - batch_size: 4 - small: False - num_workers: 11 - tokenizer_pretrained_model_name_or_path: "meta-llama/Llama-2-7b-hf" - max_length: 4096 - truncation: True - inputs_key: "text" - -# Experiment -experiment_config: - experiment_name: "laplace_lora" - model_config: *model_params - train_metrics: ['log_post'] - test_metrics: ['na'] - batch_frequency: 4 - seed: 2024 - - trainer_config: - max_epochs: 30 - accelerator: "gpu" - log_every_n_steps: 10 - diff --git a/experiments/laplace_lora/dataset.py b/experiments/laplace_lora/dataset.py deleted file mode 100644 index 1bbb4ac3..00000000 --- a/experiments/laplace_lora/dataset.py +++ /dev/null @@ -1,76 +0,0 @@ -"""HF Dataset""" -import torch -from ml_collections.config_dict import ConfigDict, FrozenConfigDict -from transformers import AutoTokenizer -from datasets import load_dataset - -from experiments.base import TorchDataset - - -class HuggingfaceDataset(TorchDataset): - """ - HF Dataset - """ - - def __init__(self, config: FrozenConfigDict): - super().__init__(config) - self.name = config.name - - self.tokenizer = AutoTokenizer.from_pretrained( - config["tokenizer_pretrained_model_name_or_path"] - ) - self.tokenizer.pad_token = self.tokenizer.eos_token - - self.datasets = self.config - self.dataloaders = self.config - - def tokenize_function(self, examples): - return self.tokenizer( - examples[self.config.inputs_key], - padding="max_length", - 
max_length=self.config.max_length, - truncation=self.config.truncation, - ) - - @property - def datasets(self): - return self._datasets - - @datasets.setter - def datasets(self, config: ConfigDict): - dataset = load_dataset(self.name) - - tokenized_datasets = dataset.map(self.tokenize_function, batched=True) - tokenized_datasets = tokenized_datasets.remove_columns([config.inputs_key]) - tokenized_datasets.set_format("torch") - - train_dataset = tokenized_datasets["train"] - eval_dataset = tokenized_datasets["test"] - - if self.config.small: - train_dataset = train_dataset.shuffle(seed=42).select(range(100)) - eval_dataset = eval_dataset.shuffle(seed=42).select(range(100)) - - self.trainset = train_dataset - self.testset = eval_dataset - self._datasets = self.trainset, self.testset - - @property - def dataloaders(self): - return self._dataloaders - - @dataloaders.setter - def dataloaders(self, config: ConfigDict): - self.train_dataloader = torch.utils.data.DataLoader( - self.trainset, - shuffle=True, - batch_size=self.config.batch_size, - num_workers=self.config.num_workers, - ) - self.test_dataloader = torch.utils.data.DataLoader( - self.testset, - batch_size=self.config.batch_size, - num_workers=self.config.num_workers, - ) - - self._dataloaders = self.train_dataloader, self.test_dataloader diff --git a/experiments/laplace_lora/experiment.py b/experiments/laplace_lora/experiment.py deleted file mode 100644 index 0732a15a..00000000 --- a/experiments/laplace_lora/experiment.py +++ /dev/null @@ -1,104 +0,0 @@ -import os -from examples.lora_transformer import TransformerModule -from lightning.pytorch.loggers import WandbLogger -from lightning.pytorch import Trainer -from lightning.pytorch.callbacks import TQDMProgressBar, ModelCheckpoint -from experiments.utils.utils import save_config -from ml_collections.config_dict import FrozenConfigDict -from experiments.base import Experiment -from experiments.laplace_lora.dataset import HuggingfaceDataset -import wandb - - -class LoRAExperiment(Experiment): - def __init__(self, config: FrozenConfigDict): - super().__init__(config) - self.test_metrics = config - self.train_metrics = config - self.devices = config["devices"] - - self.config_as_dict = self.config.to_dict() - self.wandb_logger = WandbLogger( - log_model="all", - project=config["experiment_name"], - save_dir=config["experiment_log_dir"], - ) - wandb.config = self.config_as_dict - self.config.wandb_id = self.wandb_logger._wandb_init["id"] - self.config.wandb_name = self.wandb_logger._wandb_init["name"] - save_config(self.config.to_dict(), config["experiment_log_dir"] + "/config.yml") - - self.model = TransformerModule(config.model_config) - - @property - def train_metrics(self): - return self._train_metrics - - @train_metrics.setter - def train_metrics(self, config): - metrics = {metric: [] for metric in config["train_metrics"]} - self._train_metrics = metrics - - @property - def test_metrics(self): - return self._test_metrics - - @test_metrics.setter - def test_metrics(self, config): - metrics = {metric: [] for metric in config["test_metrics"]} - self._test_metrics = metrics - - def train(self, dataset: HuggingfaceDataset, **kwargs): - callbacks = [ - TQDMProgressBar(refresh_rate=1), - ModelCheckpoint( - dirpath=f"{self.experiment_log_dir}/checkpoints/trainstep_checkpoints", - filename="{epoch:06}-{step:09}", - every_n_train_steps=self.config["batch_frequency"], - save_last=True, - verbose=True, - save_weights_only=True, - ), - ModelCheckpoint( - 
dirpath=f"{self.experiment_log_dir}/checkpoints", - filename="{epoch:06}", - verbose=True, - save_last=True, - save_on_train_epoch_end=True, - save_weights_only=False, - ), - ] - trainer_kwargs = self.config_as_dict["trainer_config"] - trainer = Trainer( - **trainer_kwargs, callbacks=callbacks, logger=self.wandb_logger - ) - - resume = kwargs.get("resume", None) - train_dataset = dataset.train_dataloader - - try: - resume_ckpt = None - if resume is not None: - resume_ckpt = os.path.join(resume, "checkpoints", "last.ckpt") - trainer.fit(self.model, train_dataset, ckpt_path=resume_ckpt) - finally: - if trainer.global_rank == 0: - final_ckpt = os.path.join( - self.experiment_log_dir, "checkpoints", "last.ckpt" - ) - trainer.save_checkpoint(final_ckpt) - - def test(self, **kwargs): - """ - To implement - """ - pass - - def run_experiment( - self, dataset: HuggingfaceDataset, resume: bool = None, **kwargs - ): - """ - Run experiment - """ - results = self.train(dataset, resume=resume, **kwargs) - return results diff --git a/examples/lora_transformer.py b/experiments/laplace_lora/lora_transformer.py similarity index 97% rename from examples/lora_transformer.py rename to experiments/laplace_lora/lora_transformer.py index 7577e54f..1f6944b5 100644 --- a/examples/lora_transformer.py +++ b/experiments/laplace_lora/lora_transformer.py @@ -21,6 +21,7 @@ def __init__(self, config: FrozenConfigDict): self.pretrained_model_name_or_path = config.pretrained_model_name_or_path self.prior_sd = config.prior_sd + self.lr = config.lr self.target_modules = config.lora_config.target_modules self.r = config.lora_config.r @@ -91,7 +92,7 @@ def on_train_start(self) -> None: ) = uqlib.extract_requires_grad_and_func( dict(self.model.named_parameters()), param_to_log_posterior ) - self.opt = AdamW(self.sub_params.values(), lr=1e-5, maximize=True) + self.opt = AdamW(self.sub_params.values(), lr=self.lr, maximize=True) def configure_optimizers(self): pass diff --git a/experiments/main.py b/experiments/main.py deleted file mode 100644 index e186a8a9..00000000 --- a/experiments/main.py +++ /dev/null @@ -1,78 +0,0 @@ -"""Script for running experiments.""" - -import datetime -import glob -import torch -import os -from absl import app, flags -from experiments.utils.utils import ( - load_config, - save_config, - setup_log_dir, -) -from experiments.laplace_lora import LoRAExperiment, HuggingfaceDataset - -FLAGS = flags.FLAGS -flags.DEFINE_string("base", None, "Path to base config.") -flags.DEFINE_string("resume", None, "Path to resume training.") -flags.DEFINE_string("devices", None, "Devices to use.") -flags.DEFINE_boolean("verbose", False, "Whether to print non-flag arguments.") - -os.environ["TOKENIZERS_PARALLELISM"] = "false" - - -def main(argv): - """ - Main function for running experiments. 
- """ - if FLAGS.verbose: - print("non-flag arguments:", argv) - - if FLAGS.resume is None: - assert ( - FLAGS.base is not None - ), "Configs not specified, specify at least resume or base" - config = load_config(FLAGS.base) - else: - assert os.path.exists( - FLAGS.resume - ), "Provided path to resume training does not exist" - config_paths = glob.glob(os.path.join(FLAGS.resume, "*.yaml")) - assert len(config_paths) == 1, "Too many possible configs to resume from" - config = load_config(config_paths[0]) - - timestamp = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") - experiment_name = config.get("experiment_name", None) - - experiment_log_dir = setup_log_dir( - config.get("logs_dir", "logs"), - timestamp, - resume=FLAGS.resume, - experiment_name=experiment_name, - ) - if FLAGS.devices is not None: - devices_list = FLAGS.devices.split(",") - config["experiment_config"]["devices"] = devices_list - - torch.set_float32_matmul_precision("medium") - - if FLAGS.resume is None: - config["experiment_config"]["experiment_log_dir"] = experiment_log_dir - save_config( - config.to_dict(), f"{experiment_log_dir}/{os.path.basename(FLAGS.base)}" - ) - - torch.manual_seed(config["experiment_config"]["seed"]) - - experiment = LoRAExperiment( - config["experiment_config"] - ) ## This will CHANGE per experiment - dataset = HuggingfaceDataset( - config["dataset_config"] - ) ## This will CHANGE per experiment - - experiment.run(dataset=dataset, resume=FLAGS.resume) - - -if __name__ == "__main__": - app.run(main) diff --git a/experiments/run_laplace_lora.py b/experiments/run_laplace_lora.py new file mode 100644 index 00000000..baae4779 --- /dev/null +++ b/experiments/run_laplace_lora.py @@ -0,0 +1,113 @@ +import argparse +import os +import glob +import datetime +import torch +from lightning.pytorch import Trainer +from lightning.pytorch.loggers import WandbLogger +from datasets import load_dataset +from transformers import AutoTokenizer + +from experiments.utils import parse_devices, load_config, save_config, setup_log_dir +from experiments.laplace_lora import TransformerModule + +os.environ["TOKENIZERS_PARALLELISM"] = "false" + +parser = argparse.ArgumentParser() +parser.add_argument("--name", default=None, type=str) +parser.add_argument("--resume", default=None, type=str) +parser.add_argument("--base", default=None, type=str) +parser.add_argument("--devices", default=parse_devices, type=str) +parser.add_argument("--epochs", default=100, type=int) +parser.add_argument("--log_frequency", default=10, type=int) +parser.add_argument("--seed", default=42, type=int) + +args = parser.parse_args() + +if __name__ == "__main__": + device_type = "cpu" if callable(args.devices) else "gpu" + if args.resume is None: + assert ( + args.base is not None + ), "Configs not specified, specify at least resume or base" + config = load_config(args.base) + else: + assert os.path.exists( + args.resume + ), "Provided path to resume training does not exist" + config_paths = glob.glob(os.path.join(args.resume, "*.yaml")) + assert len(config_paths) == 1, "Too many possible configs to resume from" + config = load_config(config_paths[0]) + + timestamp = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + experiment_name = config.get("experiment_name", None) + + experiment_log_dir = setup_log_dir( + config.get("logs_dir", "logs"), + timestamp, + resume=args.resume, + experiment_name=experiment_name, + ) + + if args.resume is None: + save_config( + config.to_dict(), f"{experiment_log_dir}/{os.path.basename(args.base)}" + ) + + 
torch.set_float32_matmul_precision("medium") + torch.manual_seed(args.seed) + + trainer_kwargs = { + "max_epochs": args.epochs, + "accelerator": device_type, + "log_every_n_steps": args.log_frequency, + } + + logger = WandbLogger( + log_model="all", + project=config.get("experiment_name", ""), + save_dir=config.get("logs_dir", "logs"), + ) + + trainer = Trainer(**trainer_kwargs, logger=logger) + + tokenizer = AutoTokenizer.from_pretrained( + config.model_config.pretrained_model_name_or_path + ) + model = TransformerModule(config.model_config) + + dataset = load_dataset(config.dataset_name) + + def tokenize_function(examples): + return tokenizer(examples["text"], padding="max_length", truncation=True) + + tokenized_datasets = dataset.map(tokenize_function, batched=True) + tokenized_datasets = tokenized_datasets.remove_columns([config.inputs_key]) + tokenized_datasets.set_format("torch") + + if config.small: + train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(100)) + eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(100)) + + train_dataloader = torch.utils.data.DataLoader( + train_dataset, + shuffle=True, + batch_size=config.batch_size, + num_workers=config.num_workers, + ) + + test_dataloader = torch.utils.data.DataLoader( + eval_dataset, + batch_size=config.batch_size, + num_workers=config.num_workers, + ) + + try: + resume_ckpt = None + if args.resume is not None: + resume_ckpt = os.path.join(args.resume, "checkpoints", "last.ckpt") + trainer.fit(model, train_dataloader, ckpt_path=resume_ckpt) + finally: + if trainer.global_rank == 0: + final_ckpt = os.path.join(experiment_log_dir, "checkpoints", "last.ckpt") + trainer.save_checkpoint(final_ckpt) diff --git a/experiments/utils/__init__.py b/experiments/utils/__init__.py index 499b2f03..b51202d9 100644 --- a/experiments/utils/__init__.py +++ b/experiments/utils/__init__.py @@ -1 +1 @@ -from .utils import parse_devices +from .utils import parse_devices, load_config, save_config, setup_log_dir diff --git a/experiments/utils/configs/laplace_lora.yaml b/experiments/utils/configs/laplace_lora.yaml new file mode 100644 index 00000000..818df614 --- /dev/null +++ b/experiments/utils/configs/laplace_lora.yaml @@ -0,0 +1,29 @@ +# File dirs +base_dir: &base_path "./experiments/" +logs_dir: &logs_path "./experiments/runs/laplace_lora/" +data_dir: &data_path "./experiments/laplace_lora/data/" + +experiment_name: "laplace_lora" + +# Model +model_config: &model_params + pretrained_model_name_or_path: "meta-llama/Llama-2-7b-hf" + prior_sd: 1.0 + lr: 0.00001 + + #LoRA + lora_config: &lora_params + target_modules: "last_layer" + r: 8 + alpha: 32 + dropout: 0.0 + +# Dataset +dataset_name: "timdettmers/openassistant-guanaco" +batch_size: 4 +small: True +num_workers: 11 +tokenizer_pretrained_model_name_or_path: "meta-llama/Llama-2-7b-hf" +max_length: 4096 +truncation: True +inputs_key: "text" diff --git a/experiments/utils/utils.py b/experiments/utils/utils.py index b31a8866..494753d5 100644 --- a/experiments/utils/utils.py +++ b/experiments/utils/utils.py @@ -36,7 +36,7 @@ def load_optimizer_param_to_model(model: nn.Module, groups: List[List[torch.Tens model_param.data = optimizer_param -REQUIRED_PARAMS = ["dataset_config", "experiment_config"] +REQUIRED_PARAMS = ["dataset_config", "model_config"] def load_config(file: str) -> FrozenConfigDict: From 70e27edccb013235a59c462d35c63e30bd50cd36 Mon Sep 17 00:00:00 2001 From: paperspace Date: Tue, 13 Feb 2024 17:28:18 +0000 Subject: [PATCH 09/12] Add Eval code for LoRA 
--- experiments/eval_lora_transformer.py | 164 +++++++++++++++++++++++++++ experiments/run_laplace_lora.py | 22 +++- experiments/utils/utils.py | 2 +- 3 files changed, 181 insertions(+), 7 deletions(-) create mode 100644 experiments/eval_lora_transformer.py diff --git a/experiments/eval_lora_transformer.py b/experiments/eval_lora_transformer.py new file mode 100644 index 00000000..7ceec0b7 --- /dev/null +++ b/experiments/eval_lora_transformer.py @@ -0,0 +1,164 @@ +import torch +import wandb +from tqdm import tqdm +import pickle +from omegaconf import OmegaConf +import os +from ml_collections.config_dict import ConfigDict +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer + +from experiments.laplace_lora import TransformerModule +from experiments.utils.utils import load_config + + +def evaluate(model, tuned, dataset): + results = [] + avg_nlls_base = () + avg_ppls_base = () + avg_nlls_tuned = () + avg_ppls_tuned = () + for idx, sample in tqdm(enumerate(dataset)): + input_ids = sample["input_ids"].unsqueeze(0) + max_input_length = model.config.max_position_embeddings + + sample_nlls_base = [] + sample_nlls_tuned = [] + prev_end_loc = 0 + seq_len = input_ids.size(1) + for begin_loc in range(0, seq_len, 512): + end_loc = min(begin_loc + max_input_length, seq_len) + subseq = input_ids[:, begin_loc:end_loc] + targets = subseq.clone() + trg_len = end_loc - prev_end_loc + targets[:, :-trg_len] = -100 + + with torch.no_grad(): + output_base = model( + input_ids=subseq.to(model.device), + labels=targets, + ) + sample_nlls_base.append(output_base.loss) + + output_tuned = tuned.model( + input_ids=subseq.to(tuned.model.device), + labels=targets, + ) + sample_nlls_tuned.append(output_tuned.loss) + + prev_end_loc = end_loc + if end_loc == seq_len: + break + + sample_nlls_base = torch.tensor(sample_nlls_base) + sample_ppls_base = torch.exp(sample_nlls_base) + + sample_avg_nll_base = torch.mean(sample_nlls_base) + sample_avg_ppl_base = torch.mean(sample_ppls_base) + wandb.log({"sample_avg_nll_base": sample_avg_nll_base}) + wandb.log({"sample_avg_ppl_base": sample_avg_ppl_base}) + + sample_nlls_tuned = torch.tensor(sample_nlls_tuned) + sample_ppls_tuned = torch.exp(sample_nlls_tuned) + + sample_avg_nll_tuned = torch.mean(sample_nlls_tuned) + sample_avg_ppl_tuned = torch.mean(sample_ppls_tuned) + wandb.log({"sample_avg_nll_tuned": sample_avg_nll_tuned}) + wandb.log({"sample_avg_ppl_tuned": sample_avg_ppl_tuned}) + + results += [ + { + "idx": idx, + "input_ids": sample["input_ids"], + "nlls_base": sample_nlls_base, + "nlls_tuned": sample_nlls_tuned, + "ppls_base": sample_ppls_base, + "ppls_tuned": sample_ppls_tuned, + "avg_nll_base": sample_avg_nll_base, + "avg_ppl_base": sample_avg_ppl_base, + "avg_nll_tuned": sample_avg_nll_tuned, + "avg_ppl_tuned": sample_avg_ppl_tuned, + } + ] + + avg_nlls_base += (sample_avg_nll_base,) + avg_ppls_base += (sample_avg_ppl_base,) + + avg_nlls_tuned += (sample_avg_nll_tuned,) + avg_ppls_tuned += (sample_avg_ppl_tuned,) + + avg_nll_base = torch.mean(torch.tensor(avg_nlls_base)) + avg_ppl_base = torch.mean(torch.tensor(avg_ppls_base)) + + avg_nll_tuned = torch.mean(torch.tensor(avg_nlls_tuned)) + avg_ppl_tuned = torch.mean(torch.tensor(avg_ppls_tuned)) + + wandb.log({"Avg NLL, Base Model": avg_nll_base}) + wandb.log({"Avg PPL, Base Model": avg_ppl_base}) + + wandb.log({"Avg NLL, Tuned Model": avg_nll_tuned}) + wandb.log({"Avg PPL, Tuned Model": avg_ppl_tuned}) + + return results + + +DATETIME = "" +EXPERIMENT_LOG_DIR = 
f"./experiments/runs/laplace_lora/{DATETIME}_laplace_lora" +os.environ["TOKENIZERS_PARALLELISM"] = "false" + +if __name__ == "__main__": + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + LORA_WEIGHTS = EXPERIMENT_LOG_DIR + "/checkpoints/last.ckpt" + CONFIG = EXPERIMENT_LOG_DIR + "/laplace_lora.yaml" + config = ConfigDict(load_config(CONFIG)) + + model_tuned = TransformerModule.load_from_checkpoint( + LORA_WEIGHTS, config=config["model_config"] + ).to(device) + model = AutoModelForCausalLM.from_pretrained( + config.model_config.pretrained_model_name_or_path + ).to(device) + print("Weights loaded successfully!") + + wandb.init( + project=config["experiment_name"], + dir=config.get("logs_dir", "logs"), + ) + config.wandb_id_eval = wandb.run.id + config.wandb_name_eval = wandb.run.name + + OmegaConf.save( + config=config.to_dict(), + f=EXPERIMENT_LOG_DIR + f"/{config['experiment_name']}.yaml", + ) + + dataset = load_dataset(config.dataset_name) + tokenizer = AutoTokenizer.from_pretrained( + config.tokenizer_pretrained_model_name_or_path + ) + tokenizer.pad_token = tokenizer.eos_token + + def tokenize_function(examples): + return tokenizer( + examples["text"], + padding="max_length", + max_length=config.max_length, + truncation=True, + ) + + tokenized_datasets = dataset.map(tokenize_function, batched=True) + tokenized_datasets = tokenized_datasets.remove_columns([config.inputs_key]) + tokenized_datasets.set_format("torch") + + train_dataset = tokenized_datasets["train"] + eval_dataset = tokenized_datasets["test"] + + if config.small: + train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(100)) + eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(100)) + + results = evaluate(model, model_tuned, eval_dataset) + + result_file = os.path.join(EXPERIMENT_LOG_DIR, "results-eval.pkl") + with open(result_file, "wb") as f: + pickle.dump(results, f) diff --git a/experiments/run_laplace_lora.py b/experiments/run_laplace_lora.py index baae4779..4c711291 100644 --- a/experiments/run_laplace_lora.py +++ b/experiments/run_laplace_lora.py @@ -49,11 +49,6 @@ experiment_name=experiment_name, ) - if args.resume is None: - save_config( - config.to_dict(), f"{experiment_log_dir}/{os.path.basename(args.base)}" - ) - torch.set_float32_matmul_precision("medium") torch.manual_seed(args.seed) @@ -68,23 +63,38 @@ project=config.get("experiment_name", ""), save_dir=config.get("logs_dir", "logs"), ) + config["wandb_name"] = logger.experiment.name + config["wandb_id"] = logger.experiment.id + + if args.resume is None: + save_config( + config.to_dict(), f"{experiment_log_dir}/{os.path.basename(args.base)}" + ) trainer = Trainer(**trainer_kwargs, logger=logger) tokenizer = AutoTokenizer.from_pretrained( config.model_config.pretrained_model_name_or_path ) + tokenizer.pad_token = tokenizer.eos_token model = TransformerModule(config.model_config) dataset = load_dataset(config.dataset_name) def tokenize_function(examples): - return tokenizer(examples["text"], padding="max_length", truncation=True) + return tokenizer( + examples["text"], + padding="max_length", + max_length=config.max_length, + truncation=True, + ) tokenized_datasets = dataset.map(tokenize_function, batched=True) tokenized_datasets = tokenized_datasets.remove_columns([config.inputs_key]) tokenized_datasets.set_format("torch") + train_dataset = tokenized_datasets["train"] + eval_dataset = tokenized_datasets["test"] if config.small: train_dataset = 
tokenized_datasets["train"].shuffle(seed=42).select(range(100)) eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(100)) diff --git a/experiments/utils/utils.py b/experiments/utils/utils.py index 494753d5..09d87a7e 100644 --- a/experiments/utils/utils.py +++ b/experiments/utils/utils.py @@ -36,7 +36,7 @@ def load_optimizer_param_to_model(model: nn.Module, groups: List[List[torch.Tens model_param.data = optimizer_param -REQUIRED_PARAMS = ["dataset_config", "model_config"] +REQUIRED_PARAMS = ["model_config", "experiment_name"] def load_config(file: str) -> FrozenConfigDict: From d859ca6000915cd1218c276b099e7bc46b944fee Mon Sep 17 00:00:00 2001 From: paperspace Date: Tue, 13 Feb 2024 22:41:50 +0000 Subject: [PATCH 10/12] Small config change --- experiments/run_laplace_lora.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/experiments/run_laplace_lora.py b/experiments/run_laplace_lora.py index 4c711291..94014ff2 100644 --- a/experiments/run_laplace_lora.py +++ b/experiments/run_laplace_lora.py @@ -7,6 +7,7 @@ from lightning.pytorch.loggers import WandbLogger from datasets import load_dataset from transformers import AutoTokenizer +from ml_collections import ConfigDict from experiments.utils import parse_devices, load_config, save_config, setup_log_dir from experiments.laplace_lora import TransformerModule @@ -58,6 +59,9 @@ "log_every_n_steps": args.log_frequency, } + model = TransformerModule(config.model_config) + + config = ConfigDict(config) # thaw logger = WandbLogger( log_model="all", project=config.get("experiment_name", ""), @@ -66,6 +70,10 @@ config["wandb_name"] = logger.experiment.name config["wandb_id"] = logger.experiment.id + config["epochs"] = args.epochs + config["log_frequency"] = args.log_frequency + config["seed"] = args.seed + if args.resume is None: save_config( config.to_dict(), f"{experiment_log_dir}/{os.path.basename(args.base)}" From f61e9ac7c2a49e2b5276e1df93eb302adc01ea78 Mon Sep 17 00:00:00 2001 From: paperspace Date: Wed, 14 Feb 2024 14:17:13 +0000 Subject: [PATCH 11/12] Clean up select modules, remove redundancies --- experiments/laplace_lora/lora_transformer.py | 39 ++++++++++---------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/experiments/laplace_lora/lora_transformer.py b/experiments/laplace_lora/lora_transformer.py index 1f6944b5..e6f62903 100644 --- a/experiments/laplace_lora/lora_transformer.py +++ b/experiments/laplace_lora/lora_transformer.py @@ -1,5 +1,3 @@ -import regex as re -import numpy as np from itertools import groupby from functools import partial from optree import tree_map, tree_reduce @@ -31,28 +29,29 @@ def __init__(self, config: FrozenConfigDict): model = AutoModelForCausalLM.from_pretrained(self.pretrained_model_name_or_path) # only adapt W_q, W_v, W_o # regex may not work for all models - modules = [ - re.sub("^(model\\.)*|(\\.weight)*$", "", name) - for name, _ in model.named_parameters() - if any(sub in name for sub in ["self_attn.q", "self_attn.v", "self_attn.o"]) + + WEIGHTS_TO_LORA = ["q_proj", "v_proj", "o_proj"] + + modules = list(model.model.layers.named_parameters()) + module_names_with_layer = [ + (name.split(".")[0], f'layer.{name.strip('.weight')}') + for name, param in modules + if any( + sub in name + for sub in [ + "self_attn.{sub}".format(sub=sub) for sub in WEIGHTS_TO_LORA + ] + ) ] + # only adapt last layer if self.target_modules == "last_layer": modules = [ - ( - name, - np.array( - [int(sub) for sub in name.split(".") if sub.isdigit()] - ).item(), - ) - for name in 
modules - ] - modules = [ - [name for name, layer in list(group)] - for _, group in groupby( - sorted(modules, key=lambda x: x[-1]), key=lambda x: x[-1] - ) + [layer for name, layer in list(group)] + for _, group in groupby(module_names_with_layer, key=lambda x: x[0]) ][-1] + else: + modules = [name for layer, name in module_names_with_layer] peft_config = LoraConfig( task_type=TaskType.CAUSAL_LM, @@ -106,4 +105,4 @@ def training_step(self, batch, batch_idx): self.log("log_post", log_post.item()) self.opt.step() - return torch.tensor(log_post.item()) + return log_post From 1e01e0c9f41fe5d88a8d82063af0a9a8643d3599 Mon Sep 17 00:00:00 2001 From: paperspace Date: Wed, 14 Feb 2024 15:09:19 +0000 Subject: [PATCH 12/12] Add comments --- experiments/laplace_lora/lora_transformer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/experiments/laplace_lora/lora_transformer.py b/experiments/laplace_lora/lora_transformer.py index e6f62903..db067d47 100644 --- a/experiments/laplace_lora/lora_transformer.py +++ b/experiments/laplace_lora/lora_transformer.py @@ -33,6 +33,7 @@ def __init__(self, config: FrozenConfigDict): WEIGHTS_TO_LORA = ["q_proj", "v_proj", "o_proj"] modules = list(model.model.layers.named_parameters()) + # Get layer index, name for layers to adapt module_names_with_layer = [ (name.split(".")[0], f'layer.{name.strip('.weight')}') for name, param in modules @@ -44,7 +45,7 @@ def __init__(self, config: FrozenConfigDict): ) ] - # only adapt last layer + # Subset of layers to adapt if self.target_modules == "last_layer": modules = [ [layer for name, layer in list(group)]
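
Note on the module selection introduced in PATCH 11/12 and commented in PATCH 12/12: the committed expression f'layer.{name.strip('.weight')}' leans on two fragile details. Nested same-type quotes inside an f-string only parse on Python 3.12 and later, and str.strip removes a set of characters rather than the literal ".weight" suffix (it happens to behave for LLaMA parameter names such as "0.self_attn.q_proj.weight"). The sketch below is a minimal, self-contained illustration of the same selection logic, not the committed implementation: it assumes Python 3.9+ for str.removesuffix, the example_names list is a hypothetical stand-in for model.model.layers.named_parameters() on a LLaMA-style model, and the "layers." prefix is an assumption about how the adapted module paths might read (the patches themselves use "layer.").

from itertools import groupby

WEIGHTS_TO_LORA = ["q_proj", "v_proj", "o_proj"]

# Hypothetical parameter names standing in for
# model.model.layers.named_parameters() on a LLaMA-style decoder.
example_names = [
    "0.self_attn.q_proj.weight",
    "0.self_attn.k_proj.weight",
    "0.self_attn.v_proj.weight",
    "0.self_attn.o_proj.weight",
    "31.self_attn.q_proj.weight",
    "31.self_attn.v_proj.weight",
    "31.self_attn.o_proj.weight",
]


def select_target_modules(names, target_modules="last_layer"):
    # Build (layer index, dotted module path) pairs for the projections to adapt.
    # removesuffix drops the trailing ".weight" exactly, unlike str.strip,
    # which removes any leading/trailing run of the characters . w e i g h t.
    with_layer = [
        (name.split(".")[0], f"layers.{name.removesuffix('.weight')}")
        for name in names
        if any(f"self_attn.{w}" in name for w in WEIGHTS_TO_LORA)
    ]
    if target_modules == "last_layer":
        # Names arrive ordered by layer, so a contiguous groupby on the layer
        # index is enough; keep only the group belonging to the final layer.
        per_layer = [
            [module for _, module in group]
            for _, group in groupby(with_layer, key=lambda pair: pair[0])
        ]
        return per_layer[-1]
    # Otherwise adapt the selected projections in every layer.
    return [module for _, module in with_layer]


if __name__ == "__main__":
    print(select_target_modules(example_names))
    # ['layers.31.self_attn.q_proj', 'layers.31.self_attn.v_proj', 'layers.31.self_attn.o_proj']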