diff --git a/README.md b/README.md
index fab3ee5..03137e0 100644
--- a/README.md
+++ b/README.md
@@ -2,39 +2,86 @@
 If the last 6 months of AI research felt like a decade to you, you are not alone! With a new Large Language Model (LLM) released every other week, it has been challenging to keep up with the current pace of innovation in AI. And while many LLMs are not Hugging Face models and are hard to quantize when released only as pre-trained checkpoints, AdapterLoRa is a tool that helps map **nn.Linear** layers to a LoRA linear decomposition.
+
+### Usage of the AdapterLoRa Tool
+
+```python
+import torch.nn as nn
+import torch
+from core.Quantized import AdapterLoRa
+
+model = nn.TransformerEncoderLayer(d_model=512, nhead=8)
+
+adapted_model = AdapterLoRa(model, method="LoRa", Rank=4)
+
+# register the linear layers to adapt: self-attention and the feed-forward blocks
+adapted_model.add_layer("self_attn")
+adapted_model.add_layer("linear1")
+adapted_model.add_layer("linear2")
+
+# reconstruct the model with the registered layers
+adapted_model.reconstruct_model()
+
+# apply the LoRA method
+model = adapted_model.implement_lora(verbose=True)
+# Total trainable parameters before LoRA: 3176960
+# Total trainable parameters after LoRA: 24576
+
+# This sets requires_grad to False for all parameters without the string "lora_" in their names
+
+# Training loop
+model.train()
+for batch in dataloader:
+    ...  # forward pass, loss computation, backward pass, optimizer step
+```
+
+### Saving Model Weights
+
+* Save the LoRA model (only the LoRA matrices will be saved).
+
 ```python
-class Linear(nn.Module):
-    def __init__(self , in_features , out_features, bais=False):
-        super(Linear , self).__init__()
-        self.a = nn.Paramerts(torch.zero(-1 , 0))
-        self.b = nn.Paramters(torch.onse(-1 , 0)))
-    def forward(self input):
-        if bais=True:
-           return x*a + b
-        return x * b
+import loralib as lora
+# ===== Before =====
+# torch.save(model.state_dict(), checkpoint_path)
+# ===== After =====
+torch.save(lora.lora_state_dict(model), checkpoint_path)
 ```
-- Performance and productivety
+### Loading the Pre-Trained Model
+
+* Load the LoRA model (the pre-trained model needs to be loaded first).
+
+```python
+import loralib as lora
+# Load the pre-trained checkpoint first
+model.load_state_dict(torch.load('ckpt_pretrained.pt'), strict=False)
+# Then load the LoRA checkpoint
+model.load_state_dict(torch.load('ckpt_lora.pt'), strict=False)
+```
+
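+For intuition, a LoRA-adapted linear layer keeps the original pre-trained weight frozen and adds a trainable low-rank update, which is why only the small `lora_` matrices above need to be saved and loaded. The sketch below is purely illustrative (the class name `LoRALinearSketch` is made up for this example) and is not the exact layer class used by `loralib` or `loratorch`:
+
+```python
+import torch
+import torch.nn as nn
+
+class LoRALinearSketch(nn.Module):
+    """Illustrative LoRA-style linear layer: y = W0 x + (alpha / rank) * B A x."""
+    def __init__(self, in_features, out_features, rank=4, alpha=1.0):
+        super().__init__()
+        self.base = nn.Linear(in_features, out_features)  # stands in for the frozen pre-trained weight
+        for p in self.base.parameters():
+            p.requires_grad = False
+        # Only these two low-rank matrices are trained (and therefore saved).
+        self.lora_A = nn.Parameter(torch.randn(rank, in_features) * 0.01)
+        self.lora_B = nn.Parameter(torch.zeros(out_features, rank))
+        self.scaling = alpha / rank
+
+    def forward(self, x):
+        return self.base(x) + self.scaling * (x @ self.lora_A.T @ self.lora_B.T)
+```
+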
+- Quantized Model
+- Time to Train
+- Cost to Train
 
 ## What's in it for you?
 
 For each of the above pillars, we are sharing our codebase and insights to:
 
-- Assist you to leverage Transfomer-Based Model for your business needs and challenges
+- Assist you in leveraging Transformer-based models for your machines' needs and challenges
 
-- Boost reproducibility efforts which are becoming increasingly difficult with LLMs
+- Boost reproducibility efforts, which are becoming increasingly difficult with Transformers
 
 I am providing tools that are ready to use for quantizing the model:
 
-- Finetuning Transfomer-Based on your proprietary dataset via PeFT methodologies such as LoRA and Prefix Tuning
+- Finetuning Transformer-based models on your proprietary dataset via PEFT methodologies such as LoRA and QLoRA
 
 - Performing hyperparameter optimization to get the maximum performance out of these models
 
 ## What's the best way to use this repository?
 
-Go over to the TRansfomer-Based-specific directory that you are interested in, and open the ```README.md```. We have included details about the LLM, followed by performance results on open-source datasets!
+Go over to the Transformer-model-specific directory that you are interested in, and open the ```README.md```. We have included details about the LLMs, followed by performance results on open-source datasets!
 
 ## Roadmap
diff --git a/__init__.py b/__init__.py
index 1710169..6fbba22 100644
--- a/__init__.py
+++ b/__init__.py
@@ -1,2 +1,2 @@
-from .core.Quantized import AdapterLoRa
-from .core.utils import make_lora_replace
\ No newline at end of file
+from core.Quantized import AdapterLoRa
+from core.utils import make_lora_replace
\ No newline at end of file
diff --git a/core/Quantized.py b/core/Quantized.py
index 4a0e6a3..38685fd 100644
--- a/core/Quantized.py
+++ b/core/Quantized.py
@@ -1,7 +1,7 @@
 import loratorch as LoraT
 import torch.nn as nn
 import loralib as lora
-from utils import make_lora_replace
+from .utils import make_lora_replace
 
 class CastOutputToFloat(nn.Module):
     def forward(self, x):
@@ -100,7 +100,7 @@ def reconstruct_model(self):
         make_lora_replace(self.model, self.lora_layer, self.Rank, self.layer)
         return "Model successfully reconstructed with LoRA-adapted layers"
 
-    def implement_lora(self):
+    def implement_lora(self, verbose=False):
         """
         Implement LoRA adaptation on the model.
 
@@ -108,11 +108,13 @@
             nn.Module: The model with LoRA adaptation applied.
         """
         total_trainable_params_before = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
-        print(f"Total trainable parameters before LoRA: {total_trainable_params_before}")
+        if verbose:
+            print(f"Total trainable parameters before LoRA: {total_trainable_params_before}")
 
         self.LoRa.mark_only_lora_as_trainable(self.model)
         total_trainable_params_after = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
-        print(f"Total trainable parameters after LoRA: {total_trainable_params_after}")
+        if verbose:
+            print(f"Total trainable parameters after LoRA: {total_trainable_params_after}")
 
         return self.model
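
With the new `verbose` flag, `implement_lora` prints the parameter counts only on request; the freezing itself is done by `mark_only_lora_as_trainable`, which leaves only parameters with `"lora_"` in their names trainable. A quick sanity check, written as a sketch that assumes `model` is the module returned by `implement_lora`:

```python
# Sketch: after implement_lora(), only parameters whose names contain "lora_"
# should still require gradients; everything else is frozen.
trainable = [name for name, p in model.named_parameters() if p.requires_grad]
assert trainable and all("lora_" in name for name in trainable)
print(f"{len(trainable)} trainable LoRA parameter tensors")
```
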
diff --git a/exmpales/MultiAttention.py b/exmpales/MultiAttention.py
new file mode 100644
index 0000000..7c0f792
--- /dev/null
+++ b/exmpales/MultiAttention.py
@@ -0,0 +1,111 @@
+import torch.nn as nn
+import torch
+import math
+
+
+class ScaleDotProductAttention(nn.Module):
+    """
+    Compute scaled dot-product attention.
+
+    Query : the given sentence that we focus on (decoder)
+    Key   : every sentence checked for its relationship with the Query (encoder)
+    Value : every sentence, same as Key (encoder)
+    """
+
+    def __init__(self, config):
+        super(ScaleDotProductAttention, self).__init__()
+        self.softmax = nn.Softmax(dim=-1)
+        self.attention_dropout = nn.Dropout(config["attention_dropout"])
+
+    def forward(self, q, k, v, output_attentions=False):
+        # input is a 4-dimensional tensor
+        # [batch_size, head, length, d_tensor]
+        batch_size, head, length, d_tensor = k.size()
+
+        # 1. dot product Query with Key^T to compute similarity
+        k_t = k.transpose(2, 3)  # transpose
+        score = (q @ k_t) / math.sqrt(d_tensor)  # scaled dot product
+
+        # 2. pass the scores through softmax to map them to the [0, 1] range
+        score = self.softmax(score)
+        score = self.attention_dropout(score)
+
+        # 3. multiply the attention weights with Value
+        v = score @ v
+        if not output_attentions:
+            return (v, None)
+
+        return v, score
+
+
+class MultiHeadAttention(nn.Module):
+
+    def __init__(self, config):
+        super(MultiHeadAttention, self).__init__()
+        self.n_head = config["num_heads"]
+        self.attention = ScaleDotProductAttention(config)
+        self.w_q = nn.Linear(config["embedding_size"], config["embedding_size"], bias=config["qkv_bias"])
+        self.w_k = nn.Linear(config["embedding_size"], config["embedding_size"], bias=config["qkv_bias"])
+        self.w_v = nn.Linear(config["embedding_size"], config["embedding_size"], bias=config["qkv_bias"])
+        self.w_concat = nn.Linear(config["embedding_size"], config["embedding_size"])
+
+    def forward(self, q, k, v, output_attentions=False):
+        # 1. dot product with weight matrices
+        q, k, v = self.w_q(q), self.w_k(k), self.w_v(v)
+
+        # 2. split tensor by number of heads
+        q, k, v = self.split(q), self.split(k), self.split(v)
+
+        # 3. do scaled dot product to compute similarity
+        out, attention = self.attention(q, k, v, output_attentions=output_attentions)
+
+        # 4. concat and pass to linear layer
+        out = self.concat(out)
+        out = self.w_concat(out)
+
+        # 5. visualize attention map
+        # TODO : we should implement visualization
+        if not output_attentions:
+            return (out, None)
+
+        return out, attention
+
+    def split(self, tensor):
+        """
+        Split tensor by number of heads.
+
+        :param tensor: [batch_size, length, d_model]
+        :return: [batch_size, head, length, d_tensor]
+        """
+        batch_size, length, d_model = tensor.size()
+
+        d_tensor = d_model // self.n_head
+        tensor = tensor.view(batch_size, length, self.n_head, d_tensor).transpose(1, 2)
+        # it is similar to group convolution (split by number of heads)
+
+        return tensor
+
+    def concat(self, tensor):
+        """
+        Inverse function of self.split(tensor : torch.Tensor).
+
+        :param tensor: [batch_size, head, length, d_tensor]
+        :return: [batch_size, length, d_model]
+        """
+        batch_size, head, length, d_tensor = tensor.size()
+        d_model = head * d_tensor
+
+        tensor = tensor.transpose(1, 2).contiguous().view(batch_size, length, d_model)
+        return tensor
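+
+
+if __name__ == "__main__":
+    # Minimal shape check (illustrative sketch): the config keys below are assumed
+    # to match what the classes above read, using the corrected "attention_dropout"
+    # spelling.
+    config = {
+        "embedding_size": 512,
+        "num_heads": 8,
+        "qkv_bias": True,
+        "attention_dropout": 0.1,
+    }
+    mha = MultiHeadAttention(config)
+    x = torch.randn(2, 16, 512)  # [batch_size, length, embedding_size]
+    out, attn = mha(x, x, x, output_attentions=True)
+    print(out.shape)   # torch.Size([2, 16, 512])
+    print(attn.shape)  # torch.Size([2, 8, 16, 16])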
diff --git a/exmpales/Usage.py b/exmpales/Usage.py
new file mode 100644
index 0000000..0c6a457
--- /dev/null
+++ b/exmpales/Usage.py
@@ -0,0 +1,22 @@
+import torch.nn as nn
+import torch
+import os
+import sys
+
+current_dir = os.path.dirname(__file__)
+target_dir = os.path.abspath(os.path.join(current_dir, ".././"))
+sys.path.insert(0, target_dir)
+
+from core.Quantized import AdapterLoRa
+
+model = nn.TransformerEncoderLayer(d_model=512, nhead=8)
+
+adapted_model = AdapterLoRa(model, method="LoRa", Rank=4)
+adapted_model.add_layer("self_attn")
+adapted_model.add_layer("linear1")
+adapted_model.add_layer("linear2")
+adapted_model.reconstruct_model()
+model = adapted_model.implement_lora(verbose=True)
+
+
diff --git a/exmpales/transfomer.py b/exmpales/transfomer.py
deleted file mode 100644
index e69de29..0000000
diff --git a/exmpales/Attention.py b/exmpales/transfomerEncoder.py
similarity index 100%
rename from exmpales/Attention.py
rename to exmpales/transfomerEncoder.py
diff --git a/requirements.txt b/requirements.txt
index 2059e40..bc83bf5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,2 @@
-Loralib
-loratorch
\ No newline at end of file
+git+https://github.com/Baijiong-Lin/LoRA-Torch
+git+https://github.com/microsoft/LoRA
\ No newline at end of file
diff --git a/setup.py b/setup.py
index b3c07f3..66d78d9 100644
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,7 @@ setuptools.setup(
     name="AdapterLoRa",
-    version="1.1.1",
+    version="1.1.2",
     author="Youness EL BRAG",
     author_email="younsselbrag@gmail.com",
     description="A tool for adapting larger Transformer-based models and quantization, built on top of the LoRA and LoRA-Torch libraries.",