Commit adb63a7
done all the project LoRa Quantized
yunss-ML committed Aug 24, 2023
1 parent 18d5777 commit adb63a7
Showing 9 changed files with 205 additions and 23 deletions.
75 changes: 61 additions & 14 deletions README.md
@@ -2,39 +2,86 @@

If the last 6 months of AI research felt like a decade to you, you are not alone! With a new Large Language Model (LLM) released every other week, it has been challenging to keep up with the current pace of innovation in AI. Many LLMs are not Hugging Face models and are hard to quantize when they are released only as pre-trained checkpoints. Adapter-LoRa is a tool that helps replace **nn.Linear** layers with their LoRA low-rank decomposition.
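
For context, the standard LoRA decomposition (which the LoRA libraries used here implement) freezes the pre-trained weight and trains only a low-rank update:

$$
h = W_0 x + \frac{\alpha}{r} B A x, \qquad B \in \mathbb{R}^{d \times r},\; A \in \mathbb{R}^{r \times k},\; r \ll \min(d, k)
$$

so each adapted layer trains only $r\,(d + k)$ parameters instead of $d \times k$.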


### Usage of the AdapterLoRa Tool

```python

import torch.nn as nn
import torch
from core.Quantized import AdapterLoRa

model = nn.TransformerEncoderLayer(d_model=512, nhead=8)

Adpate_model = AdapterLoRa(model, method="LoRa", Rank=4)

# Register the sub-modules whose linear layers should be LoRA-adapted
Adpate_model.add_layer("self_attn")
Adpate_model.add_layer("linear1")
Adpate_model.add_layer("linear2")

# Reconstruct the model with the LoRA-adapted layers
Adpate_model.reconstruct_model()

# Apply the LoRA method
model = Adpate_model.implement_lora(verbose=True)
# Total trainable parameters before LoRA: 3176960
# Total trainable parameters after LoRA: 24576

# This sets requires_grad to False for all parameters without the string "lora_" in their names

# Training loop
for batch in dataloader:
    model.train()
```
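
A slightly fuller training step might look like the minimal sketch below; the optimizer, loss, and batch layout are illustrative assumptions, and only the LoRA parameters (the ones still requiring gradients) get updated:

```python
import torch

# Only the LoRA parameters are still trainable after implement_lora()
optimizer = torch.optim.AdamW(
    (p for p in model.parameters() if p.requires_grad), lr=1e-4
)
criterion = torch.nn.MSELoss()  # placeholder objective for illustration

model.train()
for src, target in dataloader:    # assumes a dataloader of (src, target) pairs
    optimizer.zero_grad()
    output = model(src)           # nn.TransformerEncoderLayer forward pass
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
```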
### Saving the Model Weights

* Save the LoRA model (only the LoRA matrices will be saved).

```python
# Minimal sketch: a custom linear layer with a trainable weight `a` and bias `b`
class Linear(nn.Module):
    def __init__(self, in_features, out_features, bias=False):
        super(Linear, self).__init__()
        self.use_bias = bias
        self.a = nn.Parameter(torch.zeros(out_features, in_features))
        self.b = nn.Parameter(torch.ones(out_features))

    def forward(self, x):
        if self.use_bias:
            return x @ self.a.t() + self.b
        return x @ self.a.t()

import loralib as lora
# ===== Before =====
# torch.save(model.state_dict(), checkpoint_path)
# ===== After =====
torch.save(lora.lora_state_dict(model), checkpoint_path)
```

- <img src="assets/rocket.gif" width="32" height="32"/> Performance and productivety <img src="assets/rocket.gif" width="32" height="32"/>
### Loading the Pre-Trained Model

* Load the LoRA model (the pre-trained model needs to be loaded first).

```python
import loralib as lora
# Load the pre-trained checkpoint first
model.load_state_dict(torch.load('ckpt_pretrained.pt'), strict=False)
# Then load the LoRA checkpoint
model.load_state_dict(torch.load('ckpt_lora.pt'), strict=False)
```


- <img src="assets/rocket.gif" width="32" height="32"/> Quantized Model <img src="assets/rocket.gif" width="32" height="32"/>

- <img src="assets/time.gif" width="32" height="32"/> Time to Train <img src="assets/time.gif" width="32" height="32"/>

- <img src="assets/money.gif" width="32" height="32"/> Cost to Train <img src="assets/money.gif" width="32" height="32"/>


## What's in it for you?

For each of the above four pillars, we are sharing our codebase and insights to:
- Assist you in leveraging Transformer-based models for your business needs and challenges
- Assist you in leveraging Transformer-based models for your machines' needs and challenges

- Boost reproducibility efforts which are becoming increasingly difficult with LLMs
- Boost reproducibility efforts which are becoming increasingly difficult with Transformers

I am providing tools that are ready to use for quantizing the model:

- Fine-tuning Transformer-based models on your proprietary dataset via PEFT methodologies such as LoRA and Prefix Tuning
- Fine-tuning Transformer-based models on your proprietary dataset via PEFT methodologies such as LoRA and QLoRA

- Performing hyperparameter optimization to get the maximum performance out of these models

## What's the best way to use this repository?

Go over to the Transformer-based-model-specific directory that you are interested in, and open the ```README.md```. We have included details about the LLM, followed by performance results on open-source datasets!
Go over to the Transformer-based-model-specific directory that you are interested in, and open the ```README.md```. We have included details about the LLMs, followed by performance results on open-source datasets!

## Roadmap

4 changes: 2 additions & 2 deletions __init__.py
@@ -1,2 +1,2 @@
from .core.Quantized import AdapterLoRa
from .core.utils import make_lora_replace
from core.Quantized import AdapterLoRa
from core.utils import make_lora_replace
10 changes: 6 additions & 4 deletions core/Quantized.py
@@ -1,7 +1,7 @@
import loratorch as LoraT
import torch.nn as nn
import loralib as lora
from utils import make_lora_replace
from .utils import make_lora_replace

class CastOutputToFloat(nn.Module):
    def forward(self, x):
@@ -100,19 +100,21 @@ def reconstruct_model(self):
        make_lora_replace(self.model, self.lora_layer, self.Rank, self.layer)
        return "Model successfully reconstructed with LoRA-adapted layers"

    def implement_lora(self):
    def implement_lora(self,verbose=False):
        """
        Implement LoRA adaptation on the model.
        Returns:
            nn.Module: The model with LoRA adaptation applied.
        """
        total_trainable_params_before = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
        print(f"Total trainable parameters before LoRA: {total_trainable_params_before}")
        if verbose == True:
            print(f"Total trainable parameters before LoRA: {total_trainable_params_before}")

        self.LoRa.mark_only_lora_as_trainable(self.model)

        total_trainable_params_after = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
        print(f"Total trainable parameters after LoRA: {total_trainable_params_after}")
        if verbose == True:
            print(f"Total trainable parameters after LoRA: {total_trainable_params_after}")

        return self.model
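
For readers unfamiliar with `loralib`, `mark_only_lora_as_trainable` essentially freezes every parameter whose name does not contain `"lora_"`. A simplified sketch of that behavior (not the library's exact implementation, which also handles bias options):

```python
def freeze_non_lora(model):
    # Keep gradients only for the injected LoRA parameters.
    for name, param in model.named_parameters():
        param.requires_grad = "lora_" in name
```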
111 changes: 111 additions & 0 deletions exmpales/MultiAttention.py
@@ -0,0 +1,111 @@
import torch.nn as nn
import torch
import math


class ScaleDotProductAttention(nn.Module):

    """
    Compute scaled dot-product attention.
    Query : the sentence that we focus on (decoder)
    Key   : every sentence used to check the relationship with the Query (encoder)
    Value : every sentence, same as Key (encoder)
    """

    def __init__(self, config):
        super(ScaleDotProductAttention, self).__init__()
        self.softmax = nn.Softmax(dim=-1)
        self.attention_dropout = nn.Dropout(config["attention_dropout"])

    def forward(self, q, k, v, output_attentions=False):
        # input is a 4-dimensional tensor
        # [batch_size, head, length, d_tensor]
        batch_size, head, length, d_tensor = k.size()

        # 1. dot product Query with Key^T to compute similarity
        k_t = k.transpose(2, 3)  # transpose
        score = (q @ k_t) / math.sqrt(d_tensor)  # scaled dot product

        # 2. pass through softmax to map scores into the [0, 1] range
        score = self.softmax(score)
        score = self.attention_dropout(score)

        # 3. multiply with Value
        v = score @ v
        if not output_attentions:
            return (v, None)

        return v, score


class MultiHeadAttention(nn.Module):

    def __init__(self, config):
        super(MultiHeadAttention, self).__init__()
        self.n_head = config["num_heads"]
        self.attention = ScaleDotProductAttention(config)
        self.w_q = nn.Linear(config["embedding_size"], config["embedding_size"], bias=config["qkv_bias"])
        self.w_k = nn.Linear(config["embedding_size"], config["embedding_size"], bias=config["qkv_bias"])
        self.w_v = nn.Linear(config["embedding_size"], config["embedding_size"], bias=config["qkv_bias"])
        self.w_concat = nn.Linear(config["embedding_size"], config["embedding_size"])

    def forward(self, q, k, v, output_attentions=False):
        # 1. dot product with weight matrices
        q, k, v = self.w_q(q), self.w_k(k), self.w_v(v)

        # 2. split tensor by number of heads
        q, k, v = self.split(q), self.split(k), self.split(v)

        # 3. do scaled dot-product attention to compute similarity
        out, attention = self.attention(q, k, v, output_attentions=output_attentions)

        # 4. concat and pass to linear layer
        out = self.concat(out)
        out = self.w_concat(out)

        # 5. visualize attention map
        # TODO : we should implement visualization
        if not output_attentions:
            return (out, None)

        return out, attention

    def split(self, tensor):
        """
        Split tensor by number of heads.
        :param tensor: [batch_size, length, d_model]
        :return: [batch_size, head, length, d_tensor]
        """
        batch_size, length, d_model = tensor.size()

        d_tensor = d_model // self.n_head
        tensor = tensor.view(batch_size, length, self.n_head, d_tensor).transpose(1, 2)
        # it is similar to group convolution (split by number of heads)

        return tensor

    def concat(self, tensor):
        """
        Inverse function of self.split(tensor : torch.Tensor).
        :param tensor: [batch_size, head, length, d_tensor]
        :return: [batch_size, length, d_model]
        """
        batch_size, head, length, d_tensor = tensor.size()
        d_model = head * d_tensor

        tensor = tensor.transpose(1, 2).contiguous().view(batch_size, length, d_model)
        return tensor
"""
inverse function of self.split(tensor : torch.Tensor)
:param tensor: [batch_size, head, length, d_tensor]
:return: [batch_size, length, d_model]
"""
batch_size, head, length, d_tensor = tensor.size()
d_model = head * d_tensor

tensor = tensor.transpose(1, 2).contiguous().view(batch_size, length, d_model)
return tensor
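
As a quick sanity check, the example module above could be exercised like this (the config values and tensor shapes are illustrative assumptions, not part of the repository):

```python
import torch

config = {
    "embedding_size": 512,
    "num_heads": 8,
    "attention_dropout": 0.1,
    "qkv_bias": True,
}

mha = MultiHeadAttention(config)
x = torch.randn(2, 10, config["embedding_size"])   # [batch_size, length, d_model]
out, attn = mha(x, x, x, output_attentions=True)   # self-attention
print(out.shape)   # torch.Size([2, 10, 512])
print(attn.shape)  # torch.Size([2, 8, 10, 10])
```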
22 changes: 22 additions & 0 deletions exmpales/Usage.py
@@ -0,0 +1,22 @@
import torch.nn as nn
import torch
import os
import sys

current_dir = os.path.dirname(__file__)
target_dir = os.path.abspath(os.path.join(current_dir, ".././"))
sys.path.insert(0, target_dir)

from core.Quantized import AdapterLoRa

model = nn.TransformerEncoderLayer(d_model=512, nhead=8)

Adpate_model = AdapterLoRa(model , method="LoRa", Rank=4)
Adpate_model.add_layer("self_attn")
Adpate_model.add_layer("linear1")
Adpate_model.add_layer("linear2")
Adpate_model.reconstruct_model()
model = Adpate_model.implement_lora(verbose=True)



Empty file removed exmpales/transfomer.py
Empty file.
File renamed without changes.
4 changes: 2 additions & 2 deletions requirements.txt
@@ -1,2 +1,2 @@
Loralib
loratorch
git+https://github.com/Baijiong-Lin/LoRA-Torch
git+https://github.com/microsoft/LoRA
2 changes: 1 addition & 1 deletion setup.py
@@ -18,7 +18,7 @@

setuptools.setup(
    name="AdapterLoRa",
    version="1.1.1",
    version="1.1.2",
    author="Youness EL BRAG",
    author_email="[email protected]",
    description="A Tool for adaptation Larger Transfomer-Based model and Quantization built top on libraries LoRa and LoRa-Torch.",
