HanyangTechAI · seung7361 · Mar 30, 2023
diff --git a/week2/2주차_최승빈.md b/week2/2주차_최승빈.md
@@ -0,0 +1,82 @@
+# 최승빈
+
+생성일: March 27, 2023 2:55 PM
+
+# Introduction to Transformers
+
+## Positional Encoding
+
+```python
+class Transformer(torch.nn.Module):
+    def __init__(self, embedding_size, src_vocab_size, trg_vocab_size, src_pad_idx,
+                 num_heads, num_encoder_layers, num_decoder_layers,
+                 forward_expansion, dropout, max_len, device):
+        super(Transformer, self).__init__()
+
+        self.src_word_embedding = torch.nn.Embedding(src_vocab_size, embedding_size).to(device)
+        self.src_position_embedding = torch.nn.Embedding(max_len, embedding_size).to(device)
+        self.trg_word_embedding = torch.nn.Embedding(trg_vocab_size, embedding_size).to(device)
+        self.trg_position_embedding = torch.nn.Embedding(max_len, embedding_size).to(device)
+
+		def make_src_mask(self, src):
+	        src_mask = src.transpose(0, 1) == self.src_pad_idx
+
+	        # (src_length, N) => (N, src_length)
+	        return src_mask.to(device)
+
+	    def forward(self, src, trg):
+	        src_seq_length, N = src.shape
+	        trg_seq_length, N = trg.shape
+	        src_position = (
+	            torch.arange(0, src_seq_length).unsqueeze(1).expand(src_seq_length, N).to(self.device)
+	        )
+
+
+	        trg_position = (
+	            torch.arange(0, trg_seq_length).unsqueeze(1).expand(trg_seq_length, N).to(self.device)
+	        )
+
+
+	        embed_src = self.dropout(
+	            (self.src_word_embedding(src) + self.src_position_embedding(src_position))
+	        ).to(device)
+
+	        embed_trg = self.dropout(
+	            (self.trg_word_embedding(trg) + self.trg_position_embedding(trg_position))
+	        ).to(device)
+
+	        src_padding_mask = self.make_src_mask(src).to(device)
+	        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(self.device)
+	        print(src.shape, trg.shape)
+	        print(embed_src.shape, embed_trg.shape, src_padding_mask.shape, trg_mask.shape)
+	        out = self.transformer(embed_src, embed_trg,
+	                               src_key_padding_mask=src_padding_mask,
+	                               tgt_mask=trg_mask).to(device)
+	        out = self.fc_out(out).to(device)
+
+	        return out
+```
+
+![Untitled](%E1%84%8E%E1%85%AC%E1%84%89%E1%85%B3%E1%86%BC%E1%84%87%E1%85%B5%E1%86%AB%20b6a3460c2d94449bb7afe91140762d80/Untitled.png)
+
+- A Transformer has no built-in notion of token order, so it needs explicit positional information.
+- So we add a position embedding vector to each token embedding to represent the token's position.
+
+## Multihead Attention
+
+![Untitled](%E1%84%8E%E1%85%AC%E1%84%89%E1%85%B3%E1%86%BC%E1%84%87%E1%85%B5%E1%86%AB%20b6a3460c2d94449bb7afe91140762d80/Untitled%201.png)
+
+- Allows the model to attend to different parts of the input sequence in parallel by splitting the input into multiple heads and performing scaled dot-product attention on each head.
+- This improves performance and lets the model handle more complex tasks.
+- The outputs of all heads are concatenated at the end for further processing.
+
+## Self Attention
+
+![Untitled](%E1%84%8E%E1%85%AC%E1%84%89%E1%85%B3%E1%86%BC%E1%84%87%E1%85%B5%E1%86%AB%20b6a3460c2d94449bb7afe91140762d80/Untitled%202.png)
+
+- The input sequence is transformed into three matrices: the query, key, and value matrices.
+- Dividing the attention scores by `\sqrt{d_k}` keeps them from becoming too large, which could otherwise make training unstable.
+
+Layer Normalization
+
+Residual Connection
diff --git a/week3/3주차_최승빈.md b/week3/3주차_최승빈.md
@@ -0,0 +1,82 @@
+# 최승빈
+
+생성일: March 27, 2023 2:55 PM
+
+# Introduction to Transformers
+
+## Positional Encoding
+
+```python
+class Transformer(torch.nn.Module):
+    def __init__(self, embedding_size, src_vocab_size, trg_vocab_size, src_pad_idx,
+                 num_heads, num_encoder_layers, num_decoder_layers,
+                 forward_expansion, dropout, max_len, device):
+        super(Transformer, self).__init__()
+
+        self.src_word_embedding = torch.nn.Embedding(src_vocab_size, embedding_size).to(device)
+        self.src_position_embedding = torch.nn.Embedding(max_len, embedding_size).to(device)
+        self.trg_word_embedding = torch.nn.Embedding(trg_vocab_size, embedding_size).to(device)
+        self.trg_position_embedding = torch.nn.Embedding(max_len, embedding_size).to(device)
+
+		def make_src_mask(self, src):
+	        src_mask = src.transpose(0, 1) == self.src_pad_idx
+
+	        # (src_length, N) => (N, src_length)
+	        return src_mask.to(device)
+
+	    def forward(self, src, trg):
+	        src_seq_length, N = src.shape
+	        trg_seq_length, N = trg.shape
+	        src_position = (
+	            torch.arange(0, src_seq_length).unsqueeze(1).expand(src_seq_length, N).to(self.device)
+	        )
+
+
+	        trg_position = (
+	            torch.arange(0, trg_seq_length).unsqueeze(1).expand(trg_seq_length, N).to(self.device)
+	        )
+
+
+	        embed_src = self.dropout(
+	            (self.src_word_embedding(src) + self.src_position_embedding(src_position))
+	        ).to(device)
+
+	        embed_trg = self.dropout(
+	            (self.trg_word_embedding(trg) + self.trg_position_embedding(trg_position))
+	        ).to(device)
+
+	        src_padding_mask = self.make_src_mask(src).to(device)
+	        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(self.device)
+	        print(src.shape, trg.shape)
+	        print(embed_src.shape, embed_trg.shape, src_padding_mask.shape, trg_mask.shape)
+	        out = self.transformer(embed_src, embed_trg,
+	                               src_key_padding_mask=src_padding_mask,
+	                               tgt_mask=trg_mask).to(device)
+	        out = self.fc_out(out).to(device)
+
+	        return out
+```
+
+![Untitled](%E1%84%8E%E1%85%AC%E1%84%89%E1%85%B3%E1%86%BC%E1%84%87%E1%85%B5%E1%86%AB%20b6a3460c2d94449bb7afe91140762d80/Untitled.png)
+
+- A Transformer has no built-in notion of token order, so it needs explicit positional information.
+- So we add a position embedding vector to each token embedding to represent the token's position.
+
+## Multihead Attention
+
+![Untitled](%E1%84%8E%E1%85%AC%E1%84%89%E1%85%B3%E1%86%BC%E1%84%87%E1%85%B5%E1%86%AB%20b6a3460c2d94449bb7afe91140762d80/Untitled%201.png)
+
+- Allows the model to attend to different parts of the input sequence in parallel by splitting the input into multiple heads and performing scaled dot-product attention on each head.
+- This improves performance and lets the model handle more complex tasks.
+- The outputs of all heads are concatenated at the end for further processing.
+
+## Self Attention
+
+![Untitled](%E1%84%8E%E1%85%AC%E1%84%89%E1%85%B3%E1%86%BC%E1%84%87%E1%85%B5%E1%86%AB%20b6a3460c2d94449bb7afe91140762d80/Untitled%202.png)
+
+- The input sequence is transformed into three matrices: the query, key, and value matrices.
+- Dividing the attention scores by `\sqrt{d_k}` keeps them from becoming too large, which could otherwise make training unstable.
+
+Layer Normalization
+
+Residual Connection