From cabec9e6cd0896ef487caf149bd2e743366c7353 Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Mon, 15 May 2023 14:56:49 +0800 Subject: [PATCH 01/19] Add MIMICIV configs. --- example_config/MIMICIV-ICD10-50/laat.yml | 39 +++++++++++ example_config/MIMICIV-ICD10-50/laat_tune.yml | 65 +++++++++++++++++++ example_config/MIMICIV-ICD9-50/laat.yml | 39 +++++++++++ example_config/MIMICIV-ICD9-50/laat_tune.yml | 65 +++++++++++++++++++ 4 files changed, 208 insertions(+) create mode 100644 example_config/MIMICIV-ICD10-50/laat.yml create mode 100644 example_config/MIMICIV-ICD10-50/laat_tune.yml create mode 100644 example_config/MIMICIV-ICD9-50/laat.yml create mode 100644 example_config/MIMICIV-ICD9-50/laat_tune.yml diff --git a/example_config/MIMICIV-ICD10-50/laat.yml b/example_config/MIMICIV-ICD10-50/laat.yml new file mode 100644 index 00000000..b9ee7d22 --- /dev/null +++ b/example_config/MIMICIV-ICD10-50/laat.yml @@ -0,0 +1,39 @@ +# data +training_file: data/MIMICIV-ICD10-50/train.txt +val_file: data/MIMICIV-ICD10-50/valid.txt +test_file: data/MIMICIV-ICD10-50/test.txt +data_name: MIMICIV-ICD10-50 +min_vocab_freq: 1 +max_seq_length: 4000 +include_test_labels: true +remove_no_label_data: true + +# train +seed: 0 +epochs: 50 +batch_size: 8 +optimizer: adamw +learning_rate: 0.001 +weight_decay: 0 +patience: 6 +shuffle: true + +# eval +eval_batch_size: 8 +monitor_metrics: ['Loss', 'Another-Macro-F1', 'Macro-F1', 'Micro-F1', 'P@5', 'P@8', 'P@15'] +val_metric: Micro-F1 + +# model +model_name: LAAT +init_weight: kaiming_uniform +network_config: + embed_dropout: 0.3 + encoder_dropout: 0 + rnn_dim: 512 # 512//2 = 256 + num_layers: 1 + d_a: 256 + +# pretrained vocab / embeddings +vocab_file: data/MIMICIV-ICD10-50/vocab.csv # 179,131, min_vocab_freq=1 +embed_file: data/MIMICIV-ICD10-50/processed_full.embed +normalize_embed: false diff --git a/example_config/MIMICIV-ICD10-50/laat_tune.yml b/example_config/MIMICIV-ICD10-50/laat_tune.yml new file mode 100644 index 00000000..6101a0c4 --- /dev/null +++ b/example_config/MIMICIV-ICD10-50/laat_tune.yml @@ -0,0 +1,65 @@ +# data +training_file: data/MIMICIV-ICD10-50/train.txt +val_file: data/MIMICIV-ICD10-50/valid.txt +test_file: data/MIMICIV-ICD10-50/test.txt +data_name: MIMICIV-ICD10-50 +min_vocab_freq: 1 +max_seq_length: 4000 +include_test_labels: true +remove_no_label_data: true +add_special_tokens: false + +# train +seed: 0 +epochs: 50 +batch_size: 8 +optimizer: adamw +learning_rate: 0.001 +momentum: 0.9 +weight_decay: 0 +patience: 6 +early_stopping_metric: Micro-F1 +shuffle: true + +# eval +eval_batch_size: 8 +monitor_metrics: ['Loss', 'Another-Macro-F1', 'Macro-F1', 'Micro-F1', 'P@5', 'P@8', 'P@15'] +val_metric: Micro-F1 + +# model +model_name: LAAT +loss_function: binary_cross_entropy_with_logits +init_weight: kaiming_uniform +network_config: + embed_dropout: 0.3 + encoder_dropout: 0 + rnn_dim: ['grid_search', [512, 1024, 256]] + num_layers: 1 + d_a: ['grid_search', [256, 512, 128]] + +# pretrained vocab / embeddings +vocab_file: data/MIMICIV-ICD10-50/vocab.csv # 179,131, min_vocab_freq=1 +embed_file: data/MIMICIV-ICD10-50/processed_full.embed +normalize_embed: false + +# hyperparamter search +search_alg: basic_variant +embed_cache_dir: null +num_samples: 1 +scheduler: null +no_merge_train_val: true # do not retrain + +# other parameters specified in main.py::get_args +checkpoint_path: null +cpu: false +data_workers: 8 +eval: false +label_file: null +limit_train_batches: 1.0 +limit_val_batches: 1.0 +limit_test_batches: 1.0 +metric_threshold: 0.5 +result_dir: 
runs +save_k_predictions: 0 +silent: true +val_size: 0.2 diff --git a/example_config/MIMICIV-ICD9-50/laat.yml b/example_config/MIMICIV-ICD9-50/laat.yml new file mode 100644 index 00000000..0fd281b3 --- /dev/null +++ b/example_config/MIMICIV-ICD9-50/laat.yml @@ -0,0 +1,39 @@ +# data +training_file: data/MIMICIV-ICD9-50/train.txt +val_file: data/MIMICIV-ICD9-50/valid.txt +test_file: data/MIMICIV-ICD9-50/test.txt +data_name: MIMICIV-ICD9-50 +min_vocab_freq: 1 +max_seq_length: 4000 +include_test_labels: true +remove_no_label_data: true + +# train +seed: 0 # 1337 +epochs: 50 +batch_size: 8 +optimizer: adamw +learning_rate: 0.001 +weight_decay: 0 +patience: 6 +shuffle: true + +# eval +eval_batch_size: 8 +monitor_metrics: ['Loss', 'Another-Macro-F1', 'Macro-F1', 'Micro-F1', 'P@5', 'P@8', 'P@15'] +val_metric: Micro-F1 + +# model +model_name: LAAT +init_weight: kaiming_uniform +network_config: + embed_dropout: 0.3 + encoder_dropout: 0 + rnn_dim: 512 # 512//2 = 256 + num_layers: 1 + d_a: 256 + +# pretrained vocab / embeddings +vocab_file: data/MIMICIV-ICD9-50/vocab.csv # 282,173, min_vocab_freq=1 +embed_file: data/MIMICIV-ICD9-50/processed_full.embed +normalize_embed: false diff --git a/example_config/MIMICIV-ICD9-50/laat_tune.yml b/example_config/MIMICIV-ICD9-50/laat_tune.yml new file mode 100644 index 00000000..42c4c3c6 --- /dev/null +++ b/example_config/MIMICIV-ICD9-50/laat_tune.yml @@ -0,0 +1,65 @@ +# data +training_file: data/MIMICIV-ICD9-50/train.txt +val_file: data/MIMICIV-ICD9-50/valid.txt +test_file: data/MIMICIV-ICD9-50/test.txt +data_name: MIMICIV-ICD9-50 +min_vocab_freq: 1 +max_seq_length: 4000 +include_test_labels: true +remove_no_label_data: true +add_special_tokens: false + +# train +seed: 0 +epochs: 50 +batch_size: 8 +optimizer: adamw +learning_rate: 0.001 +momentum: 0.9 +weight_decay: 0 +patience: 6 +early_stopping_metric: Micro-F1 +shuffle: true + +# eval +eval_batch_size: 8 +monitor_metrics: ['Loss', 'Another-Macro-F1', 'Macro-F1', 'Micro-F1', 'P@5', 'P@8', 'P@15'] +val_metric: Micro-F1 + +# model +model_name: LAAT +loss_function: binary_cross_entropy_with_logits +init_weight: kaiming_uniform +network_config: + embed_dropout: 0.3 + encoder_dropout: 0 + rnn_dim: ['grid_search', [512, 1024, 256]] + num_layers: 1 + d_a: ['grid_search', [256, 512, 128]] + +# pretrained vocab / embeddings +vocab_file: data/MIMICIV-ICD9-50/vocab.csv # 282,173, min_vocab_freq=1 +embed_file: data/MIMICIV-ICD9-50/processed_full.embed +normalize_embed: false + +# hyperparamter search +search_alg: basic_variant +embed_cache_dir: null +num_samples: 1 +scheduler: null +no_merge_train_val: true # do not retrain + +# other parameters specified in main.py::get_args +checkpoint_path: null +cpu: false +data_workers: 8 +eval: false +label_file: null +limit_train_batches: 1.0 +limit_val_batches: 1.0 +limit_test_batches: 1.0 +metric_threshold: 0.5 +result_dir: runs +save_k_predictions: 0 +silent: true +val_size: 0.2 From 3771f4459457e98ecd8ff4518c0741dc2a6e8995 Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Mon, 15 May 2023 15:25:45 +0800 Subject: [PATCH 02/19] Add LAAT reproduce code. 
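
The new libmultilabel/nn/networks/laat.py implements the label-wise attention of
Vu et al. (LAAT): a bidirectional LSTM encodes the document into hidden states H,
a first linear layer computes Z = tanh(WH), a second linear layer U gives one
attention distribution per label, A = softmax(UZ), and each label receives its own
attention-weighted document vector E = AH. A minimal sketch of that attention step,
with purely illustrative sizes (rnn_dim=512, d_a=256, 50 labels) rather than values
taken from any particular config:

    import torch
    import torch.nn as nn

    rnn_dim, d_a, num_classes = 512, 256, 50             # illustrative sizes only
    W = nn.Linear(rnn_dim, d_a, bias=False)               # first linear layer (W)
    U = nn.Linear(d_a, num_classes, bias=False)           # label context vectors (U)

    H = torch.randn(8, 4000, rnn_dim)                     # BiLSTM output: (batch, length, rnn_dim)
    Z = torch.tanh(W(H))                                  # (batch, length, d_a)
    A = torch.softmax(U.weight @ Z.transpose(1, 2), dim=2)  # (batch, num_classes, length)
    E = A @ H                                             # (batch, num_classes, rnn_dim)
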
--- example_config/MIMICIV-ICD10-50/laat_tune.yml | 4 +- example_config/MIMICIV-ICD9-50/laat_tune.yml | 4 +- libmultilabel/nn/data_utils.py | 11 ++- libmultilabel/nn/model.py | 18 ++++- libmultilabel/nn/networks/__init__.py | 1 + libmultilabel/nn/networks/laat.py | 75 +++++++++++++++++++ libmultilabel/nn/networks/modules.py | 2 +- 7 files changed, 106 insertions(+), 9 deletions(-) create mode 100644 libmultilabel/nn/networks/laat.py diff --git a/example_config/MIMICIV-ICD10-50/laat_tune.yml b/example_config/MIMICIV-ICD10-50/laat_tune.yml index 6101a0c4..558ac3be 100644 --- a/example_config/MIMICIV-ICD10-50/laat_tune.yml +++ b/example_config/MIMICIV-ICD10-50/laat_tune.yml @@ -33,9 +33,9 @@ init_weight: kaiming_uniform network_config: embed_dropout: 0.3 encoder_dropout: 0 - rnn_dim: ['grid_search', [512, 1024, 256]] + rnn_dim: ['grid_search', [512, 1024, 768, 256]] # 2u num_layers: 1 - d_a: ['grid_search', [256, 512, 128]] + d_a: ['grid_search', [256, 512, 384, 128]] # pretrained vocab / embeddings vocab_file: data/MIMICIV-ICD10-50/vocab.csv # 179,131, min_vocab_freq=1 diff --git a/example_config/MIMICIV-ICD9-50/laat_tune.yml b/example_config/MIMICIV-ICD9-50/laat_tune.yml index 42c4c3c6..cc03991c 100644 --- a/example_config/MIMICIV-ICD9-50/laat_tune.yml +++ b/example_config/MIMICIV-ICD9-50/laat_tune.yml @@ -33,9 +33,9 @@ init_weight: kaiming_uniform network_config: embed_dropout: 0.3 encoder_dropout: 0 - rnn_dim: ['grid_search', [512, 1024, 256]] + rnn_dim: ['grid_search', [512, 1024, 768, 256]] # 2u num_layers: 1 - d_a: ['grid_search', [256, 512, 128]] + d_a: ['grid_search', [256, 512, 384, 128]] # pretrained vocab / embeddings vocab_file: data/MIMICIV-ICD9-50/vocab.csv # 282,173, min_vocab_freq=1 diff --git a/libmultilabel/nn/data_utils.py b/libmultilabel/nn/data_utils.py index 73a7f9d6..7c25c819 100644 --- a/libmultilabel/nn/data_utils.py +++ b/libmultilabel/nn/data_utils.py @@ -45,12 +45,13 @@ def __init__( tokenizer=None, word_dict=None, ): - self.data = data + # self.data = data self.classes = classes self.max_seq_length = max_seq_length self.word_dict = word_dict self.tokenizer = tokenizer self.add_special_tokens = add_special_tokens + self.data = self.sort_by_length(data) # LAAT self.num_classes = len(self.classes) self.label_binarizer = MultiLabelBinarizer().fit([classes]) @@ -77,6 +78,10 @@ def __getitem__(self, index): "label": torch.IntTensor(self.label_binarizer.transform([data["label"]])[0]), } + def sort_by_length(self, data): + # reverse for LAAT + return sorted(data, key=lambda x: -len(x['text'][:self.max_seq_length])) + def tokenize(text): """Tokenize text. 
@@ -387,8 +392,10 @@ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache=N if load_embedding_from_file: # Add UNK embedding # AttentionXML: np.random.uniform(-1.0, 1.0, embed_size) + # LAAT: np.random.uniform(-0.25, 0.25, embedding_size) # CAML: np.random.randn(embed_size) - unk_vector = torch.randn(embed_size) + import numpy as np + unk_vector = torch.tensor(np.random.uniform(-0.25, 0.25, embed_size)) embedding_weights[word_dict[UNK]] = unk_vector # Store pretrained word embedding diff --git a/libmultilabel/nn/model.py b/libmultilabel/nn/model.py index bc6cf134..e0a8399d 100644 --- a/libmultilabel/nn/model.py +++ b/libmultilabel/nn/model.py @@ -40,6 +40,7 @@ def __init__( multiclass=False, silent=False, save_k_predictions=0, + val_metric='Micro-F1', # LAAT **kwargs ): super().__init__() @@ -59,6 +60,7 @@ def __init__( self.multiclass = multiclass top_k = 1 if self.multiclass else None self.eval_metric = get_metrics(metric_threshold, monitor_metrics, num_classes, top_k=top_k) + self.val_metric = val_metric # LAAT @abstractmethod def shared_step(self, batch): @@ -82,8 +84,20 @@ def configure_optimizers(self): else: raise RuntimeError("Unsupported optimizer: {self.optimizer}") - torch.nn.utils.clip_grad_value_(parameters, 0.5) - + # torch.nn.utils.clip_grad_value_(parameters, 0.5) + # LAAT hard code + if self.val_metric is not None: + return { + "optimizer": optimizer, + "lr_scheduler": { + "scheduler": optim.lr_scheduler.ReduceLROnPlateau( + optimizer, mode="max", + factor=0.9, + patience=5, + min_lr=0.0001), + "monitor": self.val_metric, + }, + } return optimizer def training_step(self, batch, batch_idx): diff --git a/libmultilabel/nn/networks/__init__.py b/libmultilabel/nn/networks/__init__.py index a5667856..4bc59d24 100644 --- a/libmultilabel/nn/networks/__init__.py +++ b/libmultilabel/nn/networks/__init__.py @@ -3,6 +3,7 @@ from .bert import BERT from .bert_attention import BERTAttention from .caml import CAML +from .laat import LAAT from .kim_cnn import KimCNN from .xml_cnn import XMLCNN from .labelwise_attention_networks import BiGRULWAN diff --git a/libmultilabel/nn/networks/laat.py b/libmultilabel/nn/networks/laat.py new file mode 100644 index 00000000..171e1f37 --- /dev/null +++ b/libmultilabel/nn/networks/laat.py @@ -0,0 +1,75 @@ +import torch +import torch.nn as nn + +from .modules import Embedding, LSTMEncoder + + +class LAAT(nn.Module): + """LAAT Vu. + + Args: + embed_vecs (torch.Tensor): The pre-trained word vectors of shape (vocab_size, embed_dim). + num_classes (int): Total number of classes. + filter_sizes (list): Size of convolutional filters. + num_filter_per_size (int): The number of filters in convolutional layers in each size. Defaults to 50. + dropout (float): The dropout rate of the word embedding. Defaults to 0.2. 
+ """ + + def __init__( + self, + embed_vecs, + num_classes, + rnn_dim=1024, + num_layers=1, + d_a=512, + embed_dropout=0.3, + encoder_dropout=0, + ): + super(LAAT, self).__init__() + + self.embedding = Embedding(embed_vecs, embed_dropout) + self.num_layers = num_layers + self.rnn_dim = rnn_dim + + # Initialize rnn layer (H: 2u * N) + self.encoder = LSTMEncoder( + input_size=embed_vecs.shape[1], + hidden_size=rnn_dim//2, num_layers=num_layers, dropout=encoder_dropout) + + mean = 0.0 + std = 0.3 + # first linear + # Z = tanh(WH), W: (d_a * 2u), H: (2u * N), Z: (d_a * N) + self.W = nn.Linear(rnn_dim, d_a, bias=False) + torch.nn.init.normal(self.W.weight, mean, std) + + """Context vectors for computing attention with + (in_features, out_features) = (d_a, num_classes) + """ + # second linear + # A = softmax(UZ), U: (|L| * d_a), Z: (d_a * N), A: |L| * N + self.Q = nn.Linear(d_a, num_classes, bias=False) + torch.nn.init.normal(self.Q.weight, mean, std) + + # Final layer: create a matrix to use for the #labels binary classifiers + self.output = nn.Linear(rnn_dim, num_classes, bias=True) + torch.nn.init.normal(self.output.weight, mean, std) + + def forward(self, input): + # Get embeddings and apply dropout + x = self.embedding(input['text']) # (batch_size, length, embed_dim) + + x = self.encoder(x, input['length']) # (batch_size, length, rnn_dim) + Z = torch.tanh(self.W(x)) # (batch_size, length, d_a) + + # (batch_size, class_num, length) + alpha = torch.softmax(self.Q.weight.matmul(Z.transpose(1, 2)), dim=2) + + # Document representations are weighted sums using the attention + E = alpha.matmul(x) + + # Compute a probability for each label + logits = self.output.weight.mul(E).sum(dim=2).add( + self.output.bias) # (batch_size, num_classes) + + return {'logits': logits, 'attention': alpha} diff --git a/libmultilabel/nn/networks/modules.py b/libmultilabel/nn/networks/modules.py index d41918ea..416f4c81 100644 --- a/libmultilabel/nn/networks/modules.py +++ b/libmultilabel/nn/networks/modules.py @@ -16,7 +16,7 @@ class Embedding(nn.Module): def __init__(self, embed_vecs, dropout=0.2): super(Embedding, self).__init__() - self.embedding = nn.Embedding.from_pretrained(embed_vecs, freeze=False, padding_idx=0) + self.embedding = nn.Embedding.from_pretrained(embed_vecs, freeze=True, padding_idx=0) # LAAT self.dropout = nn.Dropout(dropout) def forward(self, input): From 61e3d30a2bd9401ab3e6dd6c337a0bc8f18782b0 Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Sun, 21 May 2023 07:05:20 +0800 Subject: [PATCH 03/19] Backup print learning rate --- libmultilabel/nn/model.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libmultilabel/nn/model.py b/libmultilabel/nn/model.py index e0a8399d..c093db46 100644 --- a/libmultilabel/nn/model.py +++ b/libmultilabel/nn/model.py @@ -111,6 +111,10 @@ def validation_step_end(self, batch_parts): return self._shared_eval_step_end(batch_parts) def validation_epoch_end(self, step_outputs): + # print learning rate (LAAT) + lightning_optimizer = self.optimizers() + for param_group in lightning_optimizer.optimizer.param_groups: + print(f"\nLearning Rate: {param_group['lr']}\n") return self._shared_eval_epoch_end(step_outputs, "val") def test_step(self, batch, batch_idx): From 4d777ca5c7741aab9d14fefdae2303a5c6bf29a3 Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Thu, 25 May 2023 14:16:53 +0800 Subject: [PATCH 04/19] Update laat_tune config. 
(MIMICIV-ICD9-50) --- example_config/MIMICIV-ICD9-50/laat_tune.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/example_config/MIMICIV-ICD9-50/laat_tune.yml b/example_config/MIMICIV-ICD9-50/laat_tune.yml index cc03991c..95ded40f 100644 --- a/example_config/MIMICIV-ICD9-50/laat_tune.yml +++ b/example_config/MIMICIV-ICD9-50/laat_tune.yml @@ -14,7 +14,7 @@ seed: 0 epochs: 50 batch_size: 8 optimizer: adamw -learning_rate: 0.001 +learning_rate: ['grid_search', [0.001, 0.0003]] # LAAT: 0.001 momentum: 0.9 weight_decay: 0 patience: 6 @@ -32,10 +32,10 @@ loss_function: binary_cross_entropy_with_logits init_weight: kaiming_uniform network_config: embed_dropout: 0.3 - encoder_dropout: 0 - rnn_dim: ['grid_search', [512, 1024, 768, 256]] # 2u + encoder_dropout: ['grid_search', [0, 0.2, 0.4]] # LAAT: 0 + rnn_dim: ['grid_search', [512, 768, 1024]] # LAAT: 512 num_layers: 1 - d_a: ['grid_search', [256, 512, 384, 128]] + d_a: ['grid_search', [256, 384, 512]] # LAAT: 256 # pretrained vocab / embeddings vocab_file: data/MIMICIV-ICD9-50/vocab.csv # 282,173, min_vocab_freq=1 From d463d8af96e0db136d123d10e4ba01d003f0964d Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Thu, 25 May 2023 16:15:37 +0800 Subject: [PATCH 05/19] 0.3 to 0.03 ... bug!! --- libmultilabel/nn/networks/laat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libmultilabel/nn/networks/laat.py b/libmultilabel/nn/networks/laat.py index 171e1f37..864616de 100644 --- a/libmultilabel/nn/networks/laat.py +++ b/libmultilabel/nn/networks/laat.py @@ -37,7 +37,7 @@ def __init__( hidden_size=rnn_dim//2, num_layers=num_layers, dropout=encoder_dropout) mean = 0.0 - std = 0.3 + std = 0.03 # first linear # Z = tanh(WH), W: (d_a * 2u), H: (2u * N), Z: (d_a * N) self.W = nn.Linear(rnn_dim, d_a, bias=False) From f3502948f4189b184b097f7b7421a8e3ce26c7e3 Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Mon, 5 Jun 2023 04:02:43 +0800 Subject: [PATCH 06/19] Update LAAT. --- libmultilabel/nn/data_utils.py | 43 +++++++++++++++++++--------- libmultilabel/nn/model.py | 23 +++++++++++++-- libmultilabel/nn/networks/laat.py | 28 ++++++++++-------- libmultilabel/nn/networks/modules.py | 30 ++++++++++++------- libmultilabel/nn/nn_utils.py | 2 ++ torch_trainer.py | 8 +++++- 6 files changed, 95 insertions(+), 39 deletions(-) diff --git a/libmultilabel/nn/data_utils.py b/libmultilabel/nn/data_utils.py index 7c25c819..4bc4aa68 100644 --- a/libmultilabel/nn/data_utils.py +++ b/libmultilabel/nn/data_utils.py @@ -1,6 +1,7 @@ import csv import gc import logging +import random import warnings import pandas as pd @@ -82,6 +83,10 @@ def sort_by_length(self, data): # reverse for LAAT return sorted(data, key=lambda x: -len(x['text'][:self.max_seq_length])) + def shuffle_data(self): + # LAAT + random.shuffle(self.data) + def tokenize(text): """Tokenize text. 
@@ -145,7 +150,8 @@ def get_dataset_loader( dataset_loader = torch.utils.data.DataLoader( dataset, batch_size=batch_size, - shuffle=shuffle, + # shuffle=shuffle, + shuffle=False, # use TextDataset.shuffle_data() num_workers=data_workers, collate_fn=generate_batch, pin_memory="cuda" in device.type, @@ -374,7 +380,8 @@ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache=N vector_dict = {} for word_vector in tqdm(word_vectors, disable=silent): word, vector = word_vector.rstrip().split(" ", 1) - vector = torch.Tensor(list(map(float, vector.split()))) + # vector = torch.Tensor(list(map(float, vector.split()))) + vector = list(map(float, vector.split())) # LAAT vector_dict[word] = vector else: logging.info(f"Load pretrained embedding from torchtext.") @@ -387,16 +394,21 @@ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache=N vector_dict = pretrained_aliases[embed_file](cache=cache) embed_size = vector_dict.dim - embedding_weights = torch.zeros(len(word_dict), embed_size) - - if load_embedding_from_file: - # Add UNK embedding - # AttentionXML: np.random.uniform(-1.0, 1.0, embed_size) - # LAAT: np.random.uniform(-0.25, 0.25, embedding_size) - # CAML: np.random.randn(embed_size) - import numpy as np - unk_vector = torch.tensor(np.random.uniform(-0.25, 0.25, embed_size)) - embedding_weights[word_dict[UNK]] = unk_vector + # LAAT init: set default to UNK vector + import numpy as np + unk_vector = np.random.uniform(-0.25, 0.25, embed_size) + embedding_weights = [unk_vector] * (len(word_dict)) + embedding_weights[0] = np.zeros(embed_size) + + # embedding_weights = torch.zeros(len(word_dict), embed_size) + # if load_embedding_from_file: + # # Add UNK embedding + # # AttentionXML: np.random.uniform(-1.0, 1.0, embed_size) + # # LAAT: np.random.uniform(-0.25, 0.25, embedding_size) + # # CAML: np.random.randn(embed_size) + # import numpy as np + # unk_vector = torch.tensor(np.random.uniform(-0.25, 0.25, embed_size)) + # embedding_weights[word_dict[UNK]] = unk_vector # Store pretrained word embedding vec_counts = 0 @@ -410,4 +422,9 @@ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache=N logging.info(f"loaded {vec_counts}/{len(word_dict)} word embeddings") - return embedding_weights + # For resolving `UserWarning: Creating a tensor from a list of numpy.ndarrays is + # extremely slow. 
Please consider converting the list to a single numpy.ndarray + # with numpy.array() before converting to a tensor.` + embedding_weights = np.array(embedding_weights, dtype=np.float32) + return torch.FloatTensor(embedding_weights) # LAAT + # return embedding_weights diff --git a/libmultilabel/nn/model.py b/libmultilabel/nn/model.py index c093db46..50eb2e10 100644 --- a/libmultilabel/nn/model.py +++ b/libmultilabel/nn/model.py @@ -2,6 +2,7 @@ import numpy as np import pytorch_lightning as pl +from pytorch_lightning.utilities.types import EPOCH_OUTPUT import torch import torch.nn.functional as F import torch.optim as optim @@ -41,6 +42,7 @@ def __init__( silent=False, save_k_predictions=0, val_metric='Micro-F1', # LAAT + shuffle=True, **kwargs ): super().__init__() @@ -61,6 +63,8 @@ def __init__( top_k = 1 if self.multiclass else None self.eval_metric = get_metrics(metric_threshold, monitor_metrics, num_classes, top_k=top_k) self.val_metric = val_metric # LAAT + self.shuffle = shuffle # LAAT + self.num_classes = num_classes @abstractmethod def shared_step(self, batch): @@ -78,7 +82,11 @@ def configure_optimizers(self): elif optimizer_name == "adam": optimizer = optim.Adam(parameters, weight_decay=self.weight_decay, lr=self.learning_rate) elif optimizer_name == "adamw": - optimizer = optim.AdamW(parameters, weight_decay=self.weight_decay, lr=self.learning_rate) + # optimizer = optim.AdamW( + # parameters, weight_decay=self.weight_decay, lr=self.learning_rate) + from transformers import AdamW + optimizer = AdamW(filter(lambda p: p.requires_grad, self.parameters()), + lr=self.learning_rate, weight_decay=self.weight_decay) elif optimizer_name == "adamax": optimizer = optim.Adamax(parameters, weight_decay=self.weight_decay, lr=self.learning_rate) else: @@ -101,9 +109,20 @@ def configure_optimizers(self): return optimizer def training_step(self, batch, batch_idx): - loss, _ = self.shared_step(batch) + # LAAT + idx = torch.argsort(batch['length'], descending=True, stable=True) + sorted_batch = {k: v[idx] for k, v in batch.items()} + loss, _ = self.shared_step(sorted_batch) + loss = self.num_classes * loss # LAAT + loss = loss / self.num_classes return loss + def training_epoch_end(self, outputs: EPOCH_OUTPUT) -> None: + print(f'Reshuffling the data') + if self.shuffle: + self.trainer.train_dataloader.dataset.datasets.shuffle_data() + return super().training_epoch_end(outputs) + def validation_step(self, batch, batch_idx): return self._shared_eval_step(batch, batch_idx) diff --git a/libmultilabel/nn/networks/laat.py b/libmultilabel/nn/networks/laat.py index 864616de..79db83e4 100644 --- a/libmultilabel/nn/networks/laat.py +++ b/libmultilabel/nn/networks/laat.py @@ -24,10 +24,11 @@ def __init__( d_a=512, embed_dropout=0.3, encoder_dropout=0, + freeze_embed=True, ): super(LAAT, self).__init__() - self.embedding = Embedding(embed_vecs, embed_dropout) + self.embedding = Embedding(embed_vecs, embed_dropout, freeze_embed) self.num_layers = num_layers self.rnn_dim = rnn_dim @@ -39,34 +40,37 @@ def __init__( mean = 0.0 std = 0.03 # first linear - # Z = tanh(WH), W: (d_a * 2u), H: (2u * N), Z: (d_a * N) self.W = nn.Linear(rnn_dim, d_a, bias=False) - torch.nn.init.normal(self.W.weight, mean, std) """Context vectors for computing attention with (in_features, out_features) = (d_a, num_classes) """ - # second linear - # A = softmax(UZ), U: (|L| * d_a), Z: (d_a * N), A: |L| * N + # second linear (U in the paper) self.Q = nn.Linear(d_a, num_classes, bias=False) - torch.nn.init.normal(self.Q.weight, mean, std) # 
Final layer: create a matrix to use for the #labels binary classifiers self.output = nn.Linear(rnn_dim, num_classes, bias=True) - torch.nn.init.normal(self.output.weight, mean, std) + + torch.nn.init.normal_(self.W.weight, mean, std) + torch.nn.init.normal_(self.Q.weight, mean, std) + torch.nn.init.normal_(self.output.weight, mean, std) def forward(self, input): # Get embeddings and apply dropout x = self.embedding(input['text']) # (batch_size, length, embed_dim) + H = self.encoder(x, input['length']) # (batch_size, length, rnn_dim) - x = self.encoder(x, input['length']) # (batch_size, length, rnn_dim) - Z = torch.tanh(self.W(x)) # (batch_size, length, d_a) + # (4) Z = tanh(WH), W: (d_a * 2u), H: (2u * N), Z: (d_a * N) + Z = torch.tanh(self.W(H)) # (batch_size, length, d_a) - # (batch_size, class_num, length) - alpha = torch.softmax(self.Q.weight.matmul(Z.transpose(1, 2)), dim=2) + # (5) A = softmax(UZ), A: (batch_size, class_num, length) + # Q: (|L| * d_a), Z: (d_a * N), A: |L| * N + alpha = self.Q(Z) + alpha = torch.softmax(alpha, 1).transpose(1, 2) + # alpha = torch.softmax(self.Q.weight.matmul(Z.transpose(1, 2)), dim=2) # Document representations are weighted sums using the attention - E = alpha.matmul(x) + E = alpha.matmul(H) # Compute a probability for each label logits = self.output.weight.mul(E).sum(dim=2).add( diff --git a/libmultilabel/nn/networks/modules.py b/libmultilabel/nn/networks/modules.py index 416f4c81..22ec606e 100644 --- a/libmultilabel/nn/networks/modules.py +++ b/libmultilabel/nn/networks/modules.py @@ -4,6 +4,7 @@ import torch.nn as nn import torch.nn.functional as F from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence +import copy class Embedding(nn.Module): @@ -14,9 +15,14 @@ class Embedding(nn.Module): dropout (float): The dropout rate of the word embedding. Defaults to 0.2. 
""" - def __init__(self, embed_vecs, dropout=0.2): + def __init__(self, embed_vecs, dropout=0.2, freeze_embed=False): super(Embedding, self).__init__() - self.embedding = nn.Embedding.from_pretrained(embed_vecs, freeze=True, padding_idx=0) # LAAT + # self.embedding = nn.Embedding.from_pretrained( + # embed_vecs, freeze=freeze_embed, padding_idx=0) # LAAT + + self.embedding = nn.Embedding(embed_vecs.shape[0], embed_vecs.shape[1]) + self.embedding.weight = nn.Parameter(copy.deepcopy( + embed_vecs), requires_grad=False) self.dropout = nn.Dropout(dropout) def forward(self, input): @@ -35,19 +41,20 @@ class RNNEncoder(ABC, nn.Module): def __init__(self, input_size, hidden_size, num_layers, dropout=0): super(RNNEncoder, self).__init__() - self.rnn = self._get_rnn(input_size, hidden_size, num_layers) - self.dropout = nn.Dropout(dropout) + # self.rnn = self._get_rnn(input_size, hidden_size, num_layers) + # self.dropout = nn.Dropout(dropout) + self.rnn = self._get_rnn(input_size, hidden_size, num_layers, dropout) def forward(self, input, length, **kwargs): self.rnn.flatten_parameters() - idx = torch.argsort(length, descending=True) + idx = torch.argsort(length, descending=True, stable=True) length_clamped = length[idx].cpu().clamp(min=1) # avoid the empty text with length 0 packed_input = pack_padded_sequence(input[idx], length_clamped, batch_first=True) outputs, _ = pad_packed_sequence(self.rnn(packed_input)[0], batch_first=True) - return self.dropout(outputs[torch.argsort(idx)]) + return outputs[torch.argsort(idx)] @abstractmethod - def _get_rnn(self, input_size, hidden_size, num_layers): + def _get_rnn(self, input_size, hidden_size, num_layers, dropout): raise NotImplementedError @@ -64,8 +71,8 @@ class GRUEncoder(RNNEncoder): def __init__(self, input_size, hidden_size, num_layers, dropout=0): super(GRUEncoder, self).__init__(input_size, hidden_size, num_layers, dropout) - def _get_rnn(self, input_size, hidden_size, num_layers): - return nn.GRU(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True) + def _get_rnn(self, input_size, hidden_size, num_layers, dropout): + return nn.GRU(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True, dropout=dropout) class LSTMEncoder(RNNEncoder): @@ -81,8 +88,9 @@ class LSTMEncoder(RNNEncoder): def __init__(self, input_size, hidden_size, num_layers, dropout=0): super(LSTMEncoder, self).__init__(input_size, hidden_size, num_layers, dropout) - def _get_rnn(self, input_size, hidden_size, num_layers): - return nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True) + def _get_rnn(self, input_size, hidden_size, num_layers, dropout): + return nn.LSTM(input_size, hidden_size, num_layers, bidirectional=True, dropout=dropout) + # return nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True) class CNNEncoder(nn.Module): diff --git a/libmultilabel/nn/nn_utils.py b/libmultilabel/nn/nn_utils.py index 8eabf02c..924731be 100644 --- a/libmultilabel/nn/nn_utils.py +++ b/libmultilabel/nn/nn_utils.py @@ -52,6 +52,7 @@ def init_model( loss_function="binary_cross_entropy_with_logits", silent=False, save_k_predictions=0, + shuffle=False, # LAAT ): """Initialize a `Model` class for initializing and training a neural network. 
@@ -108,6 +109,7 @@ def init_model( loss_function=loss_function, silent=silent, save_k_predictions=save_k_predictions, + shuffle=shuffle, # LAAT ) return model diff --git a/torch_trainer.py b/torch_trainer.py index f37706a9..e8dbffbf 100644 --- a/torch_trainer.py +++ b/torch_trainer.py @@ -124,7 +124,10 @@ def _setup_model( if self.config.embed_file is not None: logging.info("Load word dictionary ") word_dict, embed_vecs = data_utils.load_or_build_text_dict( - dataset=self.datasets["train"], + # dataset=self.datasets["train"], + # LAAT + dataset=self.datasets["train"] + \ + self.datasets["val"] + self.datasets["test"], vocab_file=self.config.vocab_file, min_vocab_freq=self.config.min_vocab_freq, embed_file=self.config.embed_file, @@ -169,6 +172,7 @@ def _setup_model( loss_function=self.config.loss_function, silent=self.config.silent, save_k_predictions=self.config.save_k_predictions, + shuffle=self.config.shuffle, # LAAT ) def _get_dataset_loader(self, split, shuffle=False): @@ -208,6 +212,8 @@ def train(self): self.trainer.fit(self.model, train_loader) else: val_loader = self._get_dataset_loader(split="val") + if self.config.shuffle: + train_loader.dataset.shuffle_data() self.trainer.fit(self.model, train_loader, val_loader) # Set model to the best model. If the validation process is skipped during From c8e4c9e29c695afc187a86e30f5d0bd7ba66264b Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Fri, 30 Jun 2023 14:27:18 +0800 Subject: [PATCH 07/19] Remove unused settings --- libmultilabel/nn/data_utils.py | 49 ++++++++-------------------- libmultilabel/nn/model.py | 22 ++----------- libmultilabel/nn/networks/laat.py | 13 ++++---- libmultilabel/nn/networks/modules.py | 9 ++--- libmultilabel/nn/nn_utils.py | 2 -- torch_trainer.py | 11 +++---- 6 files changed, 30 insertions(+), 76 deletions(-) diff --git a/libmultilabel/nn/data_utils.py b/libmultilabel/nn/data_utils.py index 4bc4aa68..006ed241 100644 --- a/libmultilabel/nn/data_utils.py +++ b/libmultilabel/nn/data_utils.py @@ -1,7 +1,6 @@ import csv import gc import logging -import random import warnings import pandas as pd @@ -46,13 +45,12 @@ def __init__( tokenizer=None, word_dict=None, ): - # self.data = data + self.data = data self.classes = classes self.max_seq_length = max_seq_length self.word_dict = word_dict self.tokenizer = tokenizer self.add_special_tokens = add_special_tokens - self.data = self.sort_by_length(data) # LAAT self.num_classes = len(self.classes) self.label_binarizer = MultiLabelBinarizer().fit([classes]) @@ -79,14 +77,6 @@ def __getitem__(self, index): "label": torch.IntTensor(self.label_binarizer.transform([data["label"]])[0]), } - def sort_by_length(self, data): - # reverse for LAAT - return sorted(data, key=lambda x: -len(x['text'][:self.max_seq_length])) - - def shuffle_data(self): - # LAAT - random.shuffle(self.data) - def tokenize(text): """Tokenize text. 
@@ -150,8 +140,7 @@ def get_dataset_loader( dataset_loader = torch.utils.data.DataLoader( dataset, batch_size=batch_size, - # shuffle=shuffle, - shuffle=False, # use TextDataset.shuffle_data() + shuffle=shuffle, num_workers=data_workers, collate_fn=generate_batch, pin_memory="cuda" in device.type, @@ -380,8 +369,7 @@ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache=N vector_dict = {} for word_vector in tqdm(word_vectors, disable=silent): word, vector = word_vector.rstrip().split(" ", 1) - # vector = torch.Tensor(list(map(float, vector.split()))) - vector = list(map(float, vector.split())) # LAAT + vector = torch.Tensor(list(map(float, vector.split()))) vector_dict[word] = vector else: logging.info(f"Load pretrained embedding from torchtext.") @@ -394,21 +382,15 @@ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache=N vector_dict = pretrained_aliases[embed_file](cache=cache) embed_size = vector_dict.dim - # LAAT init: set default to UNK vector - import numpy as np - unk_vector = np.random.uniform(-0.25, 0.25, embed_size) - embedding_weights = [unk_vector] * (len(word_dict)) - embedding_weights[0] = np.zeros(embed_size) - - # embedding_weights = torch.zeros(len(word_dict), embed_size) - # if load_embedding_from_file: - # # Add UNK embedding - # # AttentionXML: np.random.uniform(-1.0, 1.0, embed_size) - # # LAAT: np.random.uniform(-0.25, 0.25, embedding_size) - # # CAML: np.random.randn(embed_size) - # import numpy as np - # unk_vector = torch.tensor(np.random.uniform(-0.25, 0.25, embed_size)) - # embedding_weights[word_dict[UNK]] = unk_vector + embedding_weights = torch.zeros(len(word_dict), embed_size) + + if load_embedding_from_file: + # Add UNK embedding + # AttentionXML: np.random.uniform(-1.0, 1.0, embed_size) + # LAAT: np.random.uniform(-0.25, 0.25, embedding_size) + # CAML: np.random.randn(embed_size) + unk_vector = torch.randn(embed_size) + embedding_weights[word_dict[UNK]] = unk_vector # Store pretrained word embedding vec_counts = 0 @@ -422,9 +404,4 @@ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache=N logging.info(f"loaded {vec_counts}/{len(word_dict)} word embeddings") - # For resolving `UserWarning: Creating a tensor from a list of numpy.ndarrays is - # extremely slow. 
Please consider converting the list to a single numpy.ndarray - # with numpy.array() before converting to a tensor.` - embedding_weights = np.array(embedding_weights, dtype=np.float32) - return torch.FloatTensor(embedding_weights) # LAAT - # return embedding_weights + return embedding_weights diff --git a/libmultilabel/nn/model.py b/libmultilabel/nn/model.py index 50eb2e10..fb9290d9 100644 --- a/libmultilabel/nn/model.py +++ b/libmultilabel/nn/model.py @@ -2,7 +2,6 @@ import numpy as np import pytorch_lightning as pl -from pytorch_lightning.utilities.types import EPOCH_OUTPUT import torch import torch.nn.functional as F import torch.optim as optim @@ -92,8 +91,8 @@ def configure_optimizers(self): else: raise RuntimeError("Unsupported optimizer: {self.optimizer}") - # torch.nn.utils.clip_grad_value_(parameters, 0.5) - # LAAT hard code + torch.nn.utils.clip_grad_value_(parameters, 0.5) + # LAAT hard code (Shao-Syuan) if self.val_metric is not None: return { "optimizer": optimizer, @@ -109,20 +108,9 @@ def configure_optimizers(self): return optimizer def training_step(self, batch, batch_idx): - # LAAT - idx = torch.argsort(batch['length'], descending=True, stable=True) - sorted_batch = {k: v[idx] for k, v in batch.items()} - loss, _ = self.shared_step(sorted_batch) - loss = self.num_classes * loss # LAAT - loss = loss / self.num_classes + loss, _ = self.shared_step(batch) return loss - def training_epoch_end(self, outputs: EPOCH_OUTPUT) -> None: - print(f'Reshuffling the data') - if self.shuffle: - self.trainer.train_dataloader.dataset.datasets.shuffle_data() - return super().training_epoch_end(outputs) - def validation_step(self, batch, batch_idx): return self._shared_eval_step(batch, batch_idx) @@ -130,10 +118,6 @@ def validation_step_end(self, batch_parts): return self._shared_eval_step_end(batch_parts) def validation_epoch_end(self, step_outputs): - # print learning rate (LAAT) - lightning_optimizer = self.optimizers() - for param_group in lightning_optimizer.optimizer.param_groups: - print(f"\nLearning Rate: {param_group['lr']}\n") return self._shared_eval_epoch_end(step_outputs, "val") def test_step(self, batch, batch_idx): diff --git a/libmultilabel/nn/networks/laat.py b/libmultilabel/nn/networks/laat.py index 79db83e4..a995f27b 100644 --- a/libmultilabel/nn/networks/laat.py +++ b/libmultilabel/nn/networks/laat.py @@ -28,7 +28,8 @@ def __init__( ): super(LAAT, self).__init__() - self.embedding = Embedding(embed_vecs, embed_dropout, freeze_embed) + self.embedding = Embedding( + embed_vecs, dropout=embed_dropout, freeze_embed=freeze_embed) self.num_layers = num_layers self.rnn_dim = rnn_dim @@ -37,8 +38,8 @@ def __init__( input_size=embed_vecs.shape[1], hidden_size=rnn_dim//2, num_layers=num_layers, dropout=encoder_dropout) - mean = 0.0 - std = 0.03 + # mean = 0.0 + # std = 0.03 # first linear self.W = nn.Linear(rnn_dim, d_a, bias=False) @@ -51,9 +52,9 @@ def __init__( # Final layer: create a matrix to use for the #labels binary classifiers self.output = nn.Linear(rnn_dim, num_classes, bias=True) - torch.nn.init.normal_(self.W.weight, mean, std) - torch.nn.init.normal_(self.Q.weight, mean, std) - torch.nn.init.normal_(self.output.weight, mean, std) + # torch.nn.init.normal_(self.W.weight, mean, std) + # torch.nn.init.normal_(self.Q.weight, mean, std) + # torch.nn.init.normal_(self.output.weight, mean, std) def forward(self, input): # Get embeddings and apply dropout diff --git a/libmultilabel/nn/networks/modules.py b/libmultilabel/nn/networks/modules.py index 22ec606e..b900045e 100644 
--- a/libmultilabel/nn/networks/modules.py +++ b/libmultilabel/nn/networks/modules.py @@ -17,12 +17,8 @@ class Embedding(nn.Module): def __init__(self, embed_vecs, dropout=0.2, freeze_embed=False): super(Embedding, self).__init__() - # self.embedding = nn.Embedding.from_pretrained( - # embed_vecs, freeze=freeze_embed, padding_idx=0) # LAAT - - self.embedding = nn.Embedding(embed_vecs.shape[0], embed_vecs.shape[1]) - self.embedding.weight = nn.Parameter(copy.deepcopy( - embed_vecs), requires_grad=False) + self.embedding = nn.Embedding.from_pretrained( + embed_vecs, freeze=freeze_embed, padding_idx=0) self.dropout = nn.Dropout(dropout) def forward(self, input): @@ -41,6 +37,7 @@ class RNNEncoder(ABC, nn.Module): def __init__(self, input_size, hidden_size, num_layers, dropout=0): super(RNNEncoder, self).__init__() + # Li-Chung: PR316 # self.rnn = self._get_rnn(input_size, hidden_size, num_layers) # self.dropout = nn.Dropout(dropout) self.rnn = self._get_rnn(input_size, hidden_size, num_layers, dropout) diff --git a/libmultilabel/nn/nn_utils.py b/libmultilabel/nn/nn_utils.py index 924731be..8eabf02c 100644 --- a/libmultilabel/nn/nn_utils.py +++ b/libmultilabel/nn/nn_utils.py @@ -52,7 +52,6 @@ def init_model( loss_function="binary_cross_entropy_with_logits", silent=False, save_k_predictions=0, - shuffle=False, # LAAT ): """Initialize a `Model` class for initializing and training a neural network. @@ -109,7 +108,6 @@ def init_model( loss_function=loss_function, silent=silent, save_k_predictions=save_k_predictions, - shuffle=shuffle, # LAAT ) return model diff --git a/torch_trainer.py b/torch_trainer.py index e8dbffbf..1035d148 100644 --- a/torch_trainer.py +++ b/torch_trainer.py @@ -124,10 +124,10 @@ def _setup_model( if self.config.embed_file is not None: logging.info("Load word dictionary ") word_dict, embed_vecs = data_utils.load_or_build_text_dict( - # dataset=self.datasets["train"], - # LAAT - dataset=self.datasets["train"] + \ - self.datasets["val"] + self.datasets["test"], + dataset=self.datasets["train"], + # # LAAT + # dataset=self.datasets["train"] + \ + # self.datasets["val"] + self.datasets["test"], vocab_file=self.config.vocab_file, min_vocab_freq=self.config.min_vocab_freq, embed_file=self.config.embed_file, @@ -172,7 +172,6 @@ def _setup_model( loss_function=self.config.loss_function, silent=self.config.silent, save_k_predictions=self.config.save_k_predictions, - shuffle=self.config.shuffle, # LAAT ) def _get_dataset_loader(self, split, shuffle=False): @@ -212,8 +211,6 @@ def train(self): self.trainer.fit(self.model, train_loader) else: val_loader = self._get_dataset_loader(split="val") - if self.config.shuffle: - train_loader.dataset.shuffle_data() self.trainer.fit(self.model, train_loader, val_loader) # Set model to the best model. If the validation process is skipped during From 1bd97c760b296123d45c9707db2af7735518990a Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Sun, 2 Jul 2023 19:46:27 +0800 Subject: [PATCH 08/19] Add eps for Adam --- libmultilabel/nn/model.py | 21 ++++++++++----------- libmultilabel/nn/nn_utils.py | 3 +++ main.py | 3 +++ torch_trainer.py | 1 + 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/libmultilabel/nn/model.py b/libmultilabel/nn/model.py index fb9290d9..dba0194f 100644 --- a/libmultilabel/nn/model.py +++ b/libmultilabel/nn/model.py @@ -19,6 +19,7 @@ class MultiLabelModel(pl.LightningModule): optimizer (str, optional): Optimizer name (i.e., sgd, adam, or adamw). Defaults to 'adam'. 
momentum (float, optional): Momentum factor for SGD only. Defaults to 0.9. weight_decay (int, optional): Weight decay factor. Defaults to 0. + eps (float, optional): Epsilon of Adam-based optimizer (e.g., adam, adamw, or adamax). Defaults to 1e-08. metric_threshold (float, optional): The decision value threshold over which a label is predicted as positive. Defaults to 0.5. monitor_metrics (list, optional): Metrics to monitor while validating. Defaults to None. log_path (str): Path to a directory holding the log files and models. @@ -34,14 +35,14 @@ def __init__( optimizer="adam", momentum=0.9, weight_decay=0, + eps=1e-08, metric_threshold=0.5, monitor_metrics=None, log_path=None, multiclass=False, silent=False, save_k_predictions=0, - val_metric='Micro-F1', # LAAT - shuffle=True, + val_metric='Micro-F1', # LAAT (remove after PR317 merged to master) **kwargs ): super().__init__() @@ -51,6 +52,7 @@ def __init__( self.optimizer = optimizer self.momentum = momentum self.weight_decay = weight_decay + self.eps = eps # dump log self.log_path = log_path @@ -61,8 +63,7 @@ def __init__( self.multiclass = multiclass top_k = 1 if self.multiclass else None self.eval_metric = get_metrics(metric_threshold, monitor_metrics, num_classes, top_k=top_k) - self.val_metric = val_metric # LAAT - self.shuffle = shuffle # LAAT + self.val_metric = val_metric # LAAT (remove after PR317 merged to master) self.num_classes = num_classes @abstractmethod @@ -79,15 +80,13 @@ def configure_optimizers(self): parameters, self.learning_rate, momentum=self.momentum, weight_decay=self.weight_decay ) elif optimizer_name == "adam": - optimizer = optim.Adam(parameters, weight_decay=self.weight_decay, lr=self.learning_rate) + optimizer = optim.Adam(parameters, weight_decay=self.weight_decay, eps=self.eps, lr=self.learning_rate) elif optimizer_name == "adamw": - # optimizer = optim.AdamW( - # parameters, weight_decay=self.weight_decay, lr=self.learning_rate) - from transformers import AdamW - optimizer = AdamW(filter(lambda p: p.requires_grad, self.parameters()), - lr=self.learning_rate, weight_decay=self.weight_decay) + optimizer = optim.AdamW( + parameters, weight_decay=self.weight_decay, eps=self.eps, lr=self.learning_rate) elif optimizer_name == "adamax": - optimizer = optim.Adamax(parameters, weight_decay=self.weight_decay, lr=self.learning_rate) + optimizer = optim.Adamax( + parameters, weight_decay=self.weight_decay, eps=self.eps, lr=self.learning_rate) else: raise RuntimeError("Unsupported optimizer: {self.optimizer}") diff --git a/libmultilabel/nn/nn_utils.py b/libmultilabel/nn/nn_utils.py index 8eabf02c..e5ac9f99 100644 --- a/libmultilabel/nn/nn_utils.py +++ b/libmultilabel/nn/nn_utils.py @@ -46,6 +46,7 @@ def init_model( optimizer="adam", momentum=0.9, weight_decay=0, + eps=1e-08, metric_threshold=0.5, monitor_metrics=None, multiclass=False, @@ -71,6 +72,7 @@ def init_model( optimizer (str, optional): Optimizer name (i.e., sgd, adam, or adamw). Defaults to 'adam'. momentum (float, optional): Momentum factor for SGD only. Defaults to 0.9. weight_decay (int, optional): Weight decay factor. Defaults to 0. + eps (float, optional): Epsilon of Adam-based optimizer (e.g., adam, adamw, or adamax). Defaults to 1e-08. metric_threshold (float, optional): The decision value threshold over which a label is predicted as positive. Defaults to 0.5. monitor_metrics (list, optional): Metrics to monitor while validating. Defaults to None. multiclass (bool, optional): Enable multiclass mode. Defaults to False. 
@@ -102,6 +104,7 @@ def init_model( optimizer=optimizer, momentum=momentum, weight_decay=weight_decay, + eps=eps, metric_threshold=metric_threshold, monitor_metrics=monitor_metrics, multiclass=multiclass, diff --git a/main.py b/main.py index e45c6942..ef31bb0b 100644 --- a/main.py +++ b/main.py @@ -86,6 +86,9 @@ def add_all_arguments(parser): parser.add_argument( "--momentum", type=float, default=0.9, help="Momentum factor for SGD only (default: %(default)s)" ) + parser.add_argument( + "--eps", type=float, default=1e-08, help="Epsilon of Adam-based optimizer (e.g., adam, adamw, or adamax) (default: %(default)s)" + ) parser.add_argument( "--patience", type=int, diff --git a/torch_trainer.py b/torch_trainer.py index 1035d148..e1f82c01 100644 --- a/torch_trainer.py +++ b/torch_trainer.py @@ -166,6 +166,7 @@ def _setup_model( optimizer=self.config.optimizer, momentum=self.config.momentum, weight_decay=self.config.weight_decay, + eps=self.config.eps, metric_threshold=self.config.metric_threshold, monitor_metrics=self.config.monitor_metrics, multiclass=self.config.multiclass, From 8f41aeea2eabc13006d5eb675170370ec3d3b86e Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Sun, 2 Jul 2023 19:50:38 +0800 Subject: [PATCH 09/19] Add eps to laat configs. --- example_config/MIMICIV-ICD10-50/laat.yml | 1 + example_config/MIMICIV-ICD10-50/laat_tune.yml | 1 + example_config/MIMICIV-ICD9-50/laat.yml | 1 + example_config/MIMICIV-ICD9-50/laat_tune.yml | 1 + 4 files changed, 4 insertions(+) diff --git a/example_config/MIMICIV-ICD10-50/laat.yml b/example_config/MIMICIV-ICD10-50/laat.yml index b9ee7d22..d9800bb1 100644 --- a/example_config/MIMICIV-ICD10-50/laat.yml +++ b/example_config/MIMICIV-ICD10-50/laat.yml @@ -15,6 +15,7 @@ batch_size: 8 optimizer: adamw learning_rate: 0.001 weight_decay: 0 +eps: 1e-06 patience: 6 shuffle: true diff --git a/example_config/MIMICIV-ICD10-50/laat_tune.yml b/example_config/MIMICIV-ICD10-50/laat_tune.yml index 558ac3be..ad3b60ce 100644 --- a/example_config/MIMICIV-ICD10-50/laat_tune.yml +++ b/example_config/MIMICIV-ICD10-50/laat_tune.yml @@ -17,6 +17,7 @@ optimizer: adamw learning_rate: 0.001 momentum: 0.9 weight_decay: 0 +eps: 1e-06 patience: 6 early_stopping_metric: Micro-F1 shuffle: true diff --git a/example_config/MIMICIV-ICD9-50/laat.yml b/example_config/MIMICIV-ICD9-50/laat.yml index 0fd281b3..d795ce50 100644 --- a/example_config/MIMICIV-ICD9-50/laat.yml +++ b/example_config/MIMICIV-ICD9-50/laat.yml @@ -15,6 +15,7 @@ batch_size: 8 optimizer: adamw learning_rate: 0.001 weight_decay: 0 +eps: 1e-06 patience: 6 shuffle: true diff --git a/example_config/MIMICIV-ICD9-50/laat_tune.yml b/example_config/MIMICIV-ICD9-50/laat_tune.yml index 95ded40f..89c8d6dd 100644 --- a/example_config/MIMICIV-ICD9-50/laat_tune.yml +++ b/example_config/MIMICIV-ICD9-50/laat_tune.yml @@ -17,6 +17,7 @@ optimizer: adamw learning_rate: ['grid_search', [0.001, 0.0003]] # LAAT: 0.001 momentum: 0.9 weight_decay: 0 +eps: 1e-06 patience: 6 early_stopping_metric: Micro-F1 shuffle: true From 0a180953f6fadbcd7b936eed9631a038523a48f7 Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Sun, 2 Jul 2023 19:59:33 +0800 Subject: [PATCH 10/19] Clean up LAAT. 
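
The commented-out normal(mean=0.0, std=0.03) initialization experiments are removed
from laat.py, and init_weight is set to null in the example configs. The output layer
is unchanged: one binary classifier per label is applied to that label's own document
vector E[:, l, :]. A purely illustrative reading of the mul/sum used in forward(),
with assumed shapes rather than config values:

    import torch
    import torch.nn as nn

    rnn_dim, num_classes = 512, 50                         # illustrative sizes only
    output = nn.Linear(rnn_dim, num_classes, bias=True)    # final per-label classifiers
    E = torch.randn(8, num_classes, rnn_dim)               # label-wise document vectors

    logits = output.weight.mul(E).sum(dim=2).add(output.bias)           # (batch, num_classes)
    same = torch.einsum("blh,lh->bl", E, output.weight) + output.bias   # equivalent per-label dot product
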
--- example_config/MIMICIV-ICD10-50/laat.yml | 2 +- example_config/MIMICIV-ICD10-50/laat_tune.yml | 2 +- example_config/MIMICIV-ICD9-50/laat.yml | 2 +- example_config/MIMICIV-ICD9-50/laat_tune.yml | 2 +- libmultilabel/nn/networks/laat.py | 12 ++---------- 5 files changed, 6 insertions(+), 14 deletions(-) diff --git a/example_config/MIMICIV-ICD10-50/laat.yml b/example_config/MIMICIV-ICD10-50/laat.yml index d9800bb1..1e82582e 100644 --- a/example_config/MIMICIV-ICD10-50/laat.yml +++ b/example_config/MIMICIV-ICD10-50/laat.yml @@ -26,7 +26,7 @@ val_metric: Micro-F1 # model model_name: LAAT -init_weight: kaiming_uniform +init_weight: null network_config: embed_dropout: 0.3 encoder_dropout: 0 diff --git a/example_config/MIMICIV-ICD10-50/laat_tune.yml b/example_config/MIMICIV-ICD10-50/laat_tune.yml index ad3b60ce..55e1c48e 100644 --- a/example_config/MIMICIV-ICD10-50/laat_tune.yml +++ b/example_config/MIMICIV-ICD10-50/laat_tune.yml @@ -30,7 +30,7 @@ val_metric: Micro-F1 # model model_name: LAAT loss_function: binary_cross_entropy_with_logits -init_weight: kaiming_uniform +init_weight: null network_config: embed_dropout: 0.3 encoder_dropout: 0 diff --git a/example_config/MIMICIV-ICD9-50/laat.yml b/example_config/MIMICIV-ICD9-50/laat.yml index d795ce50..0ef83638 100644 --- a/example_config/MIMICIV-ICD9-50/laat.yml +++ b/example_config/MIMICIV-ICD9-50/laat.yml @@ -26,7 +26,7 @@ val_metric: Micro-F1 # model model_name: LAAT -init_weight: kaiming_uniform +init_weight: null network_config: embed_dropout: 0.3 encoder_dropout: 0 diff --git a/example_config/MIMICIV-ICD9-50/laat_tune.yml b/example_config/MIMICIV-ICD9-50/laat_tune.yml index 89c8d6dd..4c0dd79a 100644 --- a/example_config/MIMICIV-ICD9-50/laat_tune.yml +++ b/example_config/MIMICIV-ICD9-50/laat_tune.yml @@ -30,7 +30,7 @@ val_metric: Micro-F1 # model model_name: LAAT loss_function: binary_cross_entropy_with_logits -init_weight: kaiming_uniform +init_weight: null network_config: embed_dropout: 0.3 encoder_dropout: ['grid_search', [0, 0.2, 0.4]] # LAAT: 0 diff --git a/libmultilabel/nn/networks/laat.py b/libmultilabel/nn/networks/laat.py index a995f27b..1af8b8f9 100644 --- a/libmultilabel/nn/networks/laat.py +++ b/libmultilabel/nn/networks/laat.py @@ -38,33 +38,25 @@ def __init__( input_size=embed_vecs.shape[1], hidden_size=rnn_dim//2, num_layers=num_layers, dropout=encoder_dropout) - # mean = 0.0 - # std = 0.03 - # first linear self.W = nn.Linear(rnn_dim, d_a, bias=False) """Context vectors for computing attention with (in_features, out_features) = (d_a, num_classes) """ - # second linear (U in the paper) self.Q = nn.Linear(d_a, num_classes, bias=False) # Final layer: create a matrix to use for the #labels binary classifiers self.output = nn.Linear(rnn_dim, num_classes, bias=True) - # torch.nn.init.normal_(self.W.weight, mean, std) - # torch.nn.init.normal_(self.Q.weight, mean, std) - # torch.nn.init.normal_(self.output.weight, mean, std) - def forward(self, input): # Get embeddings and apply dropout x = self.embedding(input['text']) # (batch_size, length, embed_dim) H = self.encoder(x, input['length']) # (batch_size, length, rnn_dim) - # (4) Z = tanh(WH), W: (d_a * 2u), H: (2u * N), Z: (d_a * N) + # Equation (4) Z = tanh(WH), W: (d_a * 2u), H: (2u * N), Z: (d_a * N) Z = torch.tanh(self.W(H)) # (batch_size, length, d_a) - # (5) A = softmax(UZ), A: (batch_size, class_num, length) + # Equation (5) A = softmax(UZ), A: (batch_size, class_num, length) # Q: (|L| * d_a), Z: (d_a * N), A: |L| * N alpha = self.Q(Z) alpha = torch.softmax(alpha, 
1).transpose(1, 2) From 6b97f832222f650a5ee3fc537f427942451bf029 Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Thu, 6 Jul 2023 23:45:44 +0800 Subject: [PATCH 11/19] Reduce LAAT changes (use CAML implementation), train+val as discussed --- libmultilabel/nn/networks/laat.py | 6 +++--- torch_trainer.py | 7 +++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/libmultilabel/nn/networks/laat.py b/libmultilabel/nn/networks/laat.py index 1af8b8f9..bbe9ad23 100644 --- a/libmultilabel/nn/networks/laat.py +++ b/libmultilabel/nn/networks/laat.py @@ -58,9 +58,9 @@ def forward(self, input): # Equation (5) A = softmax(UZ), A: (batch_size, class_num, length) # Q: (|L| * d_a), Z: (d_a * N), A: |L| * N - alpha = self.Q(Z) - alpha = torch.softmax(alpha, 1).transpose(1, 2) - # alpha = torch.softmax(self.Q.weight.matmul(Z.transpose(1, 2)), dim=2) + # alpha = self.Q(Z) + # alpha = torch.softmax(alpha, 1).transpose(1, 2) + alpha = torch.softmax(self.Q.weight.matmul(Z.transpose(1, 2)), dim=2) # Document representations are weighted sums using the attention E = alpha.matmul(H) diff --git a/torch_trainer.py b/torch_trainer.py index d2fe0e90..e7e2a11c 100644 --- a/torch_trainer.py +++ b/torch_trainer.py @@ -124,10 +124,9 @@ def _setup_model( if self.config.embed_file is not None: logging.info("Load word dictionary ") word_dict, embed_vecs = data_utils.load_or_build_text_dict( - dataset=self.datasets["train"], - # # LAAT - # dataset=self.datasets["train"] + \ - # self.datasets["val"] + self.datasets["test"], + # add vocab in the validation set + # CAML: train, LAAT: train, val, and test + dataset=self.datasets["train"] + self.datasets.get("val", []), vocab_file=self.config.vocab_file, min_vocab_freq=self.config.min_vocab_freq, embed_file=self.config.embed_file, From 4f8b86a2fed4454f1cc62b46f5dd0280ea4a6a01 Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Fri, 7 Jul 2023 00:38:38 +0800 Subject: [PATCH 12/19] Remove unused variables. 
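
This drops the leftover copy import, the unused self.num_classes attribute, and the
stable-argsort experiment, reverting the RNN encoder to a batch-first bidirectional
LSTM with the usual pack/pad round-trip: sort by length, pack, run the LSTM, pad back,
and restore the original order. A rough stand-alone sketch of that pattern, using a
plain nn.LSTM with made-up sizes instead of the actual LSTMEncoder class:

    import torch
    import torch.nn as nn
    from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

    rnn = nn.LSTM(100, 256, num_layers=1, batch_first=True, bidirectional=True)
    x = torch.randn(8, 500, 100)                       # (batch, length, embed_dim)
    length = torch.randint(1, 500, (8,))               # token counts per document

    idx = torch.argsort(length, descending=True)       # sort by length for packing
    packed = pack_padded_sequence(x[idx], length[idx].cpu().clamp(min=1), batch_first=True)
    out, _ = pad_packed_sequence(rnn(packed)[0], batch_first=True)
    out = out[torch.argsort(idx)]                      # restore the original order
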
--- libmultilabel/nn/model.py | 1 - libmultilabel/nn/networks/modules.py | 6 ++---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/libmultilabel/nn/model.py b/libmultilabel/nn/model.py index 1b00a786..c450eb4f 100644 --- a/libmultilabel/nn/model.py +++ b/libmultilabel/nn/model.py @@ -70,7 +70,6 @@ def __init__( self.multiclass = multiclass top_k = 1 if self.multiclass else None self.eval_metric = get_metrics(metric_threshold, monitor_metrics, num_classes, top_k=top_k) - self.num_classes = num_classes @abstractmethod def shared_step(self, batch): diff --git a/libmultilabel/nn/networks/modules.py b/libmultilabel/nn/networks/modules.py index b900045e..4a9c1a3c 100644 --- a/libmultilabel/nn/networks/modules.py +++ b/libmultilabel/nn/networks/modules.py @@ -4,7 +4,6 @@ import torch.nn as nn import torch.nn.functional as F from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence -import copy class Embedding(nn.Module): @@ -44,7 +43,7 @@ def __init__(self, input_size, hidden_size, num_layers, dropout=0): def forward(self, input, length, **kwargs): self.rnn.flatten_parameters() - idx = torch.argsort(length, descending=True, stable=True) + idx = torch.argsort(length, descending=True) length_clamped = length[idx].cpu().clamp(min=1) # avoid the empty text with length 0 packed_input = pack_padded_sequence(input[idx], length_clamped, batch_first=True) outputs, _ = pad_packed_sequence(self.rnn(packed_input)[0], batch_first=True) @@ -86,8 +85,7 @@ def __init__(self, input_size, hidden_size, num_layers, dropout=0): super(LSTMEncoder, self).__init__(input_size, hidden_size, num_layers, dropout) def _get_rnn(self, input_size, hidden_size, num_layers, dropout): - return nn.LSTM(input_size, hidden_size, num_layers, bidirectional=True, dropout=dropout) - # return nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True) + return nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True, dropout=dropout) class CNNEncoder(nn.Module): From 2e271a97e487ce414455a56b3b4510c49e21034e Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Fri, 7 Jul 2023 16:23:06 +0800 Subject: [PATCH 13/19] Update MIMIC tune configs. 
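
The new example_config/MIMIC/laat_tune.yml (and the updated MIMICIV ones) specify the
ReduceLROnPlateau schedule in the config (factor 0.9, patience 5, min_lr 0.0001),
matching the values hard-coded for LAAT in model.py earlier in this series, and sweep
the hyperparameters with the ['grid_search', [...]] entries used by the other tuning
configs, which correspond to Ray Tune grid-search entries. A hypothetical, stand-alone
equivalent of that search space in plain Ray Tune (not the project's actual config
loader):

    from ray import tune

    search_space = {
        "learning_rate": tune.grid_search([0.001, 0.0003]),
        "encoder_dropout": tune.grid_search([0, 0.2, 0.4]),
        "rnn_dim": tune.grid_search([512, 768, 1024]),
        "d_a": tune.grid_search([256, 384, 512]),
    }
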
--- example_config/MIMIC/laat_tune.yml | 81 +++++++++++++++++++ example_config/MIMICIV-ICD10-50/laat_tune.yml | 14 +++- example_config/MIMICIV-ICD9-50/laat_tune.yml | 14 +++- 3 files changed, 101 insertions(+), 8 deletions(-) create mode 100644 example_config/MIMIC/laat_tune.yml diff --git a/example_config/MIMIC/laat_tune.yml b/example_config/MIMIC/laat_tune.yml new file mode 100644 index 00000000..b0baea8d --- /dev/null +++ b/example_config/MIMIC/laat_tune.yml @@ -0,0 +1,81 @@ +# data +training_file: data/MIMIC/train.txt +val_file: data/MIMIC/valid.txt +test_file: data/MIMIC/test.txt +data_name: MIMIC +min_vocab_freq: 1 +max_seq_length: 4000 +include_test_labels: true +remove_no_label_data: true +add_special_tokens: false + +# train +seed: 0 +epochs: 50 +batch_size: 8 +optimizer: adamw +learning_rate: ['grid_search', [0.001, 0.0003]] +eps: 0.00000001 # 1e-08 (expand str for ray) +momentum: 0.9 # not used +weight_decay: 0 +patience: 6 +early_stopping_metric: Micro-F1 +shuffle: true +lr_scheduler: ReduceLROnPlateau +scheduler_config: + factor: 0.9 + patience: 5 + min_lr: 0.0001 + +# eval +eval_batch_size: 8 +monitor_metrics: ['Another-Macro-F1', 'Macro-F1', 'Micro-F1', 'P@5', 'P@8', 'P@15'] +val_metric: Micro-F1 + +# model +model_name: LAAT +loss_function: binary_cross_entropy_with_logits +init_weight: kaiming_uniform +network_config: + embed_dropout: ['grid_search', [0, 0.2, 0.4, 0.6, 0.8]] + encoder_dropout: ['grid_search', [0, 0.2, 0.4]] # LAAT: 0 + rnn_dim: ['grid_search', [512, 768, 1024]] # LAAT: 512 + num_layers: 1 + d_a: ['grid_search', [256, 384, 512]] # LAAT: 256 + freeze_embed: false + +# pretrained vocab / embeddings +# vocab_file: data/MIMIC/vocab.csv +embed_file: data/MIMIC-50/word2vec_sg0_100.embed +normalize_embed: false + +# hyperparamter search +search_alg: basic_variant +embed_cache_dir: .vector_cache +num_samples: 1 +scheduler: null +no_merge_train_val: true # do not retrain + +# Uncomment the following lines to enable the ASHAScheduler. +# See the documentation here: https://docs.ray.io/en/latest/tune/api_docs/schedulers.html#asha-tune-schedulers-ashascheduler +# scheduler: +# time_attr: training_iteration +# max_t: 50 # the maximum epochs to run for each config (parameter R in the ASHA paper) +# grace_period: 10 # the minimum epochs to run for each config (parameter r in the ASHA paper) +# reduction_factor: 3 # reduce the number of configuration to floor(1/reduction_factor) each round of successive halving (called rung in ASHA paper) +# brackets: 1 # number of brackets. 
A smaller bracket index (parameter s in the ASHA paper) means earlier stopping (i.e., less total resources used) + +# other parameters specified in main.py::get_args +checkpoint_path: null +cpu: false +data_workers: 8 +eval: false +label_file: null +limit_train_batches: 1.0 +limit_val_batches: 1.0 +limit_test_batches: 1.0 +metric_threshold: 0.5 +result_dir: runs +save_k_predictions: 0 +silent: true +val_size: 0.2 \ No newline at end of file diff --git a/example_config/MIMICIV-ICD10-50/laat_tune.yml b/example_config/MIMICIV-ICD10-50/laat_tune.yml index 55e1c48e..eb219478 100644 --- a/example_config/MIMICIV-ICD10-50/laat_tune.yml +++ b/example_config/MIMICIV-ICD10-50/laat_tune.yml @@ -15,28 +15,34 @@ epochs: 50 batch_size: 8 optimizer: adamw learning_rate: 0.001 +eps: 0.00000001 # 1e-08 (expand str for ray) momentum: 0.9 weight_decay: 0 -eps: 1e-06 patience: 6 early_stopping_metric: Micro-F1 shuffle: true +lr_scheduler: ReduceLROnPlateau +scheduler_config: + factor: 0.9 + patience: 5 + min_lr: 0.0001 # eval eval_batch_size: 8 -monitor_metrics: ['Loss', 'Another-Macro-F1', 'Macro-F1', 'Micro-F1', 'P@5', 'P@8', 'P@15'] +monitor_metrics: ['Another-Macro-F1', 'Macro-F1', 'Micro-F1', 'P@5', 'P@8', 'P@15'] val_metric: Micro-F1 # model model_name: LAAT loss_function: binary_cross_entropy_with_logits -init_weight: null +init_weight: kaiming_uniform # null network_config: embed_dropout: 0.3 encoder_dropout: 0 rnn_dim: ['grid_search', [512, 1024, 768, 256]] # 2u num_layers: 1 d_a: ['grid_search', [256, 512, 384, 128]] + freeze_embed: false # pretrained vocab / embeddings vocab_file: data/MIMICIV-ICD10-50/vocab.csv # 179,131, min_vocab_freq=1 @@ -45,7 +51,7 @@ normalize_embed: false # hyperparamter search search_alg: basic_variant -embed_cache_dir: null +embed_cache_dir: .vector_cache num_samples: 1 scheduler: null no_merge_train_val: true # do not retrain diff --git a/example_config/MIMICIV-ICD9-50/laat_tune.yml b/example_config/MIMICIV-ICD9-50/laat_tune.yml index 4c0dd79a..03ba5222 100644 --- a/example_config/MIMICIV-ICD9-50/laat_tune.yml +++ b/example_config/MIMICIV-ICD9-50/laat_tune.yml @@ -15,28 +15,34 @@ epochs: 50 batch_size: 8 optimizer: adamw learning_rate: ['grid_search', [0.001, 0.0003]] # LAAT: 0.001 +eps: 0.00000001 momentum: 0.9 weight_decay: 0 -eps: 1e-06 patience: 6 early_stopping_metric: Micro-F1 shuffle: true +lr_scheduler: ReduceLROnPlateau +scheduler_config: + factor: 0.9 + patience: 5 + min_lr: 0.0001 # eval eval_batch_size: 8 -monitor_metrics: ['Loss', 'Another-Macro-F1', 'Macro-F1', 'Micro-F1', 'P@5', 'P@8', 'P@15'] +monitor_metrics: ['Another-Macro-F1', 'Macro-F1', 'Micro-F1', 'P@5', 'P@8', 'P@15'] val_metric: Micro-F1 # model model_name: LAAT loss_function: binary_cross_entropy_with_logits -init_weight: null +init_weight: kaiming_uniform # null network_config: embed_dropout: 0.3 encoder_dropout: ['grid_search', [0, 0.2, 0.4]] # LAAT: 0 rnn_dim: ['grid_search', [512, 768, 1024]] # LAAT: 512 num_layers: 1 d_a: ['grid_search', [256, 384, 512]] # LAAT: 256 + freeze_embed: false # true # pretrained vocab / embeddings vocab_file: data/MIMICIV-ICD9-50/vocab.csv # 282,173, min_vocab_freq=1 @@ -45,7 +51,7 @@ normalize_embed: false # hyperparamter search search_alg: basic_variant -embed_cache_dir: null +embed_cache_dir: .vector_cache num_samples: 1 scheduler: null no_merge_train_val: true # do not retrain From c04fcaca29de85de9596ce76a65ca4d239c12c12 Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Fri, 7 Jul 2023 16:28:28 +0800 Subject: [PATCH 14/19] Update MIMIC*/laat_tune.yml --- 
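Note: with vocab_file set to null, the vocabulary is built from the training text at min_vocab_freq=1 instead of being read from a prebuilt vocab.csv. The effect is roughly a frequency-thresholded word list; the sketch below is illustrative only (the tab-separated id/labels/text layout, the special tokens, and the build_vocab name are assumptions, not the library's implementation):

from collections import Counter

def build_vocab(train_path, min_vocab_freq=1):
    """Count whitespace tokens in the text field and keep the frequent ones."""
    counter = Counter()
    with open(train_path, encoding="utf-8") as f:
        for line in f:
            text = line.rstrip("\n").split("\t")[-1]  # assumed format: id<TAB>labels<TAB>text
            counter.update(text.split())
    kept = [word for word, count in sorted(counter.items()) if count >= min_vocab_freq]
    return ["<pad>", "<unk>"] + kept  # special tokens assumed

# Example: vocab = build_vocab("data/MIMIC/train.txt", min_vocab_freq=1)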
example_config/MIMIC/laat_tune.yml | 4 ++-- example_config/MIMICIV-ICD9-50/laat_tune.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/example_config/MIMIC/laat_tune.yml b/example_config/MIMIC/laat_tune.yml index b0baea8d..e18aeb79 100644 --- a/example_config/MIMIC/laat_tune.yml +++ b/example_config/MIMIC/laat_tune.yml @@ -45,13 +45,13 @@ network_config: freeze_embed: false # pretrained vocab / embeddings -# vocab_file: data/MIMIC/vocab.csv +vocab_file: null # generate min_vocab_freq=1 by LibMultiLabel embed_file: data/MIMIC-50/word2vec_sg0_100.embed normalize_embed: false # hyperparamter search search_alg: basic_variant -embed_cache_dir: .vector_cache +embed_cache_dir: null num_samples: 1 scheduler: null no_merge_train_val: true # do not retrain diff --git a/example_config/MIMICIV-ICD9-50/laat_tune.yml b/example_config/MIMICIV-ICD9-50/laat_tune.yml index 03ba5222..80a04ba3 100644 --- a/example_config/MIMICIV-ICD9-50/laat_tune.yml +++ b/example_config/MIMICIV-ICD9-50/laat_tune.yml @@ -51,7 +51,7 @@ normalize_embed: false # hyperparamter search search_alg: basic_variant -embed_cache_dir: .vector_cache +embed_cache_dir: null num_samples: 1 scheduler: null no_merge_train_val: true # do not retrain From 15cedc3195da7c61d3b5bbdd4af7c60974bfb67b Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Wed, 12 Jul 2023 00:38:30 +0800 Subject: [PATCH 15/19] Add EUR-Lex laat_tune.yml. --- example_config/EUR-Lex/laat_tune.yml | 80 ++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 example_config/EUR-Lex/laat_tune.yml diff --git a/example_config/EUR-Lex/laat_tune.yml b/example_config/EUR-Lex/laat_tune.yml new file mode 100644 index 00000000..3bdb9f23 --- /dev/null +++ b/example_config/EUR-Lex/laat_tune.yml @@ -0,0 +1,80 @@ +# data +training_file: data/EUR-Lex/train.txt +test_file: data/EUR-Lex/test.txt +data_name: EUR-Lex +min_vocab_freq: 1 +max_seq_length: 500 +include_test_labels: true # false +remove_no_label_data: true # false +add_special_tokens: false + +# train +seed: 0 # 1337 +epochs: 50 +batch_size: 16 +optimizer: adamw +learning_rate: ['grid_search', [0.001, 0.0003]] +eps: 0.00000001 # 1e-08 (expand str for ray) +momentum: 0.9 # not used +weight_decay: 0 +patience: 10 +early_stopping_metric: RP@5 +shuffle: true +lr_scheduler: ReduceLROnPlateau +scheduler_config: + factor: 0.9 + patience: 9 + min_lr: 0.0001 + +# eval +eval_batch_size: 16 +monitor_metrics: ['Another-Macro-F1', 'Macro-F1', 'Micro-F1', 'P@1', 'P@5', 'RP@5', 'nDCG@5'] +val_metric: RP@5 + +# model +model_name: LAAT +loss_function: binary_cross_entropy_with_logits +init_weight: kaiming_uniform +network_config: + embed_dropout: ['grid_search', [0, 0.2, 0.4]] # 0.3 + encoder_dropout: ['grid_search', [0, 0.2, 0.4, 0.6, 0.8]] # ['grid_search', [0, 0.2, 0.4]] + rnn_dim: ['grid_search', [256, 512, 1024]] # ['grid_search', [256, 512, 768, 1024]] + rnn_layers: 1 + d_a: ['grid_search', [128, 256, 512]] + freeze_embed: false + +# pretrained vocab / embeddings +vocab_file: null +embed_file: glove.6B.200d +normalize_embed: false + +# hyperparamter search +search_alg: basic_variant +embed_cache_dir: .vector_cache +num_samples: 1 +scheduler: null +no_merge_train_val: true # do not retrain + +# Uncomment the following lines to enable the ASHAScheduler. 
+# See the documentation here: https://docs.ray.io/en/latest/tune/api_docs/schedulers.html#asha-tune-schedulers-ashascheduler +#scheduler: + #time_attr: training_iteration + #max_t: 50 # the maximum epochs to run for each config (parameter R in the ASHA paper) + #grace_period: 10 # the minimum epochs to run for each config (parameter r in the ASHA paper) + #reduction_factor: 3 # reduce the number of configuration to floor(1/reduction_factor) each round of successive halving (called rung in ASHA paper) + #brackets: 1 # number of brackets. A smaller bracket index (parameter s in the ASHA paper) means earlier stopping (i.e., less total resources used) + +# other parameters specified in main.py::get_args +checkpoint_path: null +cpu: false +data_workers: 4 +eval: false +label_file: null +limit_train_batches: 1.0 +limit_val_batches: 1.0 +limit_test_batches: 1.0 +metric_threshold: 0.5 +result_dir: runs +save_k_predictions: 0 +silent: true +val_size: 0.2 From ab61dadd75e751a751b7f6491dba9dc502132fb7 Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Wed, 12 Jul 2023 00:57:20 +0800 Subject: [PATCH 16/19] Update search space. --- example_config/EUR-Lex/laat_tune.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/example_config/EUR-Lex/laat_tune.yml b/example_config/EUR-Lex/laat_tune.yml index 3bdb9f23..dda705d4 100644 --- a/example_config/EUR-Lex/laat_tune.yml +++ b/example_config/EUR-Lex/laat_tune.yml @@ -38,8 +38,8 @@ init_weight: kaiming_uniform network_config: embed_dropout: ['grid_search', [0, 0.2, 0.4]] # 0.3 encoder_dropout: ['grid_search', [0, 0.2, 0.4, 0.6, 0.8]] # ['grid_search', [0, 0.2, 0.4]] - rnn_dim: ['grid_search', [256, 512, 1024]] # ['grid_search', [256, 512, 768, 1024]] - rnn_layers: 1 + rnn_dim: ['grid_search', [1024, 512, 256]] # ['grid_search', [256, 512, 768, 1024]] + num_layers: 1 d_a: ['grid_search', [128, 256, 512]] freeze_embed: false From 31dab37bfe49adac04c8e705de0afddead1ff90a Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Wed, 12 Jul 2023 00:59:24 +0800 Subject: [PATCH 17/19] Reduce search space. 
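Dropping encoder_dropout 0.6 and rnn_dim 256 shrinks the EUR-Lex grid from 270 to 144 trials, assuming basic_variant with num_samples: 1 simply crosses every grid_search list (learning_rate 2 values, embed_dropout 3, encoder_dropout 5 -> 4, rnn_dim 3 -> 2, d_a 3). A quick check of that count:

from math import prod

before = {"learning_rate": 2, "embed_dropout": 3, "encoder_dropout": 5, "rnn_dim": 3, "d_a": 3}
after = {"learning_rate": 2, "embed_dropout": 3, "encoder_dropout": 4, "rnn_dim": 2, "d_a": 3}

print(prod(before.values()))  # 270 configurations before this patch
print(prod(after.values()))   # 144 configurations after this patch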
---
 example_config/EUR-Lex/laat_tune.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/example_config/EUR-Lex/laat_tune.yml b/example_config/EUR-Lex/laat_tune.yml
index dda705d4..0a490093 100644
--- a/example_config/EUR-Lex/laat_tune.yml
+++ b/example_config/EUR-Lex/laat_tune.yml
@@ -37,8 +37,8 @@ loss_function: binary_cross_entropy_with_logits
 init_weight: kaiming_uniform
 network_config:
   embed_dropout: ['grid_search', [0, 0.2, 0.4]] # 0.3
-  encoder_dropout: ['grid_search', [0, 0.2, 0.4, 0.6, 0.8]] # ['grid_search', [0, 0.2, 0.4]]
-  rnn_dim: ['grid_search', [1024, 512, 256]] # ['grid_search', [256, 512, 768, 1024]]
+  encoder_dropout: ['grid_search', [0, 0.2, 0.4, 0.8]] # ['grid_search', [0, 0.2, 0.4]]
+  rnn_dim: ['grid_search', [1024, 512]] # ['grid_search', [256, 512, 768, 1024]]
   num_layers: 1
   d_a: ['grid_search', [128, 256, 512]]
   freeze_embed: false

From 21338a82f203f52a30e22e8dd74988fc47785a82 Mon Sep 17 00:00:00 2001
From: Eleven Liu
Date: Thu, 13 Jul 2023 11:28:12 +0800
Subject: [PATCH 18/19] Update EUR-Lex/laat_tune.yml

---
 example_config/EUR-Lex/laat_tune.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/example_config/EUR-Lex/laat_tune.yml b/example_config/EUR-Lex/laat_tune.yml
index 3bdb9f23..ca628a63 100644
--- a/example_config/EUR-Lex/laat_tune.yml
+++ b/example_config/EUR-Lex/laat_tune.yml
@@ -36,11 +36,11 @@ model_name: LAAT
 loss_function: binary_cross_entropy_with_logits
 init_weight: kaiming_uniform
 network_config:
-  embed_dropout: ['grid_search', [0, 0.2, 0.4]] # 0.3
-  encoder_dropout: ['grid_search', [0, 0.2, 0.4, 0.6, 0.8]] # ['grid_search', [0, 0.2, 0.4]]
-  rnn_dim: ['grid_search', [256, 512, 1024]] # ['grid_search', [256, 512, 768, 1024]]
+  embed_dropout: ['grid_search', [0, 0.2, 0.4, 0.6, 0.8]]
+  encoder_dropout: ['grid_search', [0, 0.2, 0.4]]
+  rnn_dim: ['grid_search', [512, 768, 1024]]
   rnn_layers: 1
-  d_a: ['grid_search', [128, 256, 512]]
+  d_a: ['grid_search', [128, 256, 384, 512]] # add d_a = 128 in EUR-Lex
   freeze_embed: false

From 823ba8f3694905ae42435af707557f75d24638ff Mon Sep 17 00:00:00 2001
From: Eleven Liu
Date: Fri, 14 Jul 2023 09:26:02 +0800
Subject: [PATCH 19/19] Rename rnn_layers to num_layers in EUR-Lex/laat_tune.yml

---
 example_config/EUR-Lex/laat_tune.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/example_config/EUR-Lex/laat_tune.yml b/example_config/EUR-Lex/laat_tune.yml
index 50936953..20d0d551 100644
--- a/example_config/EUR-Lex/laat_tune.yml
+++ b/example_config/EUR-Lex/laat_tune.yml
@@ -39,7 +39,7 @@ network_config:
   embed_dropout: ['grid_search', [0, 0.2, 0.4, 0.6, 0.8]]
   encoder_dropout: ['grid_search', [0, 0.2, 0.4]]
   rnn_dim: ['grid_search', [512, 768, 1024]] # 256, 512, 1024
-  rnn_layers: 1
+  num_layers: 1
   d_a: ['grid_search', [128, 256, 384, 512]] # add d_a = 128 in EUR-Lex
   freeze_embed: false
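The rename matters because the network only understands num_layers, so a leftover rnn_layers key would not be silently ignored. A minimal sketch of the failure mode, assuming network_config is unpacked into the network constructor as keyword arguments (build_laat below is a placeholder, not the real LibMultiLabel class):

def build_laat(embed_dropout=0.2, encoder_dropout=0.0, rnn_dim=512,
               num_layers=1, d_a=256, freeze_embed=False):
    """Stand-in constructor; only the keyword names matter for this example."""
    return {"rnn_dim": rnn_dim, "num_layers": num_layers, "d_a": d_a}

config = {"embed_dropout": 0.2, "encoder_dropout": 0.0, "rnn_dim": 512,
          "num_layers": 1, "d_a": 256, "freeze_embed": False}
print(build_laat(**config))  # works with the key name used after this patch

stale = dict(config)
stale["rnn_layers"] = stale.pop("num_layers")  # the spelling left over from PATCH 18
try:
    build_laat(**stale)
except TypeError as err:
    print(err)  # unexpected keyword argument 'rnn_layers'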