From cabec9e6cd0896ef487caf149bd2e743366c7353 Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Mon, 15 May 2023 14:56:49 +0800 Subject: [PATCH 01/19] Add MIMICIV configs. --- example_config/MIMICIV-ICD10-50/laat.yml | 39 +++++++++++ example_config/MIMICIV-ICD10-50/laat_tune.yml | 65 +++++++++++++++++++ example_config/MIMICIV-ICD9-50/laat.yml | 39 +++++++++++ example_config/MIMICIV-ICD9-50/laat_tune.yml | 65 +++++++++++++++++++ 4 files changed, 208 insertions(+) create mode 100644 example_config/MIMICIV-ICD10-50/laat.yml create mode 100644 example_config/MIMICIV-ICD10-50/laat_tune.yml create mode 100644 example_config/MIMICIV-ICD9-50/laat.yml create mode 100644 example_config/MIMICIV-ICD9-50/laat_tune.yml diff --git a/example_config/MIMICIV-ICD10-50/laat.yml b/example_config/MIMICIV-ICD10-50/laat.yml new file mode 100644 index 00000000..b9ee7d22 --- /dev/null +++ b/example_config/MIMICIV-ICD10-50/laat.yml @@ -0,0 +1,39 @@ +# data +training_file: data/MIMICIV-ICD10-50/train.txt +val_file: data/MIMICIV-ICD10-50/valid.txt +test_file: data/MIMICIV-ICD10-50/test.txt +data_name: MIMICIV-ICD10-50 +min_vocab_freq: 1 +max_seq_length: 4000 +include_test_labels: true +remove_no_label_data: true + +# train +seed: 0 +epochs: 50 +batch_size: 8 +optimizer: adamw +learning_rate: 0.001 +weight_decay: 0 +patience: 6 +shuffle: true + +# eval +eval_batch_size: 8 +monitor_metrics: ['Loss', 'Another-Macro-F1', 'Macro-F1', 'Micro-F1', 'P@5', 'P@8', 'P@15'] +val_metric: Micro-F1 + +# model +model_name: LAAT +init_weight: kaiming_uniform +network_config: + embed_dropout: 0.3 + encoder_dropout: 0 + rnn_dim: 512 # 512//2 = 256 + num_layers: 1 + d_a: 256 + +# pretrained vocab / embeddings +vocab_file: data/MIMICIV-ICD10-50/vocab.csv # 179,131, min_vocab_freq=1 +embed_file: data/MIMICIV-ICD10-50/processed_full.embed +normalize_embed: false diff --git a/example_config/MIMICIV-ICD10-50/laat_tune.yml b/example_config/MIMICIV-ICD10-50/laat_tune.yml new file mode 100644 index 00000000..6101a0c4 --- /dev/null +++ b/example_config/MIMICIV-ICD10-50/laat_tune.yml @@ -0,0 +1,65 @@ +# data +training_file: data/MIMICIV-ICD10-50/train.txt +val_file: data/MIMICIV-ICD10-50/valid.txt +test_file: data/MIMICIV-ICD10-50/test.txt +data_name: MIMICIV-ICD10-50 +min_vocab_freq: 1 +max_seq_length: 4000 +include_test_labels: true +remove_no_label_data: true +add_special_tokens: false + +# train +seed: 0 +epochs: 50 +batch_size: 8 +optimizer: adamw +learning_rate: 0.001 +momentum: 0.9 +weight_decay: 0 +patience: 6 +early_stopping_metric: Micro-F1 +shuffle: true + +# eval +eval_batch_size: 8 +monitor_metrics: ['Loss', 'Another-Macro-F1', 'Macro-F1', 'Micro-F1', 'P@5', 'P@8', 'P@15'] +val_metric: Micro-F1 + +# model +model_name: LAAT +loss_function: binary_cross_entropy_with_logits +init_weight: kaiming_uniform +network_config: + embed_dropout: 0.3 + encoder_dropout: 0 + rnn_dim: ['grid_search', [512, 1024, 256]] + num_layers: 1 + d_a: ['grid_search', [256, 512, 128]] + +# pretrained vocab / embeddings +vocab_file: data/MIMICIV-ICD10-50/vocab.csv # 179,131, min_vocab_freq=1 +embed_file: data/MIMICIV-ICD10-50/processed_full.embed +normalize_embed: false + +# hyperparamter search +search_alg: basic_variant +embed_cache_dir: null +num_samples: 1 +scheduler: null +no_merge_train_val: true # do not retrain + +# other parameters specified in main.py::get_args +checkpoint_path: null +cpu: false +data_workers: 8 +eval: false +label_file: null +limit_train_batches: 1.0 +limit_val_batches: 1.0 +limit_test_batches: 1.0 +metric_threshold: 0.5 +result_dir: 
runs +save_k_predictions: 0 +silent: true +val_size: 0.2 diff --git a/example_config/MIMICIV-ICD9-50/laat.yml b/example_config/MIMICIV-ICD9-50/laat.yml new file mode 100644 index 00000000..0fd281b3 --- /dev/null +++ b/example_config/MIMICIV-ICD9-50/laat.yml @@ -0,0 +1,39 @@ +# data +training_file: data/MIMICIV-ICD9-50/train.txt +val_file: data/MIMICIV-ICD9-50/valid.txt +test_file: data/MIMICIV-ICD9-50/test.txt +data_name: MIMICIV-ICD9-50 +min_vocab_freq: 1 +max_seq_length: 4000 +include_test_labels: true +remove_no_label_data: true + +# train +seed: 0 # 1337 +epochs: 50 +batch_size: 8 +optimizer: adamw +learning_rate: 0.001 +weight_decay: 0 +patience: 6 +shuffle: true + +# eval +eval_batch_size: 8 +monitor_metrics: ['Loss', 'Another-Macro-F1', 'Macro-F1', 'Micro-F1', 'P@5', 'P@8', 'P@15'] +val_metric: Micro-F1 + +# model +model_name: LAAT +init_weight: kaiming_uniform +network_config: + embed_dropout: 0.3 + encoder_dropout: 0 + rnn_dim: 512 # 512//2 = 256 + num_layers: 1 + d_a: 256 + +# pretrained vocab / embeddings +vocab_file: data/MIMICIV-ICD9-50/vocab.csv # 282,173, min_vocab_freq=1 +embed_file: data/MIMICIV-ICD9-50/processed_full.embed +normalize_embed: false diff --git a/example_config/MIMICIV-ICD9-50/laat_tune.yml b/example_config/MIMICIV-ICD9-50/laat_tune.yml new file mode 100644 index 00000000..42c4c3c6 --- /dev/null +++ b/example_config/MIMICIV-ICD9-50/laat_tune.yml @@ -0,0 +1,65 @@ +# data +training_file: data/MIMICIV-ICD9-50/train.txt +val_file: data/MIMICIV-ICD9-50/valid.txt +test_file: data/MIMICIV-ICD9-50/test.txt +data_name: MIMICIV-ICD9-50 +min_vocab_freq: 1 +max_seq_length: 4000 +include_test_labels: true +remove_no_label_data: true +add_special_tokens: false + +# train +seed: 0 +epochs: 50 +batch_size: 8 +optimizer: adamw +learning_rate: 0.001 +momentum: 0.9 +weight_decay: 0 +patience: 6 +early_stopping_metric: Micro-F1 +shuffle: true + +# eval +eval_batch_size: 8 +monitor_metrics: ['Loss', 'Another-Macro-F1', 'Macro-F1', 'Micro-F1', 'P@5', 'P@8', 'P@15'] +val_metric: Micro-F1 + +# model +model_name: LAAT +loss_function: binary_cross_entropy_with_logits +init_weight: kaiming_uniform +network_config: + embed_dropout: 0.3 + encoder_dropout: 0 + rnn_dim: ['grid_search', [512, 1024, 256]] + num_layers: 1 + d_a: ['grid_search', [256, 512, 128]] + +# pretrained vocab / embeddings +vocab_file: data/MIMICIV-ICD9-50/vocab.csv # 282,173, min_vocab_freq=1 +embed_file: data/MIMICIV-ICD9-50/processed_full.embed +normalize_embed: false + +# hyperparamter search +search_alg: basic_variant +embed_cache_dir: null +num_samples: 1 +scheduler: null +no_merge_train_val: true # do not retrain + +# other parameters specified in main.py::get_args +checkpoint_path: null +cpu: false +data_workers: 8 +eval: false +label_file: null +limit_train_batches: 1.0 +limit_val_batches: 1.0 +limit_test_batches: 1.0 +metric_threshold: 0.5 +result_dir: runs +save_k_predictions: 0 +silent: true +val_size: 0.2 From 3771f4459457e98ecd8ff4518c0741dc2a6e8995 Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Mon, 15 May 2023 15:25:45 +0800 Subject: [PATCH 02/19] Add LAAT reproduce code. 
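
The new libmultilabel/nn/networks/laat.py implements the label-wise attention of
Vu et al. (LAAT): a bidirectional LSTM encodes the document into hidden states H,
a first linear layer computes Z = tanh(WH), a second linear layer U gives one
attention distribution per label, A = softmax(UZ), and each label receives its own
attention-weighted document vector E = AH. A minimal sketch of that attention step,
with purely illustrative sizes (rnn_dim=512, d_a=256, 50 labels) rather than values
taken from any particular config:

    import torch
    import torch.nn as nn

    rnn_dim, d_a, num_classes = 512, 256, 50             # illustrative sizes only
    W = nn.Linear(rnn_dim, d_a, bias=False)               # first linear layer (W)
    U = nn.Linear(d_a, num_classes, bias=False)           # label context vectors (U)

    H = torch.randn(8, 4000, rnn_dim)                     # BiLSTM output: (batch, length, rnn_dim)
    Z = torch.tanh(W(H))                                  # (batch, length, d_a)
    A = torch.softmax(U.weight @ Z.transpose(1, 2), dim=2)  # (batch, num_classes, length)
    E = A @ H                                             # (batch, num_classes, rnn_dim)
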
--- example_config/MIMICIV-ICD10-50/laat_tune.yml | 4 +- example_config/MIMICIV-ICD9-50/laat_tune.yml | 4 +- libmultilabel/nn/data_utils.py | 11 ++- libmultilabel/nn/model.py | 18 ++++- libmultilabel/nn/networks/__init__.py | 1 + libmultilabel/nn/networks/laat.py | 75 +++++++++++++++++++ libmultilabel/nn/networks/modules.py | 2 +- 7 files changed, 106 insertions(+), 9 deletions(-) create mode 100644 libmultilabel/nn/networks/laat.py diff --git a/example_config/MIMICIV-ICD10-50/laat_tune.yml b/example_config/MIMICIV-ICD10-50/laat_tune.yml index 6101a0c4..558ac3be 100644 --- a/example_config/MIMICIV-ICD10-50/laat_tune.yml +++ b/example_config/MIMICIV-ICD10-50/laat_tune.yml @@ -33,9 +33,9 @@ init_weight: kaiming_uniform network_config: embed_dropout: 0.3 encoder_dropout: 0 - rnn_dim: ['grid_search', [512, 1024, 256]] + rnn_dim: ['grid_search', [512, 1024, 768, 256]] # 2u num_layers: 1 - d_a: ['grid_search', [256, 512, 128]] + d_a: ['grid_search', [256, 512, 384, 128]] # pretrained vocab / embeddings vocab_file: data/MIMICIV-ICD10-50/vocab.csv # 179,131, min_vocab_freq=1 diff --git a/example_config/MIMICIV-ICD9-50/laat_tune.yml b/example_config/MIMICIV-ICD9-50/laat_tune.yml index 42c4c3c6..cc03991c 100644 --- a/example_config/MIMICIV-ICD9-50/laat_tune.yml +++ b/example_config/MIMICIV-ICD9-50/laat_tune.yml @@ -33,9 +33,9 @@ init_weight: kaiming_uniform network_config: embed_dropout: 0.3 encoder_dropout: 0 - rnn_dim: ['grid_search', [512, 1024, 256]] + rnn_dim: ['grid_search', [512, 1024, 768, 256]] # 2u num_layers: 1 - d_a: ['grid_search', [256, 512, 128]] + d_a: ['grid_search', [256, 512, 384, 128]] # pretrained vocab / embeddings vocab_file: data/MIMICIV-ICD9-50/vocab.csv # 282,173, min_vocab_freq=1 diff --git a/libmultilabel/nn/data_utils.py b/libmultilabel/nn/data_utils.py index 73a7f9d6..7c25c819 100644 --- a/libmultilabel/nn/data_utils.py +++ b/libmultilabel/nn/data_utils.py @@ -45,12 +45,13 @@ def __init__( tokenizer=None, word_dict=None, ): - self.data = data + # self.data = data self.classes = classes self.max_seq_length = max_seq_length self.word_dict = word_dict self.tokenizer = tokenizer self.add_special_tokens = add_special_tokens + self.data = self.sort_by_length(data) # LAAT self.num_classes = len(self.classes) self.label_binarizer = MultiLabelBinarizer().fit([classes]) @@ -77,6 +78,10 @@ def __getitem__(self, index): "label": torch.IntTensor(self.label_binarizer.transform([data["label"]])[0]), } + def sort_by_length(self, data): + # reverse for LAAT + return sorted(data, key=lambda x: -len(x['text'][:self.max_seq_length])) + def tokenize(text): """Tokenize text. 
@@ -387,8 +392,10 @@ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache=N if load_embedding_from_file: # Add UNK embedding # AttentionXML: np.random.uniform(-1.0, 1.0, embed_size) + # LAAT: np.random.uniform(-0.25, 0.25, embedding_size) # CAML: np.random.randn(embed_size) - unk_vector = torch.randn(embed_size) + import numpy as np + unk_vector = torch.tensor(np.random.uniform(-0.25, 0.25, embed_size)) embedding_weights[word_dict[UNK]] = unk_vector # Store pretrained word embedding diff --git a/libmultilabel/nn/model.py b/libmultilabel/nn/model.py index bc6cf134..e0a8399d 100644 --- a/libmultilabel/nn/model.py +++ b/libmultilabel/nn/model.py @@ -40,6 +40,7 @@ def __init__( multiclass=False, silent=False, save_k_predictions=0, + val_metric='Micro-F1', # LAAT **kwargs ): super().__init__() @@ -59,6 +60,7 @@ def __init__( self.multiclass = multiclass top_k = 1 if self.multiclass else None self.eval_metric = get_metrics(metric_threshold, monitor_metrics, num_classes, top_k=top_k) + self.val_metric = val_metric # LAAT @abstractmethod def shared_step(self, batch): @@ -82,8 +84,20 @@ def configure_optimizers(self): else: raise RuntimeError("Unsupported optimizer: {self.optimizer}") - torch.nn.utils.clip_grad_value_(parameters, 0.5) - + # torch.nn.utils.clip_grad_value_(parameters, 0.5) + # LAAT hard code + if self.val_metric is not None: + return { + "optimizer": optimizer, + "lr_scheduler": { + "scheduler": optim.lr_scheduler.ReduceLROnPlateau( + optimizer, mode="max", + factor=0.9, + patience=5, + min_lr=0.0001), + "monitor": self.val_metric, + }, + } return optimizer def training_step(self, batch, batch_idx): diff --git a/libmultilabel/nn/networks/__init__.py b/libmultilabel/nn/networks/__init__.py index a5667856..4bc59d24 100644 --- a/libmultilabel/nn/networks/__init__.py +++ b/libmultilabel/nn/networks/__init__.py @@ -3,6 +3,7 @@ from .bert import BERT from .bert_attention import BERTAttention from .caml import CAML +from .laat import LAAT from .kim_cnn import KimCNN from .xml_cnn import XMLCNN from .labelwise_attention_networks import BiGRULWAN diff --git a/libmultilabel/nn/networks/laat.py b/libmultilabel/nn/networks/laat.py new file mode 100644 index 00000000..171e1f37 --- /dev/null +++ b/libmultilabel/nn/networks/laat.py @@ -0,0 +1,75 @@ +import torch +import torch.nn as nn + +from .modules import Embedding, LSTMEncoder + + +class LAAT(nn.Module): + """LAAT Vu. + + Args: + embed_vecs (torch.Tensor): The pre-trained word vectors of shape (vocab_size, embed_dim). + num_classes (int): Total number of classes. + filter_sizes (list): Size of convolutional filters. + num_filter_per_size (int): The number of filters in convolutional layers in each size. Defaults to 50. + dropout (float): The dropout rate of the word embedding. Defaults to 0.2. 
+ """ + + def __init__( + self, + embed_vecs, + num_classes, + rnn_dim=1024, + num_layers=1, + d_a=512, + embed_dropout=0.3, + encoder_dropout=0, + ): + super(LAAT, self).__init__() + + self.embedding = Embedding(embed_vecs, embed_dropout) + self.num_layers = num_layers + self.rnn_dim = rnn_dim + + # Initialize rnn layer (H: 2u * N) + self.encoder = LSTMEncoder( + input_size=embed_vecs.shape[1], + hidden_size=rnn_dim//2, num_layers=num_layers, dropout=encoder_dropout) + + mean = 0.0 + std = 0.3 + # first linear + # Z = tanh(WH), W: (d_a * 2u), H: (2u * N), Z: (d_a * N) + self.W = nn.Linear(rnn_dim, d_a, bias=False) + torch.nn.init.normal(self.W.weight, mean, std) + + """Context vectors for computing attention with + (in_features, out_features) = (d_a, num_classes) + """ + # second linear + # A = softmax(UZ), U: (|L| * d_a), Z: (d_a * N), A: |L| * N + self.Q = nn.Linear(d_a, num_classes, bias=False) + torch.nn.init.normal(self.Q.weight, mean, std) + + # Final layer: create a matrix to use for the #labels binary classifiers + self.output = nn.Linear(rnn_dim, num_classes, bias=True) + torch.nn.init.normal(self.output.weight, mean, std) + + def forward(self, input): + # Get embeddings and apply dropout + x = self.embedding(input['text']) # (batch_size, length, embed_dim) + + x = self.encoder(x, input['length']) # (batch_size, length, rnn_dim) + Z = torch.tanh(self.W(x)) # (batch_size, length, d_a) + + # (batch_size, class_num, length) + alpha = torch.softmax(self.Q.weight.matmul(Z.transpose(1, 2)), dim=2) + + # Document representations are weighted sums using the attention + E = alpha.matmul(x) + + # Compute a probability for each label + logits = self.output.weight.mul(E).sum(dim=2).add( + self.output.bias) # (batch_size, num_classes) + + return {'logits': logits, 'attention': alpha} diff --git a/libmultilabel/nn/networks/modules.py b/libmultilabel/nn/networks/modules.py index d41918ea..416f4c81 100644 --- a/libmultilabel/nn/networks/modules.py +++ b/libmultilabel/nn/networks/modules.py @@ -16,7 +16,7 @@ class Embedding(nn.Module): def __init__(self, embed_vecs, dropout=0.2): super(Embedding, self).__init__() - self.embedding = nn.Embedding.from_pretrained(embed_vecs, freeze=False, padding_idx=0) + self.embedding = nn.Embedding.from_pretrained(embed_vecs, freeze=True, padding_idx=0) # LAAT self.dropout = nn.Dropout(dropout) def forward(self, input): From 61e3d30a2bd9401ab3e6dd6c337a0bc8f18782b0 Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Sun, 21 May 2023 07:05:20 +0800 Subject: [PATCH 03/19] Backup print learning rate --- libmultilabel/nn/model.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libmultilabel/nn/model.py b/libmultilabel/nn/model.py index e0a8399d..c093db46 100644 --- a/libmultilabel/nn/model.py +++ b/libmultilabel/nn/model.py @@ -111,6 +111,10 @@ def validation_step_end(self, batch_parts): return self._shared_eval_step_end(batch_parts) def validation_epoch_end(self, step_outputs): + # print learning rate (LAAT) + lightning_optimizer = self.optimizers() + for param_group in lightning_optimizer.optimizer.param_groups: + print(f"\nLearning Rate: {param_group['lr']}\n") return self._shared_eval_epoch_end(step_outputs, "val") def test_step(self, batch, batch_idx): From 4d777ca5c7741aab9d14fefdae2303a5c6bf29a3 Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Thu, 25 May 2023 14:16:53 +0800 Subject: [PATCH 04/19] Update laat_tune config. 
(MIMICIV-ICD9-50) --- example_config/MIMICIV-ICD9-50/laat_tune.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/example_config/MIMICIV-ICD9-50/laat_tune.yml b/example_config/MIMICIV-ICD9-50/laat_tune.yml index cc03991c..95ded40f 100644 --- a/example_config/MIMICIV-ICD9-50/laat_tune.yml +++ b/example_config/MIMICIV-ICD9-50/laat_tune.yml @@ -14,7 +14,7 @@ seed: 0 epochs: 50 batch_size: 8 optimizer: adamw -learning_rate: 0.001 +learning_rate: ['grid_search', [0.001, 0.0003]] # LAAT: 0.001 momentum: 0.9 weight_decay: 0 patience: 6 @@ -32,10 +32,10 @@ loss_function: binary_cross_entropy_with_logits init_weight: kaiming_uniform network_config: embed_dropout: 0.3 - encoder_dropout: 0 - rnn_dim: ['grid_search', [512, 1024, 768, 256]] # 2u + encoder_dropout: ['grid_search', [0, 0.2, 0.4]] # LAAT: 0 + rnn_dim: ['grid_search', [512, 768, 1024]] # LAAT: 512 num_layers: 1 - d_a: ['grid_search', [256, 512, 384, 128]] + d_a: ['grid_search', [256, 384, 512]] # LAAT: 256 # pretrained vocab / embeddings vocab_file: data/MIMICIV-ICD9-50/vocab.csv # 282,173, min_vocab_freq=1 From d463d8af96e0db136d123d10e4ba01d003f0964d Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Thu, 25 May 2023 16:15:37 +0800 Subject: [PATCH 05/19] 0.3 to 0.03 ... bug!! --- libmultilabel/nn/networks/laat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libmultilabel/nn/networks/laat.py b/libmultilabel/nn/networks/laat.py index 171e1f37..864616de 100644 --- a/libmultilabel/nn/networks/laat.py +++ b/libmultilabel/nn/networks/laat.py @@ -37,7 +37,7 @@ def __init__( hidden_size=rnn_dim//2, num_layers=num_layers, dropout=encoder_dropout) mean = 0.0 - std = 0.3 + std = 0.03 # first linear # Z = tanh(WH), W: (d_a * 2u), H: (2u * N), Z: (d_a * N) self.W = nn.Linear(rnn_dim, d_a, bias=False) From f3502948f4189b184b097f7b7421a8e3ce26c7e3 Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Mon, 5 Jun 2023 04:02:43 +0800 Subject: [PATCH 06/19] Update LAAT. --- libmultilabel/nn/data_utils.py | 43 +++++++++++++++++++--------- libmultilabel/nn/model.py | 23 +++++++++++++-- libmultilabel/nn/networks/laat.py | 28 ++++++++++-------- libmultilabel/nn/networks/modules.py | 30 ++++++++++++------- libmultilabel/nn/nn_utils.py | 2 ++ torch_trainer.py | 8 +++++- 6 files changed, 95 insertions(+), 39 deletions(-) diff --git a/libmultilabel/nn/data_utils.py b/libmultilabel/nn/data_utils.py index 7c25c819..4bc4aa68 100644 --- a/libmultilabel/nn/data_utils.py +++ b/libmultilabel/nn/data_utils.py @@ -1,6 +1,7 @@ import csv import gc import logging +import random import warnings import pandas as pd @@ -82,6 +83,10 @@ def sort_by_length(self, data): # reverse for LAAT return sorted(data, key=lambda x: -len(x['text'][:self.max_seq_length])) + def shuffle_data(self): + # LAAT + random.shuffle(self.data) + def tokenize(text): """Tokenize text. 
@@ -145,7 +150,8 @@ def get_dataset_loader( dataset_loader = torch.utils.data.DataLoader( dataset, batch_size=batch_size, - shuffle=shuffle, + # shuffle=shuffle, + shuffle=False, # use TextDataset.shuffle_data() num_workers=data_workers, collate_fn=generate_batch, pin_memory="cuda" in device.type, @@ -374,7 +380,8 @@ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache=N vector_dict = {} for word_vector in tqdm(word_vectors, disable=silent): word, vector = word_vector.rstrip().split(" ", 1) - vector = torch.Tensor(list(map(float, vector.split()))) + # vector = torch.Tensor(list(map(float, vector.split()))) + vector = list(map(float, vector.split())) # LAAT vector_dict[word] = vector else: logging.info(f"Load pretrained embedding from torchtext.") @@ -387,16 +394,21 @@ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache=N vector_dict = pretrained_aliases[embed_file](cache=cache) embed_size = vector_dict.dim - embedding_weights = torch.zeros(len(word_dict), embed_size) - - if load_embedding_from_file: - # Add UNK embedding - # AttentionXML: np.random.uniform(-1.0, 1.0, embed_size) - # LAAT: np.random.uniform(-0.25, 0.25, embedding_size) - # CAML: np.random.randn(embed_size) - import numpy as np - unk_vector = torch.tensor(np.random.uniform(-0.25, 0.25, embed_size)) - embedding_weights[word_dict[UNK]] = unk_vector + # LAAT init: set default to UNK vector + import numpy as np + unk_vector = np.random.uniform(-0.25, 0.25, embed_size) + embedding_weights = [unk_vector] * (len(word_dict)) + embedding_weights[0] = np.zeros(embed_size) + + # embedding_weights = torch.zeros(len(word_dict), embed_size) + # if load_embedding_from_file: + # # Add UNK embedding + # # AttentionXML: np.random.uniform(-1.0, 1.0, embed_size) + # # LAAT: np.random.uniform(-0.25, 0.25, embedding_size) + # # CAML: np.random.randn(embed_size) + # import numpy as np + # unk_vector = torch.tensor(np.random.uniform(-0.25, 0.25, embed_size)) + # embedding_weights[word_dict[UNK]] = unk_vector # Store pretrained word embedding vec_counts = 0 @@ -410,4 +422,9 @@ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache=N logging.info(f"loaded {vec_counts}/{len(word_dict)} word embeddings") - return embedding_weights + # For resolving `UserWarning: Creating a tensor from a list of numpy.ndarrays is + # extremely slow. 
Please consider converting the list to a single numpy.ndarray + # with numpy.array() before converting to a tensor.` + embedding_weights = np.array(embedding_weights, dtype=np.float32) + return torch.FloatTensor(embedding_weights) # LAAT + # return embedding_weights diff --git a/libmultilabel/nn/model.py b/libmultilabel/nn/model.py index c093db46..50eb2e10 100644 --- a/libmultilabel/nn/model.py +++ b/libmultilabel/nn/model.py @@ -2,6 +2,7 @@ import numpy as np import pytorch_lightning as pl +from pytorch_lightning.utilities.types import EPOCH_OUTPUT import torch import torch.nn.functional as F import torch.optim as optim @@ -41,6 +42,7 @@ def __init__( silent=False, save_k_predictions=0, val_metric='Micro-F1', # LAAT + shuffle=True, **kwargs ): super().__init__() @@ -61,6 +63,8 @@ def __init__( top_k = 1 if self.multiclass else None self.eval_metric = get_metrics(metric_threshold, monitor_metrics, num_classes, top_k=top_k) self.val_metric = val_metric # LAAT + self.shuffle = shuffle # LAAT + self.num_classes = num_classes @abstractmethod def shared_step(self, batch): @@ -78,7 +82,11 @@ def configure_optimizers(self): elif optimizer_name == "adam": optimizer = optim.Adam(parameters, weight_decay=self.weight_decay, lr=self.learning_rate) elif optimizer_name == "adamw": - optimizer = optim.AdamW(parameters, weight_decay=self.weight_decay, lr=self.learning_rate) + # optimizer = optim.AdamW( + # parameters, weight_decay=self.weight_decay, lr=self.learning_rate) + from transformers import AdamW + optimizer = AdamW(filter(lambda p: p.requires_grad, self.parameters()), + lr=self.learning_rate, weight_decay=self.weight_decay) elif optimizer_name == "adamax": optimizer = optim.Adamax(parameters, weight_decay=self.weight_decay, lr=self.learning_rate) else: @@ -101,9 +109,20 @@ def configure_optimizers(self): return optimizer def training_step(self, batch, batch_idx): - loss, _ = self.shared_step(batch) + # LAAT + idx = torch.argsort(batch['length'], descending=True, stable=True) + sorted_batch = {k: v[idx] for k, v in batch.items()} + loss, _ = self.shared_step(sorted_batch) + loss = self.num_classes * loss # LAAT + loss = loss / self.num_classes return loss + def training_epoch_end(self, outputs: EPOCH_OUTPUT) -> None: + print(f'Reshuffling the data') + if self.shuffle: + self.trainer.train_dataloader.dataset.datasets.shuffle_data() + return super().training_epoch_end(outputs) + def validation_step(self, batch, batch_idx): return self._shared_eval_step(batch, batch_idx) diff --git a/libmultilabel/nn/networks/laat.py b/libmultilabel/nn/networks/laat.py index 864616de..79db83e4 100644 --- a/libmultilabel/nn/networks/laat.py +++ b/libmultilabel/nn/networks/laat.py @@ -24,10 +24,11 @@ def __init__( d_a=512, embed_dropout=0.3, encoder_dropout=0, + freeze_embed=True, ): super(LAAT, self).__init__() - self.embedding = Embedding(embed_vecs, embed_dropout) + self.embedding = Embedding(embed_vecs, embed_dropout, freeze_embed) self.num_layers = num_layers self.rnn_dim = rnn_dim @@ -39,34 +40,37 @@ def __init__( mean = 0.0 std = 0.03 # first linear - # Z = tanh(WH), W: (d_a * 2u), H: (2u * N), Z: (d_a * N) self.W = nn.Linear(rnn_dim, d_a, bias=False) - torch.nn.init.normal(self.W.weight, mean, std) """Context vectors for computing attention with (in_features, out_features) = (d_a, num_classes) """ - # second linear - # A = softmax(UZ), U: (|L| * d_a), Z: (d_a * N), A: |L| * N + # second linear (U in the paper) self.Q = nn.Linear(d_a, num_classes, bias=False) - torch.nn.init.normal(self.Q.weight, mean, std) # 
Final layer: create a matrix to use for the #labels binary classifiers self.output = nn.Linear(rnn_dim, num_classes, bias=True) - torch.nn.init.normal(self.output.weight, mean, std) + + torch.nn.init.normal_(self.W.weight, mean, std) + torch.nn.init.normal_(self.Q.weight, mean, std) + torch.nn.init.normal_(self.output.weight, mean, std) def forward(self, input): # Get embeddings and apply dropout x = self.embedding(input['text']) # (batch_size, length, embed_dim) + H = self.encoder(x, input['length']) # (batch_size, length, rnn_dim) - x = self.encoder(x, input['length']) # (batch_size, length, rnn_dim) - Z = torch.tanh(self.W(x)) # (batch_size, length, d_a) + # (4) Z = tanh(WH), W: (d_a * 2u), H: (2u * N), Z: (d_a * N) + Z = torch.tanh(self.W(H)) # (batch_size, length, d_a) - # (batch_size, class_num, length) - alpha = torch.softmax(self.Q.weight.matmul(Z.transpose(1, 2)), dim=2) + # (5) A = softmax(UZ), A: (batch_size, class_num, length) + # Q: (|L| * d_a), Z: (d_a * N), A: |L| * N + alpha = self.Q(Z) + alpha = torch.softmax(alpha, 1).transpose(1, 2) + # alpha = torch.softmax(self.Q.weight.matmul(Z.transpose(1, 2)), dim=2) # Document representations are weighted sums using the attention - E = alpha.matmul(x) + E = alpha.matmul(H) # Compute a probability for each label logits = self.output.weight.mul(E).sum(dim=2).add( diff --git a/libmultilabel/nn/networks/modules.py b/libmultilabel/nn/networks/modules.py index 416f4c81..22ec606e 100644 --- a/libmultilabel/nn/networks/modules.py +++ b/libmultilabel/nn/networks/modules.py @@ -4,6 +4,7 @@ import torch.nn as nn import torch.nn.functional as F from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence +import copy class Embedding(nn.Module): @@ -14,9 +15,14 @@ class Embedding(nn.Module): dropout (float): The dropout rate of the word embedding. Defaults to 0.2. 
""" - def __init__(self, embed_vecs, dropout=0.2): + def __init__(self, embed_vecs, dropout=0.2, freeze_embed=False): super(Embedding, self).__init__() - self.embedding = nn.Embedding.from_pretrained(embed_vecs, freeze=True, padding_idx=0) # LAAT + # self.embedding = nn.Embedding.from_pretrained( + # embed_vecs, freeze=freeze_embed, padding_idx=0) # LAAT + + self.embedding = nn.Embedding(embed_vecs.shape[0], embed_vecs.shape[1]) + self.embedding.weight = nn.Parameter(copy.deepcopy( + embed_vecs), requires_grad=False) self.dropout = nn.Dropout(dropout) def forward(self, input): @@ -35,19 +41,20 @@ class RNNEncoder(ABC, nn.Module): def __init__(self, input_size, hidden_size, num_layers, dropout=0): super(RNNEncoder, self).__init__() - self.rnn = self._get_rnn(input_size, hidden_size, num_layers) - self.dropout = nn.Dropout(dropout) + # self.rnn = self._get_rnn(input_size, hidden_size, num_layers) + # self.dropout = nn.Dropout(dropout) + self.rnn = self._get_rnn(input_size, hidden_size, num_layers, dropout) def forward(self, input, length, **kwargs): self.rnn.flatten_parameters() - idx = torch.argsort(length, descending=True) + idx = torch.argsort(length, descending=True, stable=True) length_clamped = length[idx].cpu().clamp(min=1) # avoid the empty text with length 0 packed_input = pack_padded_sequence(input[idx], length_clamped, batch_first=True) outputs, _ = pad_packed_sequence(self.rnn(packed_input)[0], batch_first=True) - return self.dropout(outputs[torch.argsort(idx)]) + return outputs[torch.argsort(idx)] @abstractmethod - def _get_rnn(self, input_size, hidden_size, num_layers): + def _get_rnn(self, input_size, hidden_size, num_layers, dropout): raise NotImplementedError @@ -64,8 +71,8 @@ class GRUEncoder(RNNEncoder): def __init__(self, input_size, hidden_size, num_layers, dropout=0): super(GRUEncoder, self).__init__(input_size, hidden_size, num_layers, dropout) - def _get_rnn(self, input_size, hidden_size, num_layers): - return nn.GRU(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True) + def _get_rnn(self, input_size, hidden_size, num_layers, dropout): + return nn.GRU(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True, dropout=dropout) class LSTMEncoder(RNNEncoder): @@ -81,8 +88,9 @@ class LSTMEncoder(RNNEncoder): def __init__(self, input_size, hidden_size, num_layers, dropout=0): super(LSTMEncoder, self).__init__(input_size, hidden_size, num_layers, dropout) - def _get_rnn(self, input_size, hidden_size, num_layers): - return nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True) + def _get_rnn(self, input_size, hidden_size, num_layers, dropout): + return nn.LSTM(input_size, hidden_size, num_layers, bidirectional=True, dropout=dropout) + # return nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True) class CNNEncoder(nn.Module): diff --git a/libmultilabel/nn/nn_utils.py b/libmultilabel/nn/nn_utils.py index 8eabf02c..924731be 100644 --- a/libmultilabel/nn/nn_utils.py +++ b/libmultilabel/nn/nn_utils.py @@ -52,6 +52,7 @@ def init_model( loss_function="binary_cross_entropy_with_logits", silent=False, save_k_predictions=0, + shuffle=False, # LAAT ): """Initialize a `Model` class for initializing and training a neural network. 
@@ -108,6 +109,7 @@ def init_model( loss_function=loss_function, silent=silent, save_k_predictions=save_k_predictions, + shuffle=shuffle, # LAAT ) return model diff --git a/torch_trainer.py b/torch_trainer.py index f37706a9..e8dbffbf 100644 --- a/torch_trainer.py +++ b/torch_trainer.py @@ -124,7 +124,10 @@ def _setup_model( if self.config.embed_file is not None: logging.info("Load word dictionary ") word_dict, embed_vecs = data_utils.load_or_build_text_dict( - dataset=self.datasets["train"], + # dataset=self.datasets["train"], + # LAAT + dataset=self.datasets["train"] + \ + self.datasets["val"] + self.datasets["test"], vocab_file=self.config.vocab_file, min_vocab_freq=self.config.min_vocab_freq, embed_file=self.config.embed_file, @@ -169,6 +172,7 @@ def _setup_model( loss_function=self.config.loss_function, silent=self.config.silent, save_k_predictions=self.config.save_k_predictions, + shuffle=self.config.shuffle, # LAAT ) def _get_dataset_loader(self, split, shuffle=False): @@ -208,6 +212,8 @@ def train(self): self.trainer.fit(self.model, train_loader) else: val_loader = self._get_dataset_loader(split="val") + if self.config.shuffle: + train_loader.dataset.shuffle_data() self.trainer.fit(self.model, train_loader, val_loader) # Set model to the best model. If the validation process is skipped during From c8e4c9e29c695afc187a86e30f5d0bd7ba66264b Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Fri, 30 Jun 2023 14:27:18 +0800 Subject: [PATCH 07/19] Remove unused settings --- libmultilabel/nn/data_utils.py | 49 ++++++++-------------------- libmultilabel/nn/model.py | 22 ++----------- libmultilabel/nn/networks/laat.py | 13 ++++---- libmultilabel/nn/networks/modules.py | 9 ++--- libmultilabel/nn/nn_utils.py | 2 -- torch_trainer.py | 11 +++---- 6 files changed, 30 insertions(+), 76 deletions(-) diff --git a/libmultilabel/nn/data_utils.py b/libmultilabel/nn/data_utils.py index 4bc4aa68..006ed241 100644 --- a/libmultilabel/nn/data_utils.py +++ b/libmultilabel/nn/data_utils.py @@ -1,7 +1,6 @@ import csv import gc import logging -import random import warnings import pandas as pd @@ -46,13 +45,12 @@ def __init__( tokenizer=None, word_dict=None, ): - # self.data = data + self.data = data self.classes = classes self.max_seq_length = max_seq_length self.word_dict = word_dict self.tokenizer = tokenizer self.add_special_tokens = add_special_tokens - self.data = self.sort_by_length(data) # LAAT self.num_classes = len(self.classes) self.label_binarizer = MultiLabelBinarizer().fit([classes]) @@ -79,14 +77,6 @@ def __getitem__(self, index): "label": torch.IntTensor(self.label_binarizer.transform([data["label"]])[0]), } - def sort_by_length(self, data): - # reverse for LAAT - return sorted(data, key=lambda x: -len(x['text'][:self.max_seq_length])) - - def shuffle_data(self): - # LAAT - random.shuffle(self.data) - def tokenize(text): """Tokenize text. 
@@ -150,8 +140,7 @@ def get_dataset_loader( dataset_loader = torch.utils.data.DataLoader( dataset, batch_size=batch_size, - # shuffle=shuffle, - shuffle=False, # use TextDataset.shuffle_data() + shuffle=shuffle, num_workers=data_workers, collate_fn=generate_batch, pin_memory="cuda" in device.type, @@ -380,8 +369,7 @@ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache=N vector_dict = {} for word_vector in tqdm(word_vectors, disable=silent): word, vector = word_vector.rstrip().split(" ", 1) - # vector = torch.Tensor(list(map(float, vector.split()))) - vector = list(map(float, vector.split())) # LAAT + vector = torch.Tensor(list(map(float, vector.split()))) vector_dict[word] = vector else: logging.info(f"Load pretrained embedding from torchtext.") @@ -394,21 +382,15 @@ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache=N vector_dict = pretrained_aliases[embed_file](cache=cache) embed_size = vector_dict.dim - # LAAT init: set default to UNK vector - import numpy as np - unk_vector = np.random.uniform(-0.25, 0.25, embed_size) - embedding_weights = [unk_vector] * (len(word_dict)) - embedding_weights[0] = np.zeros(embed_size) - - # embedding_weights = torch.zeros(len(word_dict), embed_size) - # if load_embedding_from_file: - # # Add UNK embedding - # # AttentionXML: np.random.uniform(-1.0, 1.0, embed_size) - # # LAAT: np.random.uniform(-0.25, 0.25, embedding_size) - # # CAML: np.random.randn(embed_size) - # import numpy as np - # unk_vector = torch.tensor(np.random.uniform(-0.25, 0.25, embed_size)) - # embedding_weights[word_dict[UNK]] = unk_vector + embedding_weights = torch.zeros(len(word_dict), embed_size) + + if load_embedding_from_file: + # Add UNK embedding + # AttentionXML: np.random.uniform(-1.0, 1.0, embed_size) + # LAAT: np.random.uniform(-0.25, 0.25, embedding_size) + # CAML: np.random.randn(embed_size) + unk_vector = torch.randn(embed_size) + embedding_weights[word_dict[UNK]] = unk_vector # Store pretrained word embedding vec_counts = 0 @@ -422,9 +404,4 @@ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache=N logging.info(f"loaded {vec_counts}/{len(word_dict)} word embeddings") - # For resolving `UserWarning: Creating a tensor from a list of numpy.ndarrays is - # extremely slow. 
Please consider converting the list to a single numpy.ndarray - # with numpy.array() before converting to a tensor.` - embedding_weights = np.array(embedding_weights, dtype=np.float32) - return torch.FloatTensor(embedding_weights) # LAAT - # return embedding_weights + return embedding_weights diff --git a/libmultilabel/nn/model.py b/libmultilabel/nn/model.py index 50eb2e10..fb9290d9 100644 --- a/libmultilabel/nn/model.py +++ b/libmultilabel/nn/model.py @@ -2,7 +2,6 @@ import numpy as np import pytorch_lightning as pl -from pytorch_lightning.utilities.types import EPOCH_OUTPUT import torch import torch.nn.functional as F import torch.optim as optim @@ -92,8 +91,8 @@ def configure_optimizers(self): else: raise RuntimeError("Unsupported optimizer: {self.optimizer}") - # torch.nn.utils.clip_grad_value_(parameters, 0.5) - # LAAT hard code + torch.nn.utils.clip_grad_value_(parameters, 0.5) + # LAAT hard code (Shao-Syuan) if self.val_metric is not None: return { "optimizer": optimizer, @@ -109,20 +108,9 @@ def configure_optimizers(self): return optimizer def training_step(self, batch, batch_idx): - # LAAT - idx = torch.argsort(batch['length'], descending=True, stable=True) - sorted_batch = {k: v[idx] for k, v in batch.items()} - loss, _ = self.shared_step(sorted_batch) - loss = self.num_classes * loss # LAAT - loss = loss / self.num_classes + loss, _ = self.shared_step(batch) return loss - def training_epoch_end(self, outputs: EPOCH_OUTPUT) -> None: - print(f'Reshuffling the data') - if self.shuffle: - self.trainer.train_dataloader.dataset.datasets.shuffle_data() - return super().training_epoch_end(outputs) - def validation_step(self, batch, batch_idx): return self._shared_eval_step(batch, batch_idx) @@ -130,10 +118,6 @@ def validation_step_end(self, batch_parts): return self._shared_eval_step_end(batch_parts) def validation_epoch_end(self, step_outputs): - # print learning rate (LAAT) - lightning_optimizer = self.optimizers() - for param_group in lightning_optimizer.optimizer.param_groups: - print(f"\nLearning Rate: {param_group['lr']}\n") return self._shared_eval_epoch_end(step_outputs, "val") def test_step(self, batch, batch_idx): diff --git a/libmultilabel/nn/networks/laat.py b/libmultilabel/nn/networks/laat.py index 79db83e4..a995f27b 100644 --- a/libmultilabel/nn/networks/laat.py +++ b/libmultilabel/nn/networks/laat.py @@ -28,7 +28,8 @@ def __init__( ): super(LAAT, self).__init__() - self.embedding = Embedding(embed_vecs, embed_dropout, freeze_embed) + self.embedding = Embedding( + embed_vecs, dropout=embed_dropout, freeze_embed=freeze_embed) self.num_layers = num_layers self.rnn_dim = rnn_dim @@ -37,8 +38,8 @@ def __init__( input_size=embed_vecs.shape[1], hidden_size=rnn_dim//2, num_layers=num_layers, dropout=encoder_dropout) - mean = 0.0 - std = 0.03 + # mean = 0.0 + # std = 0.03 # first linear self.W = nn.Linear(rnn_dim, d_a, bias=False) @@ -51,9 +52,9 @@ def __init__( # Final layer: create a matrix to use for the #labels binary classifiers self.output = nn.Linear(rnn_dim, num_classes, bias=True) - torch.nn.init.normal_(self.W.weight, mean, std) - torch.nn.init.normal_(self.Q.weight, mean, std) - torch.nn.init.normal_(self.output.weight, mean, std) + # torch.nn.init.normal_(self.W.weight, mean, std) + # torch.nn.init.normal_(self.Q.weight, mean, std) + # torch.nn.init.normal_(self.output.weight, mean, std) def forward(self, input): # Get embeddings and apply dropout diff --git a/libmultilabel/nn/networks/modules.py b/libmultilabel/nn/networks/modules.py index 22ec606e..b900045e 100644 
--- a/libmultilabel/nn/networks/modules.py +++ b/libmultilabel/nn/networks/modules.py @@ -17,12 +17,8 @@ class Embedding(nn.Module): def __init__(self, embed_vecs, dropout=0.2, freeze_embed=False): super(Embedding, self).__init__() - # self.embedding = nn.Embedding.from_pretrained( - # embed_vecs, freeze=freeze_embed, padding_idx=0) # LAAT - - self.embedding = nn.Embedding(embed_vecs.shape[0], embed_vecs.shape[1]) - self.embedding.weight = nn.Parameter(copy.deepcopy( - embed_vecs), requires_grad=False) + self.embedding = nn.Embedding.from_pretrained( + embed_vecs, freeze=freeze_embed, padding_idx=0) self.dropout = nn.Dropout(dropout) def forward(self, input): @@ -41,6 +37,7 @@ class RNNEncoder(ABC, nn.Module): def __init__(self, input_size, hidden_size, num_layers, dropout=0): super(RNNEncoder, self).__init__() + # Li-Chung: PR316 # self.rnn = self._get_rnn(input_size, hidden_size, num_layers) # self.dropout = nn.Dropout(dropout) self.rnn = self._get_rnn(input_size, hidden_size, num_layers, dropout) diff --git a/libmultilabel/nn/nn_utils.py b/libmultilabel/nn/nn_utils.py index 924731be..8eabf02c 100644 --- a/libmultilabel/nn/nn_utils.py +++ b/libmultilabel/nn/nn_utils.py @@ -52,7 +52,6 @@ def init_model( loss_function="binary_cross_entropy_with_logits", silent=False, save_k_predictions=0, - shuffle=False, # LAAT ): """Initialize a `Model` class for initializing and training a neural network. @@ -109,7 +108,6 @@ def init_model( loss_function=loss_function, silent=silent, save_k_predictions=save_k_predictions, - shuffle=shuffle, # LAAT ) return model diff --git a/torch_trainer.py b/torch_trainer.py index e8dbffbf..1035d148 100644 --- a/torch_trainer.py +++ b/torch_trainer.py @@ -124,10 +124,10 @@ def _setup_model( if self.config.embed_file is not None: logging.info("Load word dictionary ") word_dict, embed_vecs = data_utils.load_or_build_text_dict( - # dataset=self.datasets["train"], - # LAAT - dataset=self.datasets["train"] + \ - self.datasets["val"] + self.datasets["test"], + dataset=self.datasets["train"], + # # LAAT + # dataset=self.datasets["train"] + \ + # self.datasets["val"] + self.datasets["test"], vocab_file=self.config.vocab_file, min_vocab_freq=self.config.min_vocab_freq, embed_file=self.config.embed_file, @@ -172,7 +172,6 @@ def _setup_model( loss_function=self.config.loss_function, silent=self.config.silent, save_k_predictions=self.config.save_k_predictions, - shuffle=self.config.shuffle, # LAAT ) def _get_dataset_loader(self, split, shuffle=False): @@ -212,8 +211,6 @@ def train(self): self.trainer.fit(self.model, train_loader) else: val_loader = self._get_dataset_loader(split="val") - if self.config.shuffle: - train_loader.dataset.shuffle_data() self.trainer.fit(self.model, train_loader, val_loader) # Set model to the best model. If the validation process is skipped during From 1bd97c760b296123d45c9707db2af7735518990a Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Sun, 2 Jul 2023 19:46:27 +0800 Subject: [PATCH 08/19] Add eps for Adam --- libmultilabel/nn/model.py | 21 ++++++++++----------- libmultilabel/nn/nn_utils.py | 3 +++ main.py | 3 +++ torch_trainer.py | 1 + 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/libmultilabel/nn/model.py b/libmultilabel/nn/model.py index fb9290d9..dba0194f 100644 --- a/libmultilabel/nn/model.py +++ b/libmultilabel/nn/model.py @@ -19,6 +19,7 @@ class MultiLabelModel(pl.LightningModule): optimizer (str, optional): Optimizer name (i.e., sgd, adam, or adamw). Defaults to 'adam'. 
momentum (float, optional): Momentum factor for SGD only. Defaults to 0.9. weight_decay (int, optional): Weight decay factor. Defaults to 0. + eps (float, optional): Epsilon of Adam-based optimizer (e.g., adam, adamw, or adamax). Defaults to 1e-08. metric_threshold (float, optional): The decision value threshold over which a label is predicted as positive. Defaults to 0.5. monitor_metrics (list, optional): Metrics to monitor while validating. Defaults to None. log_path (str): Path to a directory holding the log files and models. @@ -34,14 +35,14 @@ def __init__( optimizer="adam", momentum=0.9, weight_decay=0, + eps=1e-08, metric_threshold=0.5, monitor_metrics=None, log_path=None, multiclass=False, silent=False, save_k_predictions=0, - val_metric='Micro-F1', # LAAT - shuffle=True, + val_metric='Micro-F1', # LAAT (remove after PR317 merged to master) **kwargs ): super().__init__() @@ -51,6 +52,7 @@ def __init__( self.optimizer = optimizer self.momentum = momentum self.weight_decay = weight_decay + self.eps = eps # dump log self.log_path = log_path @@ -61,8 +63,7 @@ def __init__( self.multiclass = multiclass top_k = 1 if self.multiclass else None self.eval_metric = get_metrics(metric_threshold, monitor_metrics, num_classes, top_k=top_k) - self.val_metric = val_metric # LAAT - self.shuffle = shuffle # LAAT + self.val_metric = val_metric # LAAT (remove after PR317 merged to master) self.num_classes = num_classes @abstractmethod @@ -79,15 +80,13 @@ def configure_optimizers(self): parameters, self.learning_rate, momentum=self.momentum, weight_decay=self.weight_decay ) elif optimizer_name == "adam": - optimizer = optim.Adam(parameters, weight_decay=self.weight_decay, lr=self.learning_rate) + optimizer = optim.Adam(parameters, weight_decay=self.weight_decay, eps=self.eps, lr=self.learning_rate) elif optimizer_name == "adamw": - # optimizer = optim.AdamW( - # parameters, weight_decay=self.weight_decay, lr=self.learning_rate) - from transformers import AdamW - optimizer = AdamW(filter(lambda p: p.requires_grad, self.parameters()), - lr=self.learning_rate, weight_decay=self.weight_decay) + optimizer = optim.AdamW( + parameters, weight_decay=self.weight_decay, eps=self.eps, lr=self.learning_rate) elif optimizer_name == "adamax": - optimizer = optim.Adamax(parameters, weight_decay=self.weight_decay, lr=self.learning_rate) + optimizer = optim.Adamax( + parameters, weight_decay=self.weight_decay, eps=self.eps, lr=self.learning_rate) else: raise RuntimeError("Unsupported optimizer: {self.optimizer}") diff --git a/libmultilabel/nn/nn_utils.py b/libmultilabel/nn/nn_utils.py index 8eabf02c..e5ac9f99 100644 --- a/libmultilabel/nn/nn_utils.py +++ b/libmultilabel/nn/nn_utils.py @@ -46,6 +46,7 @@ def init_model( optimizer="adam", momentum=0.9, weight_decay=0, + eps=1e-08, metric_threshold=0.5, monitor_metrics=None, multiclass=False, @@ -71,6 +72,7 @@ def init_model( optimizer (str, optional): Optimizer name (i.e., sgd, adam, or adamw). Defaults to 'adam'. momentum (float, optional): Momentum factor for SGD only. Defaults to 0.9. weight_decay (int, optional): Weight decay factor. Defaults to 0. + eps (float, optional): Epsilon of Adam-based optimizer (e.g., adam, adamw, or adamax). Defaults to 1e-08. metric_threshold (float, optional): The decision value threshold over which a label is predicted as positive. Defaults to 0.5. monitor_metrics (list, optional): Metrics to monitor while validating. Defaults to None. multiclass (bool, optional): Enable multiclass mode. Defaults to False. 
@@ -102,6 +104,7 @@ def init_model( optimizer=optimizer, momentum=momentum, weight_decay=weight_decay, + eps=eps, metric_threshold=metric_threshold, monitor_metrics=monitor_metrics, multiclass=multiclass, diff --git a/main.py b/main.py index e45c6942..ef31bb0b 100644 --- a/main.py +++ b/main.py @@ -86,6 +86,9 @@ def add_all_arguments(parser): parser.add_argument( "--momentum", type=float, default=0.9, help="Momentum factor for SGD only (default: %(default)s)" ) + parser.add_argument( + "--eps", type=float, default=1e-08, help="Epsilon of Adam-based optimizer (e.g., adam, adamw, or adamax) (default: %(default)s)" + ) parser.add_argument( "--patience", type=int, diff --git a/torch_trainer.py b/torch_trainer.py index 1035d148..e1f82c01 100644 --- a/torch_trainer.py +++ b/torch_trainer.py @@ -166,6 +166,7 @@ def _setup_model( optimizer=self.config.optimizer, momentum=self.config.momentum, weight_decay=self.config.weight_decay, + eps=self.config.eps, metric_threshold=self.config.metric_threshold, monitor_metrics=self.config.monitor_metrics, multiclass=self.config.multiclass, From 8f41aeea2eabc13006d5eb675170370ec3d3b86e Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Sun, 2 Jul 2023 19:50:38 +0800 Subject: [PATCH 09/19] Add eps to laat configs. --- example_config/MIMICIV-ICD10-50/laat.yml | 1 + example_config/MIMICIV-ICD10-50/laat_tune.yml | 1 + example_config/MIMICIV-ICD9-50/laat.yml | 1 + example_config/MIMICIV-ICD9-50/laat_tune.yml | 1 + 4 files changed, 4 insertions(+) diff --git a/example_config/MIMICIV-ICD10-50/laat.yml b/example_config/MIMICIV-ICD10-50/laat.yml index b9ee7d22..d9800bb1 100644 --- a/example_config/MIMICIV-ICD10-50/laat.yml +++ b/example_config/MIMICIV-ICD10-50/laat.yml @@ -15,6 +15,7 @@ batch_size: 8 optimizer: adamw learning_rate: 0.001 weight_decay: 0 +eps: 1e-06 patience: 6 shuffle: true diff --git a/example_config/MIMICIV-ICD10-50/laat_tune.yml b/example_config/MIMICIV-ICD10-50/laat_tune.yml index 558ac3be..ad3b60ce 100644 --- a/example_config/MIMICIV-ICD10-50/laat_tune.yml +++ b/example_config/MIMICIV-ICD10-50/laat_tune.yml @@ -17,6 +17,7 @@ optimizer: adamw learning_rate: 0.001 momentum: 0.9 weight_decay: 0 +eps: 1e-06 patience: 6 early_stopping_metric: Micro-F1 shuffle: true diff --git a/example_config/MIMICIV-ICD9-50/laat.yml b/example_config/MIMICIV-ICD9-50/laat.yml index 0fd281b3..d795ce50 100644 --- a/example_config/MIMICIV-ICD9-50/laat.yml +++ b/example_config/MIMICIV-ICD9-50/laat.yml @@ -15,6 +15,7 @@ batch_size: 8 optimizer: adamw learning_rate: 0.001 weight_decay: 0 +eps: 1e-06 patience: 6 shuffle: true diff --git a/example_config/MIMICIV-ICD9-50/laat_tune.yml b/example_config/MIMICIV-ICD9-50/laat_tune.yml index 95ded40f..89c8d6dd 100644 --- a/example_config/MIMICIV-ICD9-50/laat_tune.yml +++ b/example_config/MIMICIV-ICD9-50/laat_tune.yml @@ -17,6 +17,7 @@ optimizer: adamw learning_rate: ['grid_search', [0.001, 0.0003]] # LAAT: 0.001 momentum: 0.9 weight_decay: 0 +eps: 1e-06 patience: 6 early_stopping_metric: Micro-F1 shuffle: true From 0a180953f6fadbcd7b936eed9631a038523a48f7 Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Sun, 2 Jul 2023 19:59:33 +0800 Subject: [PATCH 10/19] Clean up LAAT. 
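
The commented-out normal(mean=0.0, std=0.03) initialization experiments are removed
from laat.py, and init_weight is set to null in the example configs. The output layer
is unchanged: one binary classifier per label is applied to that label's own document
vector E[:, l, :]. A purely illustrative reading of the mul/sum used in forward(),
with assumed shapes rather than config values:

    import torch
    import torch.nn as nn

    rnn_dim, num_classes = 512, 50                         # illustrative sizes only
    output = nn.Linear(rnn_dim, num_classes, bias=True)    # final per-label classifiers
    E = torch.randn(8, num_classes, rnn_dim)               # label-wise document vectors

    logits = output.weight.mul(E).sum(dim=2).add(output.bias)           # (batch, num_classes)
    same = torch.einsum("blh,lh->bl", E, output.weight) + output.bias   # equivalent per-label dot product
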
--- example_config/MIMICIV-ICD10-50/laat.yml | 2 +- example_config/MIMICIV-ICD10-50/laat_tune.yml | 2 +- example_config/MIMICIV-ICD9-50/laat.yml | 2 +- example_config/MIMICIV-ICD9-50/laat_tune.yml | 2 +- libmultilabel/nn/networks/laat.py | 12 ++---------- 5 files changed, 6 insertions(+), 14 deletions(-) diff --git a/example_config/MIMICIV-ICD10-50/laat.yml b/example_config/MIMICIV-ICD10-50/laat.yml index d9800bb1..1e82582e 100644 --- a/example_config/MIMICIV-ICD10-50/laat.yml +++ b/example_config/MIMICIV-ICD10-50/laat.yml @@ -26,7 +26,7 @@ val_metric: Micro-F1 # model model_name: LAAT -init_weight: kaiming_uniform +init_weight: null network_config: embed_dropout: 0.3 encoder_dropout: 0 diff --git a/example_config/MIMICIV-ICD10-50/laat_tune.yml b/example_config/MIMICIV-ICD10-50/laat_tune.yml index ad3b60ce..55e1c48e 100644 --- a/example_config/MIMICIV-ICD10-50/laat_tune.yml +++ b/example_config/MIMICIV-ICD10-50/laat_tune.yml @@ -30,7 +30,7 @@ val_metric: Micro-F1 # model model_name: LAAT loss_function: binary_cross_entropy_with_logits -init_weight: kaiming_uniform +init_weight: null network_config: embed_dropout: 0.3 encoder_dropout: 0 diff --git a/example_config/MIMICIV-ICD9-50/laat.yml b/example_config/MIMICIV-ICD9-50/laat.yml index d795ce50..0ef83638 100644 --- a/example_config/MIMICIV-ICD9-50/laat.yml +++ b/example_config/MIMICIV-ICD9-50/laat.yml @@ -26,7 +26,7 @@ val_metric: Micro-F1 # model model_name: LAAT -init_weight: kaiming_uniform +init_weight: null network_config: embed_dropout: 0.3 encoder_dropout: 0 diff --git a/example_config/MIMICIV-ICD9-50/laat_tune.yml b/example_config/MIMICIV-ICD9-50/laat_tune.yml index 89c8d6dd..4c0dd79a 100644 --- a/example_config/MIMICIV-ICD9-50/laat_tune.yml +++ b/example_config/MIMICIV-ICD9-50/laat_tune.yml @@ -30,7 +30,7 @@ val_metric: Micro-F1 # model model_name: LAAT loss_function: binary_cross_entropy_with_logits -init_weight: kaiming_uniform +init_weight: null network_config: embed_dropout: 0.3 encoder_dropout: ['grid_search', [0, 0.2, 0.4]] # LAAT: 0 diff --git a/libmultilabel/nn/networks/laat.py b/libmultilabel/nn/networks/laat.py index a995f27b..1af8b8f9 100644 --- a/libmultilabel/nn/networks/laat.py +++ b/libmultilabel/nn/networks/laat.py @@ -38,33 +38,25 @@ def __init__( input_size=embed_vecs.shape[1], hidden_size=rnn_dim//2, num_layers=num_layers, dropout=encoder_dropout) - # mean = 0.0 - # std = 0.03 - # first linear self.W = nn.Linear(rnn_dim, d_a, bias=False) """Context vectors for computing attention with (in_features, out_features) = (d_a, num_classes) """ - # second linear (U in the paper) self.Q = nn.Linear(d_a, num_classes, bias=False) # Final layer: create a matrix to use for the #labels binary classifiers self.output = nn.Linear(rnn_dim, num_classes, bias=True) - # torch.nn.init.normal_(self.W.weight, mean, std) - # torch.nn.init.normal_(self.Q.weight, mean, std) - # torch.nn.init.normal_(self.output.weight, mean, std) - def forward(self, input): # Get embeddings and apply dropout x = self.embedding(input['text']) # (batch_size, length, embed_dim) H = self.encoder(x, input['length']) # (batch_size, length, rnn_dim) - # (4) Z = tanh(WH), W: (d_a * 2u), H: (2u * N), Z: (d_a * N) + # Equation (4) Z = tanh(WH), W: (d_a * 2u), H: (2u * N), Z: (d_a * N) Z = torch.tanh(self.W(H)) # (batch_size, length, d_a) - # (5) A = softmax(UZ), A: (batch_size, class_num, length) + # Equation (5) A = softmax(UZ), A: (batch_size, class_num, length) # Q: (|L| * d_a), Z: (d_a * N), A: |L| * N alpha = self.Q(Z) alpha = torch.softmax(alpha, 
1).transpose(1, 2) From 6b97f832222f650a5ee3fc537f427942451bf029 Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Thu, 6 Jul 2023 23:45:44 +0800 Subject: [PATCH 11/19] Reduce LAAT changes (use CAML implementation), train+val as discussed --- libmultilabel/nn/networks/laat.py | 6 +++--- torch_trainer.py | 7 +++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/libmultilabel/nn/networks/laat.py b/libmultilabel/nn/networks/laat.py index 1af8b8f9..bbe9ad23 100644 --- a/libmultilabel/nn/networks/laat.py +++ b/libmultilabel/nn/networks/laat.py @@ -58,9 +58,9 @@ def forward(self, input): # Equation (5) A = softmax(UZ), A: (batch_size, class_num, length) # Q: (|L| * d_a), Z: (d_a * N), A: |L| * N - alpha = self.Q(Z) - alpha = torch.softmax(alpha, 1).transpose(1, 2) - # alpha = torch.softmax(self.Q.weight.matmul(Z.transpose(1, 2)), dim=2) + # alpha = self.Q(Z) + # alpha = torch.softmax(alpha, 1).transpose(1, 2) + alpha = torch.softmax(self.Q.weight.matmul(Z.transpose(1, 2)), dim=2) # Document representations are weighted sums using the attention E = alpha.matmul(H) diff --git a/torch_trainer.py b/torch_trainer.py index d2fe0e90..e7e2a11c 100644 --- a/torch_trainer.py +++ b/torch_trainer.py @@ -124,10 +124,9 @@ def _setup_model( if self.config.embed_file is not None: logging.info("Load word dictionary ") word_dict, embed_vecs = data_utils.load_or_build_text_dict( - dataset=self.datasets["train"], - # # LAAT - # dataset=self.datasets["train"] + \ - # self.datasets["val"] + self.datasets["test"], + # add vocab in the validation set + # CAML: train, LAAT: train, val, and test + dataset=self.datasets["train"] + self.datasets.get("val", []), vocab_file=self.config.vocab_file, min_vocab_freq=self.config.min_vocab_freq, embed_file=self.config.embed_file, From 4f8b86a2fed4454f1cc62b46f5dd0280ea4a6a01 Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Fri, 7 Jul 2023 00:38:38 +0800 Subject: [PATCH 12/19] Remove unused variables. 
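
This drops the leftover copy import, the unused self.num_classes attribute, and the
stable-argsort experiment, reverting the RNN encoder to a batch-first bidirectional
LSTM with the usual pack/pad round-trip: sort by length, pack, run the LSTM, pad back,
and restore the original order. A rough stand-alone sketch of that pattern, using a
plain nn.LSTM with made-up sizes instead of the actual LSTMEncoder class:

    import torch
    import torch.nn as nn
    from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

    rnn = nn.LSTM(100, 256, num_layers=1, batch_first=True, bidirectional=True)
    x = torch.randn(8, 500, 100)                       # (batch, length, embed_dim)
    length = torch.randint(1, 500, (8,))               # token counts per document

    idx = torch.argsort(length, descending=True)       # sort by length for packing
    packed = pack_padded_sequence(x[idx], length[idx].cpu().clamp(min=1), batch_first=True)
    out, _ = pad_packed_sequence(rnn(packed)[0], batch_first=True)
    out = out[torch.argsort(idx)]                      # restore the original order
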
--- libmultilabel/nn/model.py | 1 - libmultilabel/nn/networks/modules.py | 6 ++---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/libmultilabel/nn/model.py b/libmultilabel/nn/model.py index 1b00a786..c450eb4f 100644 --- a/libmultilabel/nn/model.py +++ b/libmultilabel/nn/model.py @@ -70,7 +70,6 @@ def __init__( self.multiclass = multiclass top_k = 1 if self.multiclass else None self.eval_metric = get_metrics(metric_threshold, monitor_metrics, num_classes, top_k=top_k) - self.num_classes = num_classes @abstractmethod def shared_step(self, batch): diff --git a/libmultilabel/nn/networks/modules.py b/libmultilabel/nn/networks/modules.py index b900045e..4a9c1a3c 100644 --- a/libmultilabel/nn/networks/modules.py +++ b/libmultilabel/nn/networks/modules.py @@ -4,7 +4,6 @@ import torch.nn as nn import torch.nn.functional as F from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence -import copy class Embedding(nn.Module): @@ -44,7 +43,7 @@ def __init__(self, input_size, hidden_size, num_layers, dropout=0): def forward(self, input, length, **kwargs): self.rnn.flatten_parameters() - idx = torch.argsort(length, descending=True, stable=True) + idx = torch.argsort(length, descending=True) length_clamped = length[idx].cpu().clamp(min=1) # avoid the empty text with length 0 packed_input = pack_padded_sequence(input[idx], length_clamped, batch_first=True) outputs, _ = pad_packed_sequence(self.rnn(packed_input)[0], batch_first=True) @@ -86,8 +85,7 @@ def __init__(self, input_size, hidden_size, num_layers, dropout=0): super(LSTMEncoder, self).__init__(input_size, hidden_size, num_layers, dropout) def _get_rnn(self, input_size, hidden_size, num_layers, dropout): - return nn.LSTM(input_size, hidden_size, num_layers, bidirectional=True, dropout=dropout) - # return nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True) + return nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True, dropout=dropout) class CNNEncoder(nn.Module): From 2e271a97e487ce414455a56b3b4510c49e21034e Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Fri, 7 Jul 2023 16:23:06 +0800 Subject: [PATCH 13/19] Update MIMIC tune configs. 
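
The new example_config/MIMIC/laat_tune.yml (and the updated MIMICIV ones) specify the
ReduceLROnPlateau schedule in the config (factor 0.9, patience 5, min_lr 0.0001),
matching the values hard-coded for LAAT in model.py earlier in this series, and sweep
the hyperparameters with the ['grid_search', [...]] entries used by the other tuning
configs, which correspond to Ray Tune grid-search entries. A hypothetical, stand-alone
equivalent of that search space in plain Ray Tune (not the project's actual config
loader):

    from ray import tune

    search_space = {
        "learning_rate": tune.grid_search([0.001, 0.0003]),
        "encoder_dropout": tune.grid_search([0, 0.2, 0.4]),
        "rnn_dim": tune.grid_search([512, 768, 1024]),
        "d_a": tune.grid_search([256, 384, 512]),
    }
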
--- example_config/MIMIC/laat_tune.yml | 81 +++++++++++++++++++ example_config/MIMICIV-ICD10-50/laat_tune.yml | 14 +++- example_config/MIMICIV-ICD9-50/laat_tune.yml | 14 +++- 3 files changed, 101 insertions(+), 8 deletions(-) create mode 100644 example_config/MIMIC/laat_tune.yml diff --git a/example_config/MIMIC/laat_tune.yml b/example_config/MIMIC/laat_tune.yml new file mode 100644 index 00000000..b0baea8d --- /dev/null +++ b/example_config/MIMIC/laat_tune.yml @@ -0,0 +1,81 @@ +# data +training_file: data/MIMIC/train.txt +val_file: data/MIMIC/valid.txt +test_file: data/MIMIC/test.txt +data_name: MIMIC +min_vocab_freq: 1 +max_seq_length: 4000 +include_test_labels: true +remove_no_label_data: true +add_special_tokens: false + +# train +seed: 0 +epochs: 50 +batch_size: 8 +optimizer: adamw +learning_rate: ['grid_search', [0.001, 0.0003]] +eps: 0.00000001 # 1e-08 (expand str for ray) +momentum: 0.9 # not used +weight_decay: 0 +patience: 6 +early_stopping_metric: Micro-F1 +shuffle: true +lr_scheduler: ReduceLROnPlateau +scheduler_config: + factor: 0.9 + patience: 5 + min_lr: 0.0001 + +# eval +eval_batch_size: 8 +monitor_metrics: ['Another-Macro-F1', 'Macro-F1', 'Micro-F1', 'P@5', 'P@8', 'P@15'] +val_metric: Micro-F1 + +# model +model_name: LAAT +loss_function: binary_cross_entropy_with_logits +init_weight: kaiming_uniform +network_config: + embed_dropout: ['grid_search', [0, 0.2, 0.4, 0.6, 0.8]] + encoder_dropout: ['grid_search', [0, 0.2, 0.4]] # LAAT: 0 + rnn_dim: ['grid_search', [512, 768, 1024]] # LAAT: 512 + num_layers: 1 + d_a: ['grid_search', [256, 384, 512]] # LAAT: 256 + freeze_embed: false + +# pretrained vocab / embeddings +# vocab_file: data/MIMIC/vocab.csv +embed_file: data/MIMIC-50/word2vec_sg0_100.embed +normalize_embed: false + +# hyperparamter search +search_alg: basic_variant +embed_cache_dir: .vector_cache +num_samples: 1 +scheduler: null +no_merge_train_val: true # do not retrain + +# Uncomment the following lines to enable the ASHAScheduler. +# See the documentation here: https://docs.ray.io/en/latest/tune/api_docs/schedulers.html#asha-tune-schedulers-ashascheduler +# scheduler: +# time_attr: training_iteration +# max_t: 50 # the maximum epochs to run for each config (parameter R in the ASHA paper) +# grace_period: 10 # the minimum epochs to run for each config (parameter r in the ASHA paper) +# reduction_factor: 3 # reduce the number of configuration to floor(1/reduction_factor) each round of successive halving (called rung in ASHA paper) +# brackets: 1 # number of brackets. 
A smaller bracket index (parameter s in the ASHA paper) means earlier stopping (i.e., less total resources used) + +# other parameters specified in main.py::get_args +checkpoint_path: null +cpu: false +data_workers: 8 +eval: false +label_file: null +limit_train_batches: 1.0 +limit_val_batches: 1.0 +limit_test_batches: 1.0 +metric_threshold: 0.5 +result_dir: runs +save_k_predictions: 0 +silent: true +val_size: 0.2 \ No newline at end of file diff --git a/example_config/MIMICIV-ICD10-50/laat_tune.yml b/example_config/MIMICIV-ICD10-50/laat_tune.yml index 55e1c48e..eb219478 100644 --- a/example_config/MIMICIV-ICD10-50/laat_tune.yml +++ b/example_config/MIMICIV-ICD10-50/laat_tune.yml @@ -15,28 +15,34 @@ epochs: 50 batch_size: 8 optimizer: adamw learning_rate: 0.001 +eps: 0.00000001 # 1e-08 (expand str for ray) momentum: 0.9 weight_decay: 0 -eps: 1e-06 patience: 6 early_stopping_metric: Micro-F1 shuffle: true +lr_scheduler: ReduceLROnPlateau +scheduler_config: + factor: 0.9 + patience: 5 + min_lr: 0.0001 # eval eval_batch_size: 8 -monitor_metrics: ['Loss', 'Another-Macro-F1', 'Macro-F1', 'Micro-F1', 'P@5', 'P@8', 'P@15'] +monitor_metrics: ['Another-Macro-F1', 'Macro-F1', 'Micro-F1', 'P@5', 'P@8', 'P@15'] val_metric: Micro-F1 # model model_name: LAAT loss_function: binary_cross_entropy_with_logits -init_weight: null +init_weight: kaiming_uniform # null network_config: embed_dropout: 0.3 encoder_dropout: 0 rnn_dim: ['grid_search', [512, 1024, 768, 256]] # 2u num_layers: 1 d_a: ['grid_search', [256, 512, 384, 128]] + freeze_embed: false # pretrained vocab / embeddings vocab_file: data/MIMICIV-ICD10-50/vocab.csv # 179,131, min_vocab_freq=1 @@ -45,7 +51,7 @@ normalize_embed: false # hyperparamter search search_alg: basic_variant -embed_cache_dir: null +embed_cache_dir: .vector_cache num_samples: 1 scheduler: null no_merge_train_val: true # do not retrain diff --git a/example_config/MIMICIV-ICD9-50/laat_tune.yml b/example_config/MIMICIV-ICD9-50/laat_tune.yml index 4c0dd79a..03ba5222 100644 --- a/example_config/MIMICIV-ICD9-50/laat_tune.yml +++ b/example_config/MIMICIV-ICD9-50/laat_tune.yml @@ -15,28 +15,34 @@ epochs: 50 batch_size: 8 optimizer: adamw learning_rate: ['grid_search', [0.001, 0.0003]] # LAAT: 0.001 +eps: 0.00000001 momentum: 0.9 weight_decay: 0 -eps: 1e-06 patience: 6 early_stopping_metric: Micro-F1 shuffle: true +lr_scheduler: ReduceLROnPlateau +scheduler_config: + factor: 0.9 + patience: 5 + min_lr: 0.0001 # eval eval_batch_size: 8 -monitor_metrics: ['Loss', 'Another-Macro-F1', 'Macro-F1', 'Micro-F1', 'P@5', 'P@8', 'P@15'] +monitor_metrics: ['Another-Macro-F1', 'Macro-F1', 'Micro-F1', 'P@5', 'P@8', 'P@15'] val_metric: Micro-F1 # model model_name: LAAT loss_function: binary_cross_entropy_with_logits -init_weight: null +init_weight: kaiming_uniform # null network_config: embed_dropout: 0.3 encoder_dropout: ['grid_search', [0, 0.2, 0.4]] # LAAT: 0 rnn_dim: ['grid_search', [512, 768, 1024]] # LAAT: 512 num_layers: 1 d_a: ['grid_search', [256, 384, 512]] # LAAT: 256 + freeze_embed: false # true # pretrained vocab / embeddings vocab_file: data/MIMICIV-ICD9-50/vocab.csv # 282,173, min_vocab_freq=1 @@ -45,7 +51,7 @@ normalize_embed: false # hyperparamter search search_alg: basic_variant -embed_cache_dir: null +embed_cache_dir: .vector_cache num_samples: 1 scheduler: null no_merge_train_val: true # do not retrain From c04fcaca29de85de9596ce76a65ca4d239c12c12 Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Fri, 7 Jul 2023 16:28:28 +0800 Subject: [PATCH 14/19] Update MIMIC*/laat_tune.yml --- 
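Note: with vocab_file set to null, the vocabulary is built from the training text at min_vocab_freq=1 instead of being read from a prebuilt vocab.csv. The effect is roughly a frequency-thresholded word list; the sketch below is illustrative only (the tab-separated id/labels/text layout, the special tokens, and the build_vocab name are assumptions, not the library's implementation):

from collections import Counter

def build_vocab(train_path, min_vocab_freq=1):
    """Count whitespace tokens in the text field and keep the frequent ones."""
    counter = Counter()
    with open(train_path, encoding="utf-8") as f:
        for line in f:
            text = line.rstrip("\n").split("\t")[-1]  # assumed format: id<TAB>labels<TAB>text
            counter.update(text.split())
    kept = [word for word, count in sorted(counter.items()) if count >= min_vocab_freq]
    return ["<pad>", "<unk>"] + kept  # special tokens assumed

# Example: vocab = build_vocab("data/MIMIC/train.txt", min_vocab_freq=1)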
example_config/MIMIC/laat_tune.yml | 4 ++-- example_config/MIMICIV-ICD9-50/laat_tune.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/example_config/MIMIC/laat_tune.yml b/example_config/MIMIC/laat_tune.yml index b0baea8d..e18aeb79 100644 --- a/example_config/MIMIC/laat_tune.yml +++ b/example_config/MIMIC/laat_tune.yml @@ -45,13 +45,13 @@ network_config: freeze_embed: false # pretrained vocab / embeddings -# vocab_file: data/MIMIC/vocab.csv +vocab_file: null # generate min_vocab_freq=1 by LibMultiLabel embed_file: data/MIMIC-50/word2vec_sg0_100.embed normalize_embed: false # hyperparamter search search_alg: basic_variant -embed_cache_dir: .vector_cache +embed_cache_dir: null num_samples: 1 scheduler: null no_merge_train_val: true # do not retrain diff --git a/example_config/MIMICIV-ICD9-50/laat_tune.yml b/example_config/MIMICIV-ICD9-50/laat_tune.yml index 03ba5222..80a04ba3 100644 --- a/example_config/MIMICIV-ICD9-50/laat_tune.yml +++ b/example_config/MIMICIV-ICD9-50/laat_tune.yml @@ -51,7 +51,7 @@ normalize_embed: false # hyperparamter search search_alg: basic_variant -embed_cache_dir: .vector_cache +embed_cache_dir: null num_samples: 1 scheduler: null no_merge_train_val: true # do not retrain From 15cedc3195da7c61d3b5bbdd4af7c60974bfb67b Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Wed, 12 Jul 2023 00:38:30 +0800 Subject: [PATCH 15/19] Add EUR-Lex laat_tune.yml. --- example_config/EUR-Lex/laat_tune.yml | 80 ++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 example_config/EUR-Lex/laat_tune.yml diff --git a/example_config/EUR-Lex/laat_tune.yml b/example_config/EUR-Lex/laat_tune.yml new file mode 100644 index 00000000..3bdb9f23 --- /dev/null +++ b/example_config/EUR-Lex/laat_tune.yml @@ -0,0 +1,80 @@ +# data +training_file: data/EUR-Lex/train.txt +test_file: data/EUR-Lex/test.txt +data_name: EUR-Lex +min_vocab_freq: 1 +max_seq_length: 500 +include_test_labels: true # false +remove_no_label_data: true # false +add_special_tokens: false + +# train +seed: 0 # 1337 +epochs: 50 +batch_size: 16 +optimizer: adamw +learning_rate: ['grid_search', [0.001, 0.0003]] +eps: 0.00000001 # 1e-08 (expand str for ray) +momentum: 0.9 # not used +weight_decay: 0 +patience: 10 +early_stopping_metric: RP@5 +shuffle: true +lr_scheduler: ReduceLROnPlateau +scheduler_config: + factor: 0.9 + patience: 9 + min_lr: 0.0001 + +# eval +eval_batch_size: 16 +monitor_metrics: ['Another-Macro-F1', 'Macro-F1', 'Micro-F1', 'P@1', 'P@5', 'RP@5', 'nDCG@5'] +val_metric: RP@5 + +# model +model_name: LAAT +loss_function: binary_cross_entropy_with_logits +init_weight: kaiming_uniform +network_config: + embed_dropout: ['grid_search', [0, 0.2, 0.4]] # 0.3 + encoder_dropout: ['grid_search', [0, 0.2, 0.4, 0.6, 0.8]] # ['grid_search', [0, 0.2, 0.4]] + rnn_dim: ['grid_search', [256, 512, 1024]] # ['grid_search', [256, 512, 768, 1024]] + rnn_layers: 1 + d_a: ['grid_search', [128, 256, 512]] + freeze_embed: false + +# pretrained vocab / embeddings +vocab_file: null +embed_file: glove.6B.200d +normalize_embed: false + +# hyperparamter search +search_alg: basic_variant +embed_cache_dir: .vector_cache +num_samples: 1 +scheduler: null +no_merge_train_val: true # do not retrain + +# Uncomment the following lines to enable the ASHAScheduler. 
+# See the documentation here: https://docs.ray.io/en/latest/tune/api_docs/schedulers.html#asha-tune-schedulers-ashascheduler +#scheduler: + #time_attr: training_iteration + #max_t: 50 # the maximum epochs to run for each config (parameter R in the ASHA paper) + #grace_period: 10 # the minimum epochs to run for each config (parameter r in the ASHA paper) + #reduction_factor: 3 # reduce the number of configuration to floor(1/reduction_factor) each round of successive halving (called rung in ASHA paper) + #brackets: 1 # number of brackets. A smaller bracket index (parameter s in the ASHA paper) means earlier stopping (i.e., less total resources used) + +# other parameters specified in main.py::get_args +checkpoint_path: null +cpu: false +data_workers: 4 +eval: false +label_file: null +limit_train_batches: 1.0 +limit_val_batches: 1.0 +limit_test_batches: 1.0 +metric_threshold: 0.5 +result_dir: runs +save_k_predictions: 0 +silent: true +val_size: 0.2 From ab61dadd75e751a751b7f6491dba9dc502132fb7 Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Wed, 12 Jul 2023 00:57:20 +0800 Subject: [PATCH 16/19] Update search space. --- example_config/EUR-Lex/laat_tune.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/example_config/EUR-Lex/laat_tune.yml b/example_config/EUR-Lex/laat_tune.yml index 3bdb9f23..dda705d4 100644 --- a/example_config/EUR-Lex/laat_tune.yml +++ b/example_config/EUR-Lex/laat_tune.yml @@ -38,8 +38,8 @@ init_weight: kaiming_uniform network_config: embed_dropout: ['grid_search', [0, 0.2, 0.4]] # 0.3 encoder_dropout: ['grid_search', [0, 0.2, 0.4, 0.6, 0.8]] # ['grid_search', [0, 0.2, 0.4]] - rnn_dim: ['grid_search', [256, 512, 1024]] # ['grid_search', [256, 512, 768, 1024]] - rnn_layers: 1 + rnn_dim: ['grid_search', [1024, 512, 256]] # ['grid_search', [256, 512, 768, 1024]] + num_layers: 1 d_a: ['grid_search', [128, 256, 512]] freeze_embed: false From 31dab37bfe49adac04c8e705de0afddead1ff90a Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Wed, 12 Jul 2023 00:59:24 +0800 Subject: [PATCH 17/19] Reduce search space. 
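Dropping encoder_dropout 0.6 and rnn_dim 256 shrinks the EUR-Lex grid from 270 to 144 trials, assuming basic_variant with num_samples: 1 simply crosses every grid_search list (learning_rate 2 values, embed_dropout 3, encoder_dropout 5 -> 4, rnn_dim 3 -> 2, d_a 3). A quick check of that count:

from math import prod

before = {"learning_rate": 2, "embed_dropout": 3, "encoder_dropout": 5, "rnn_dim": 3, "d_a": 3}
after = {"learning_rate": 2, "embed_dropout": 3, "encoder_dropout": 4, "rnn_dim": 2, "d_a": 3}

print(prod(before.values()))  # 270 configurations before this patch
print(prod(after.values()))   # 144 configurations after this patch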
---
 example_config/EUR-Lex/laat_tune.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/example_config/EUR-Lex/laat_tune.yml b/example_config/EUR-Lex/laat_tune.yml
index dda705d4..0a490093 100644
--- a/example_config/EUR-Lex/laat_tune.yml
+++ b/example_config/EUR-Lex/laat_tune.yml
@@ -37,8 +37,8 @@ loss_function: binary_cross_entropy_with_logits
 init_weight: kaiming_uniform
 network_config:
   embed_dropout: ['grid_search', [0, 0.2, 0.4]] # 0.3
-  encoder_dropout: ['grid_search', [0, 0.2, 0.4, 0.6, 0.8]] # ['grid_search', [0, 0.2, 0.4]]
-  rnn_dim: ['grid_search', [1024, 512, 256]] # ['grid_search', [256, 512, 768, 1024]]
+  encoder_dropout: ['grid_search', [0, 0.2, 0.4, 0.8]] # ['grid_search', [0, 0.2, 0.4]]
+  rnn_dim: ['grid_search', [1024, 512]] # ['grid_search', [256, 512, 768, 1024]]
   num_layers: 1
   d_a: ['grid_search', [128, 256, 512]]
   freeze_embed: false

From 21338a82f203f52a30e22e8dd74988fc47785a82 Mon Sep 17 00:00:00 2001
From: Eleven Liu
Date: Thu, 13 Jul 2023 11:28:12 +0800
Subject: [PATCH 18/19] Update EUR-Lex/laat_tune.yml

---
 example_config/EUR-Lex/laat_tune.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/example_config/EUR-Lex/laat_tune.yml b/example_config/EUR-Lex/laat_tune.yml
index 3bdb9f23..ca628a63 100644
--- a/example_config/EUR-Lex/laat_tune.yml
+++ b/example_config/EUR-Lex/laat_tune.yml
@@ -36,11 +36,11 @@ model_name: LAAT
 loss_function: binary_cross_entropy_with_logits
 init_weight: kaiming_uniform
 network_config:
-  embed_dropout: ['grid_search', [0, 0.2, 0.4]] # 0.3
-  encoder_dropout: ['grid_search', [0, 0.2, 0.4, 0.6, 0.8]] # ['grid_search', [0, 0.2, 0.4]]
-  rnn_dim: ['grid_search', [256, 512, 1024]] # ['grid_search', [256, 512, 768, 1024]]
+  embed_dropout: ['grid_search', [0, 0.2, 0.4, 0.6, 0.8]]
+  encoder_dropout: ['grid_search', [0, 0.2, 0.4]]
+  rnn_dim: ['grid_search', [512, 768, 1024]]
   rnn_layers: 1
-  d_a: ['grid_search', [128, 256, 512]]
+  d_a: ['grid_search', [128, 256, 384, 512]] # add d_a = 128 in EUR-Lex
   freeze_embed: false

From 823ba8f3694905ae42435af707557f75d24638ff Mon Sep 17 00:00:00 2001
From: Eleven Liu
Date: Fri, 14 Jul 2023 09:26:02 +0800
Subject: [PATCH 19/19] Rename rnn_layers to num_layers in EUR-Lex/laat_tune.yml

---
 example_config/EUR-Lex/laat_tune.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/example_config/EUR-Lex/laat_tune.yml b/example_config/EUR-Lex/laat_tune.yml
index 50936953..20d0d551 100644
--- a/example_config/EUR-Lex/laat_tune.yml
+++ b/example_config/EUR-Lex/laat_tune.yml
@@ -39,7 +39,7 @@ network_config:
   embed_dropout: ['grid_search', [0, 0.2, 0.4, 0.6, 0.8]]
   encoder_dropout: ['grid_search', [0, 0.2, 0.4]]
   rnn_dim: ['grid_search', [512, 768, 1024]] # 256, 512, 1024
-  rnn_layers: 1
+  num_layers: 1
   d_a: ['grid_search', [128, 256, 384, 512]] # add d_a = 128 in EUR-Lex
   freeze_embed: false
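The rename matters because the network only understands num_layers, so a leftover rnn_layers key would not be silently ignored. A minimal sketch of the failure mode, assuming network_config is unpacked into the network constructor as keyword arguments (build_laat below is a placeholder, not the real LibMultiLabel class):

def build_laat(embed_dropout=0.2, encoder_dropout=0.0, rnn_dim=512,
               num_layers=1, d_a=256, freeze_embed=False):
    """Stand-in constructor; only the keyword names matter for this example."""
    return {"rnn_dim": rnn_dim, "num_layers": num_layers, "d_a": d_a}

config = {"embed_dropout": 0.2, "encoder_dropout": 0.0, "rnn_dim": 512,
          "num_layers": 1, "d_a": 256, "freeze_embed": False}
print(build_laat(**config))  # works with the key name used after this patch

stale = dict(config)
stale["rnn_layers"] = stale.pop("num_layers")  # the spelling left over from PATCH 18
try:
    build_laat(**stale)
except TypeError as err:
    print(err)  # unexpected keyword argument 'rnn_layers'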