diff --git a/multimolecule/models/conversion_utils.py b/multimolecule/models/conversion_utils.py
index 08c8a061..9e0269a7 100644
--- a/multimolecule/models/conversion_utils.py
+++ b/multimolecule/models/conversion_utils.py
@@ -41,7 +41,12 @@ def write_model(
     model.save_pretrained(output_path, safe_serialization=False)
     if tokenizer_config is None:
         tokenizer_config = get_tokenizer_config()
-    tokenizer_config["model_max_length"] = getattr(model.config, "max_position_embeddings", None)
+    if hasattr(model.config, "max_position_embeddings") and "model_max_length" not in tokenizer_config:
+        position_embedding_type = getattr(model.config, "position_embedding_type", None)
+        if position_embedding_type == "absolute":
+            tokenizer_config["model_max_length"] = model.config.max_position_embeddings
+        else:
+            tokenizer_config["model_max_length"] = None
     tokenizer = tokenizer_class_from_name(tokenizer_config["tokenizer_class"])(**tokenizer_config)
     tokenizer.save_pretrained(output_path)
 
diff --git a/multimolecule/models/ernierna/README.ernierna.ss.md b/multimolecule/models/ernierna/README.ernierna.ss.md
index 4f179683..0f566b38 100644
--- a/multimolecule/models/ernierna/README.ernierna.ss.md
+++ b/multimolecule/models/ernierna/README.ernierna.ss.md
@@ -61,7 +61,7 @@ ERNIE-RNA is a [bert](https://huggingface.co/google-bert/bert-base-uncased)-styl
 ### Variations
 
 - **[`multimolecule/ernierna`](https://huggingface.co/multimolecule/ernierna)**: The ERNIE-RNA model pre-trained on non-coding RNA sequences.
-- **[`multimolecule/ernierna.ss`](https://huggingface.co/multimolecule/ernierna.ss)**: The ERNIE-RNA model fine-tuned on RNA secondary structure prediction.
+- **[`multimolecule/ernierna-ss`](https://huggingface.co/multimolecule/ernierna-ss)**: The ERNIE-RNA model fine-tuned on RNA secondary structure prediction.
 
 ### Model Specification
 
@@ -93,7 +93,7 @@ You can use this model directly with a pipeline for masked language modeling:
 ```python
 >>> import multimolecule  # you must import multimolecule to register models
 >>> from transformers import pipeline
->>> unmasker = pipeline("fill-mask", model="multimolecule/ernierna.ss")
+>>> unmasker = pipeline("fill-mask", model="multimolecule/ernierna-ss")
 >>> unmasker("gguccucugguuagaccagaucugagccu")
 
 [{'score': 0.2066272348165512,
@@ -128,8 +128,8 @@ Here is how to use this model to get the features of a given sequence in PyTorch
 ```python
 from multimolecule import RnaTokenizer, ErnieRnaModel
 
-tokenizer = RnaTokenizer.from_pretrained("multimolecule/ernierna.ss")
-model = ErnieRnaModel.from_pretrained("multimolecule/ernierna.ss")
+tokenizer = RnaTokenizer.from_pretrained("multimolecule/ernierna-ss")
+model = ErnieRnaModel.from_pretrained("multimolecule/ernierna-ss")
 
 text = "UAGCUUAUCAGACUGAUGUUGA"
 input = tokenizer(text, return_tensors="pt")
@@ -148,8 +148,8 @@
 import torch
 from multimolecule import RnaTokenizer, ErnieRnaForSequencePrediction
 
-tokenizer = RnaTokenizer.from_pretrained("multimolecule/ernierna.ss")
-model = ErnieRnaForSequencePrediction.from_pretrained("multimolecule/ernierna.ss")
+tokenizer = RnaTokenizer.from_pretrained("multimolecule/ernierna-ss")
+model = ErnieRnaForSequencePrediction.from_pretrained("multimolecule/ernierna-ss")
 
 text = "UAGCUUAUCAGACUGAUGUUGA"
 input = tokenizer(text, return_tensors="pt")
@@ -169,8 +169,8 @@
 import torch
 from multimolecule import RnaTokenizer, ErnieRnaForTokenPrediction
 
-tokenizer = RnaTokenizer.from_pretrained("multimolecule/ernierna.ss")
-model = ErnieRnaForTokenPrediction.from_pretrained("multimolecule/ernierna.ss")
+tokenizer = RnaTokenizer.from_pretrained("multimolecule/ernierna-ss")
+model = ErnieRnaForTokenPrediction.from_pretrained("multimolecule/ernierna-ss")
 
 text = "UAGCUUAUCAGACUGAUGUUGA"
 input = tokenizer(text, return_tensors="pt")
@@ -190,8 +190,8 @@
 import torch
 from multimolecule import RnaTokenizer, ErnieRnaForContactPrediction
 
-tokenizer = RnaTokenizer.from_pretrained("multimolecule/ernierna.ss")
-model = ErnieRnaForContactPrediction.from_pretrained("multimolecule/ernierna.ss")
+tokenizer = RnaTokenizer.from_pretrained("multimolecule/ernierna-ss")
+model = ErnieRnaForContactPrediction.from_pretrained("multimolecule/ernierna-ss")
 
 text = "UAGCUUAUCAGACUGAUGUUGA"
 input = tokenizer(text, return_tensors="pt")
diff --git a/multimolecule/models/ernierna/README.md b/multimolecule/models/ernierna/README.md
index b1e2d400..4ed7700c 100644
--- a/multimolecule/models/ernierna/README.md
+++ b/multimolecule/models/ernierna/README.md
@@ -60,7 +60,7 @@ ERNIE-RNA is a [bert](https://huggingface.co/google-bert/bert-base-uncased)-styl
 ### Variations
 
 - **[`multimolecule/ernierna`](https://huggingface.co/multimolecule/ernierna)**: The ERNIE-RNA model pre-trained on non-coding RNA sequences.
-- **[`multimolecule/ernierna.ss`](https://huggingface.co/multimolecule/ernierna.ss)**: The ERNIE-RNA model fine-tuned on RNA secondary structure prediction.
+- **[`multimolecule/ernierna-ss`](https://huggingface.co/multimolecule/ernierna-ss)**: The ERNIE-RNA model fine-tuned on RNA secondary structure prediction.
 
 ### Model Specification
 
diff --git a/multimolecule/models/ernierna/convert_checkpoint.py b/multimolecule/models/ernierna/convert_checkpoint.py
index b047d3e3..04b3e078 100644
--- a/multimolecule/models/ernierna/convert_checkpoint.py
+++ b/multimolecule/models/ernierna/convert_checkpoint.py
@@ -23,7 +23,7 @@
 from multimolecule.models import ErnieRnaConfig as Config
 from multimolecule.models import ErnieRnaForContactClassification, ErnieRnaForPreTraining
 from multimolecule.models.conversion_utils import ConvertConfig as ConvertConfig_
-from multimolecule.models.conversion_utils import get_tokenizer_config, save_checkpoint
+from multimolecule.models.conversion_utils import save_checkpoint
 from multimolecule.tokenisers.rna.utils import convert_word_embeddings, get_alphabet
 
 torch.manual_seed(1016)
@@ -126,8 +126,7 @@ def convert_checkpoint(convert_config):
 
     model.load_state_dict(state_dict)
 
-    tokenizer_config = get_tokenizer_config()
-    save_checkpoint(convert_config, model, tokenizer_config=tokenizer_config)
+    save_checkpoint(convert_config, model)
 
 
 class ConvertConfig(ConvertConfig_):
diff --git a/multimolecule/models/splicebert/README.md b/multimolecule/models/splicebert/README.md
index 84fb1962..972610ea 100644
--- a/multimolecule/models/splicebert/README.md
+++ b/multimolecule/models/splicebert/README.md
@@ -60,8 +60,8 @@ SpliceBERT is a [bert](https://huggingface.co/google-bert/bert-base-uncased)-sty
 ### Variations
 
 - **[`multimolecule/splicebert`](https://huggingface.co/multimolecule/splicebert)**: The SpliceBERT model.
-- **[`multimolecule/splicebert.510nt`](https://huggingface.co/multimolecule/splicebert.510nt)**: The intermediate SpliceBERT model.
-- **[`multimolecule/splicebert-human.510nt`](https://huggingface.co/multimolecule/splicebert-human.510nt)**: The intermediate SpliceBERT model pre-trained on human data only.
+- **[`multimolecule/splicebert.510`](https://huggingface.co/multimolecule/splicebert.510)**: The intermediate SpliceBERT model.
+- **[`multimolecule/splicebert-human.510`](https://huggingface.co/multimolecule/splicebert-human.510)**: The intermediate SpliceBERT model pre-trained on human data only.
 
 ### Model Specification
 
@@ -92,12 +92,12 @@ SpliceBERT is a [bert](https://huggingface.co/google-bert/bert-base-uncased)-sty
         <td>1024</td>
     </tr>
     <tr>
-        <td>splicebert.510nt</td>
+        <td>splicebert.510</td>
         <td rowspan="2">19.45</td>
         <td rowspan="2">510</td>
     </tr>
     <tr>
-        <td>splicebert-human.510nt</td>
+        <td>splicebert-human.510</td>
     </tr>
 </tbody>
 </table>
@@ -270,9 +270,9 @@ SpliceBERT trained model in a two-stage training process:
 
 1. Pre-train with sequences of a fixed length of 510 nucleotides.
 2. Pre-train with sequences of a variable length between 64 and 1024 nucleotides.
 
-The intermediate model after the first stage is available as `multimolecule/splicebert.510nt`.
+The intermediate model after the first stage is available as `multimolecule/splicebert.510`.
 
-SpliceBERT also pre-trained a model on human data only to validate the contribution of multi-species pre-training. The intermediate model after the first stage is available as `multimolecule/splicebert-human.510nt`.
+SpliceBERT also pre-trained a model on human data only to validate the contribution of multi-species pre-training. The intermediate model after the first stage is available as `multimolecule/splicebert-human.510`.
 
 ## Citation
diff --git a/multimolecule/models/utrlm/README.md b/multimolecule/models/utrlm/README.md
index bc9cd00d..1e2d3e44 100644
--- a/multimolecule/models/utrlm/README.md
+++ b/multimolecule/models/utrlm/README.md
@@ -76,8 +76,8 @@ UTR-LM is a [bert](https://huggingface.co/google-bert/bert-base-uncased)-style m
 
 ### Variations
 
-- **[`multimolecule/utrlm.te_el`](https://huggingface.co/multimolecule/utrlm.te_el)**: The UTR-LM model for Translation Efficiency of transcripts and mRNA Expression Level.
-- **[`multimolecule/utrlm.mrl`](https://huggingface.co/multimolecule/utrlm.mrl)**: The UTR-LM model for Mean Ribosome Loading.
+- **[`multimolecule/utrlm-te_el`](https://huggingface.co/multimolecule/utrlm-te_el)**: The UTR-LM model for Translation Efficiency of transcripts and mRNA Expression Level.
+- **[`multimolecule/utrlm-mrl`](https://huggingface.co/multimolecule/utrlm-mrl)**: The UTR-LM model for Mean Ribosome Loading.
 
 ### Model Specification
 
@@ -140,7 +140,7 @@ You can use this model directly with a pipeline for masked language modeling:
 ```python
 >>> import multimolecule  # you must import multimolecule to register models
 >>> from transformers import pipeline
->>> unmasker = pipeline("fill-mask", model="multimolecule/utrlm.te_el")
+>>> unmasker = pipeline("fill-mask", model="multimolecule/utrlm-te_el")
 >>> unmasker("gguccucugguuagaccagaucugagccu")
 
 [{'score': 0.07707168161869049,
@@ -175,8 +175,8 @@ Here is how to use this model to get the features of a given sequence in PyTorch
 ```python
 from multimolecule import RnaTokenizer, UtrLmModel
 
-tokenizer = RnaTokenizer.from_pretrained("multimolecule/utrlm.te_el")
-model = UtrLmModel.from_pretrained("multimolecule/utrlm.te_el")
+tokenizer = RnaTokenizer.from_pretrained("multimolecule/utrlm-te_el")
+model = UtrLmModel.from_pretrained("multimolecule/utrlm-te_el")
 
 text = "UAGCUUAUCAGACUGAUGUUGA"
 input = tokenizer(text, return_tensors="pt")
@@ -195,8 +195,8 @@
 import torch
 from multimolecule import RnaTokenizer, UtrLmForSequencePrediction
 
-tokenizer = RnaTokenizer.from_pretrained("multimolecule/utrlm.te_el")
-model = UtrLmForSequencePrediction.from_pretrained("multimolecule/utrlm.te_el")
+tokenizer = RnaTokenizer.from_pretrained("multimolecule/utrlm-te_el")
+model = UtrLmForSequencePrediction.from_pretrained("multimolecule/utrlm-te_el")
 
 text = "UAGCUUAUCAGACUGAUGUUGA"
 input = tokenizer(text, return_tensors="pt")
@@ -216,8 +216,8 @@
 import torch
 from multimolecule import RnaTokenizer, UtrLmForTokenPrediction
 
-tokenizer = RnaTokenizer.from_pretrained("multimolecule/utrlm.te_el")
-model = UtrLmForTokenPrediction.from_pretrained("multimolecule/utrlm.te_el")
+tokenizer = RnaTokenizer.from_pretrained("multimolecule/utrlm-te_el")
+model = UtrLmForTokenPrediction.from_pretrained("multimolecule/utrlm-te_el")
 
 text = "UAGCUUAUCAGACUGAUGUUGA"
 input = tokenizer(text, return_tensors="pt")
@@ -237,8 +237,8 @@
 import torch
 from multimolecule import RnaTokenizer, UtrLmForContactPrediction
 
-tokenizer = RnaTokenizer.from_pretrained("multimolecule/utrlm.te_el")
-model = UtrLmForContactPrediction.from_pretrained("multimolecule/utrlm.te_el")
+tokenizer = RnaTokenizer.from_pretrained("multimolecule/utrlm-te_el")
+model = UtrLmForContactPrediction.from_pretrained("multimolecule/utrlm-te_el")
 
 text = "UAGCUUAUCAGACUGAUGUUGA"
 input = tokenizer(text, return_tensors="pt")
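
Reviewer note (not part of the patch): the conversion_utils.py hunk changes how `write_model` derives the tokenizer's `model_max_length`. Below is a minimal, self-contained sketch of that policy; the standalone `resolve_model_max_length` helper and the example config values are hypothetical, introduced here for illustration only, while the patched code reads the same fields from `model.config`.

```python
# Hypothetical sketch of the model_max_length policy introduced in write_model:
# only absolute position embeddings impose a hard sequence-length limit, so
# other schemes (rotary, relative, none) leave model_max_length unset (None).


def resolve_model_max_length(config: dict, tokenizer_config: dict) -> dict:
    # An explicit model_max_length already present in the tokenizer config wins.
    if "max_position_embeddings" in config and "model_max_length" not in tokenizer_config:
        if config.get("position_embedding_type") == "absolute":
            tokenizer_config["model_max_length"] = config["max_position_embeddings"]
        else:
            tokenizer_config["model_max_length"] = None
    return tokenizer_config


# Example values are made up for illustration.
print(resolve_model_max_length({"max_position_embeddings": 1026, "position_embedding_type": "absolute"}, {}))
# -> {'model_max_length': 1026}
print(resolve_model_max_length({"max_position_embeddings": 1026, "position_embedding_type": "rotary"}, {}))
# -> {'model_max_length': None}
```

This is also why the convert_checkpoint.py hunk drops the hand-built `tokenizer_config`: `write_model` falls back to `get_tokenizer_config()` when none is passed, and the rule above now fills in `model_max_length`.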