minor fixes
update model names to standard format
fix model_max_length incorrectly set for models using relative position embeddings
fix typo
ZhiyuanChen committed Dec 11, 2024
1 parent 9a365d0 commit 43e847e
Showing 9 changed files with 42 additions and 38 deletions.
7 changes: 6 additions & 1 deletion multimolecule/models/conversion_utils.py
@@ -41,7 +41,12 @@ def write_model(
     model.save_pretrained(output_path, safe_serialization=False)
     if tokenizer_config is None:
         tokenizer_config = get_tokenizer_config()
-    tokenizer_config["model_max_length"] = getattr(model.config, "max_position_embeddings", None)
+    if hasattr(model.config, "max_position_embeddings") and "model_max_length" not in tokenizer_config:
+        position_embedding_type = getattr(model.config, "position_embedding_type", None)
+        if position_embedding_type == "absolute":
+            tokenizer_config["model_max_length"] = model.config.max_position_embeddings
+        else:
+            tokenizer_config["model_max_length"] = None
     tokenizer = tokenizer_class_from_name(tokenizer_config["tokenizer_class"])(**tokenizer_config)
     tokenizer.save_pretrained(output_path)

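Rationale for the new guard: a model with absolute position embeddings has a hard input ceiling of `max_position_embeddings`, while relative position embeddings impose no comparable limit, so the tokenizer should not advertise a fixed `model_max_length` for them. Below is a minimal standalone sketch of the same logic; the `SimpleNamespace` configs are stand-ins for real Transformers-style configs, not part of the repository:

```python
from types import SimpleNamespace


def infer_model_max_length(config, tokenizer_config: dict) -> dict:
    """Cap model_max_length only when position embeddings are absolute."""
    if hasattr(config, "max_position_embeddings") and "model_max_length" not in tokenizer_config:
        if getattr(config, "position_embedding_type", None) == "absolute":
            tokenizer_config["model_max_length"] = config.max_position_embeddings
        else:
            # Relative PE (e.g. rotary): no hard length limit to advertise.
            tokenizer_config["model_max_length"] = None
    return tokenizer_config


absolute = SimpleNamespace(max_position_embeddings=512, position_embedding_type="absolute")
rotary = SimpleNamespace(max_position_embeddings=512, position_embedding_type="rotary")
print(infer_model_max_length(absolute, {}))  # {'model_max_length': 512}
print(infer_model_max_length(rotary, {}))    # {'model_max_length': None}
```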
20 changes: 10 additions & 10 deletions multimolecule/models/ernierna/README.ernierna.ss.md
@@ -61,7 +61,7 @@ ERNIE-RNA is a [bert](https://huggingface.co/google-bert/bert-base-uncased)-styl
 ### Variations

 - **[`multimolecule/ernierna`](https://huggingface.co/multimolecule/ernierna)**: The ERNIE-RNA model pre-trained on non-coding RNA sequences.
-- **[`multimolecule/ernierna.ss`](https://huggingface.co/multimolecule/ernierna.ss)**: The ERNIE-RNA model fine-tuned on RNA secondary structure prediction.
+- **[`multimolecule/ernierna-ss`](https://huggingface.co/multimolecule/ernierna-ss)**: The ERNIE-RNA model fine-tuned on RNA secondary structure prediction.

 ### Model Specification

@@ -93,7 +93,7 @@ You can use this model directly with a pipeline for masked language modeling:
 ```python
 >>> import multimolecule # you must import multimolecule to register models
 >>> from transformers import pipeline
->>> unmasker = pipeline("fill-mask", model="multimolecule/ernierna.ss")
+>>> unmasker = pipeline("fill-mask", model="multimolecule/ernierna-ss")
 >>> unmasker("gguc<mask>cucugguuagaccagaucugagccu")

 [{'score': 0.2066272348165512,
@@ -128,8 +128,8 @@ Here is how to use this model to get the features of a given sequence in PyTorch
 from multimolecule import RnaTokenizer, ErnieRnaModel


-tokenizer = RnaTokenizer.from_pretrained("multimolecule/ernierna.ss")
-model = ErnieRnaModel.from_pretrained("multimolecule/ernierna.ss")
+tokenizer = RnaTokenizer.from_pretrained("multimolecule/ernierna-ss")
+model = ErnieRnaModel.from_pretrained("multimolecule/ernierna-ss")

 text = "UAGCUUAUCAGACUGAUGUUGA"
 input = tokenizer(text, return_tensors="pt")
@@ -148,8 +148,8 @@ import torch
 from multimolecule import RnaTokenizer, ErnieRnaForSequencePrediction


-tokenizer = RnaTokenizer.from_pretrained("multimolecule/ernierna.ss")
-model = ErnieRnaForSequencePrediction.from_pretrained("multimolecule/ernierna.ss")
+tokenizer = RnaTokenizer.from_pretrained("multimolecule/ernierna-ss")
+model = ErnieRnaForSequencePrediction.from_pretrained("multimolecule/ernierna-ss")

 text = "UAGCUUAUCAGACUGAUGUUGA"
 input = tokenizer(text, return_tensors="pt")
@@ -169,8 +169,8 @@ import torch
 from multimolecule import RnaTokenizer, ErnieRnaForTokenPrediction


-tokenizer = RnaTokenizer.from_pretrained("multimolecule/ernierna.ss")
-model = ErnieRnaForTokenPrediction.from_pretrained("multimolecule/ernierna.ss")
+tokenizer = RnaTokenizer.from_pretrained("multimolecule/ernierna-ss")
+model = ErnieRnaForTokenPrediction.from_pretrained("multimolecule/ernierna-ss")

 text = "UAGCUUAUCAGACUGAUGUUGA"
 input = tokenizer(text, return_tensors="pt")
@@ -190,8 +190,8 @@ import torch
 from multimolecule import RnaTokenizer, ErnieRnaForContactPrediction


-tokenizer = RnaTokenizer.from_pretrained("multimolecule/ernierna.ss")
-model = ErnieRnaForContactPrediction.from_pretrained("multimolecule/ernierna.ss")
+tokenizer = RnaTokenizer.from_pretrained("multimolecule/ernierna-ss")
+model = ErnieRnaForContactPrediction.from_pretrained("multimolecule/ernierna-ss")

 text = "UAGCUUAUCAGACUGAUGUUGA"
 input = tokenizer(text, return_tensors="pt")
2 changes: 1 addition & 1 deletion multimolecule/models/ernierna/README.md
@@ -60,7 +60,7 @@ ERNIE-RNA is a [bert](https://huggingface.co/google-bert/bert-base-uncased)-styl
 ### Variations

 - **[`multimolecule/ernierna`](https://huggingface.co/multimolecule/ernierna)**: The ERNIE-RNA model pre-trained on non-coding RNA sequences.
-- **[`multimolecule/ernierna.ss`](https://huggingface.co/multimolecule/ernierna.ss)**: The ERNIE-RNA model fine-tuned on RNA secondary structure prediction.
+- **[`multimolecule/ernierna-ss`](https://huggingface.co/multimolecule/ernierna-ss)**: The ERNIE-RNA model fine-tuned on RNA secondary structure prediction.

 ### Model Specification

5 changes: 2 additions & 3 deletions multimolecule/models/ernierna/convert_checkpoint.py
@@ -23,7 +23,7 @@
 from multimolecule.models import ErnieRnaConfig as Config
 from multimolecule.models import ErnieRnaForContactClassification, ErnieRnaForPreTraining
 from multimolecule.models.conversion_utils import ConvertConfig as ConvertConfig_
-from multimolecule.models.conversion_utils import get_tokenizer_config, save_checkpoint
+from multimolecule.models.conversion_utils import save_checkpoint
 from multimolecule.tokenisers.rna.utils import convert_word_embeddings, get_alphabet

 torch.manual_seed(1016)
@@ -126,8 +126,7 @@ def convert_checkpoint(convert_config):

     model.load_state_dict(state_dict)

-    tokenizer_config = get_tokenizer_config()
-    save_checkpoint(convert_config, model, tokenizer_config=tokenizer_config)
+    save_checkpoint(convert_config, model)


 class ConvertConfig(ConvertConfig_):
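The call site can drop the explicit config because `write_model` (first hunk above) already falls back to `get_tokenizer_config()` when `tokenizer_config` is `None`. A minimal sketch of that fallback pattern, with stub functions standing in for the repository's real helpers:

```python
def get_tokenizer_config() -> dict:
    # Stub standing in for multimolecule's real helper.
    return {"tokenizer_class": "RnaTokenizer"}


def save_checkpoint(model, tokenizer_config: dict | None = None) -> dict:
    # When the caller omits the config, build the default one,
    # mirroring the fallback in conversion_utils.write_model.
    if tokenizer_config is None:
        tokenizer_config = get_tokenizer_config()
    return tokenizer_config


print(save_checkpoint(model=None))  # {'tokenizer_class': 'RnaTokenizer'}
```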
12 changes: 6 additions & 6 deletions multimolecule/models/splicebert/README.md
@@ -60,8 +60,8 @@ SpliceBERT is a [bert](https://huggingface.co/google-bert/bert-base-uncased)-sty
 ### Variations

 - **[`multimolecule/splicebert`](https://huggingface.co/multimolecule/splicebert)**: The SpliceBERT model.
-- **[`multimolecule/splicebert.510nt`](https://huggingface.co/multimolecule/splicebert.510nt)**: The intermediate SpliceBERT model.
-- **[`multimolecule/splicebert-human.510nt`](https://huggingface.co/multimolecule/splicebert-human.510nt)**: The intermediate SpliceBERT model pre-trained on human data only.
+- **[`multimolecule/splicebert.510`](https://huggingface.co/multimolecule/splicebert.510)**: The intermediate SpliceBERT model.
+- **[`multimolecule/splicebert-human.510`](https://huggingface.co/multimolecule/splicebert-human.510)**: The intermediate SpliceBERT model pre-trained on human data only.

 ### Model Specification

@@ -92,12 +92,12 @@ SpliceBERT is a [bert](https://huggingface.co/google-bert/bert-base-uncased)-styl
 <td>1024</td>
 </tr>
 <tr>
-<td>splicebert.510nt</td>
+<td>splicebert.510</td>
 <td rowspan="2">19.45</td>
 <td rowspan="2">510</td>
 </tr>
 <tr>
-<td>splicebert-human.510nt</td>
+<td>splicebert-human.510</td>
 </tr>
 </tbody>
 </table>
@@ -270,9 +270,9 @@ SpliceBERT trained model in a two-stage training process:
 1. Pre-train with sequences of a fixed length of 510 nucleotides.
 2. Pre-train with sequences of a variable length between 64 and 1024 nucleotides.

-The intermediate model after the first stage is available as `multimolecule/splicebert.510nt`.
+The intermediate model after the first stage is available as `multimolecule/splicebert.510`.

-SpliceBERT also pre-trained a model on human data only to validate the contribution of multi-species pre-training. The intermediate model after the first stage is available as `multimolecule/splicebert-human.510nt`.
+SpliceBERT also pre-trained a model on human data only to validate the contribution of multi-species pre-training. The intermediate model after the first stage is available as `multimolecule/splicebert-human.510`.

 ## Citation

22 changes: 11 additions & 11 deletions multimolecule/models/utrlm/README.md
@@ -76,8 +76,8 @@ UTR-LM is a [bert](https://huggingface.co/google-bert/bert-base-uncased)-style m

 ### Variations

-- **[`multimolecule/utrlm.te_el`](https://huggingface.co/multimolecule/utrlm.te_el)**: The UTR-LM model for Translation Efficiency of transcripts and mRNA Expression Level.
-- **[`multimolecule/utrlm.mrl`](https://huggingface.co/multimolecule/utrlm.mrl)**: The UTR-LM model for Mean Ribosome Loading.
+- **[`multimolecule/utrlm-te_el`](https://huggingface.co/multimolecule/utrlm-te_el)**: The UTR-LM model for Translation Efficiency of transcripts and mRNA Expression Level.
+- **[`multimolecule/utrlm-mrl`](https://huggingface.co/multimolecule/utrlm-mrl)**: The UTR-LM model for Mean Ribosome Loading.

 ### Model Specification

@@ -140,7 +140,7 @@ You can use this model directly with a pipeline for masked language modeling:
 ```python
 >>> import multimolecule # you must import multimolecule to register models
 >>> from transformers import pipeline
->>> unmasker = pipeline("fill-mask", model="multimolecule/utrlm.te_el")
+>>> unmasker = pipeline("fill-mask", model="multimolecule/utrlm-te_el")
 >>> unmasker("gguc<mask>cucugguuagaccagaucugagccu")

 [{'score': 0.07707168161869049,
@@ -175,8 +175,8 @@ Here is how to use this model to get the features of a given sequence in PyTorch
 from multimolecule import RnaTokenizer, UtrLmModel


-tokenizer = RnaTokenizer.from_pretrained("multimolecule/utrlm.te_el")
-model = UtrLmModel.from_pretrained("multimolecule/utrlm.te_el")
+tokenizer = RnaTokenizer.from_pretrained("multimolecule/utrlm-te_el")
+model = UtrLmModel.from_pretrained("multimolecule/utrlm-te_el")

 text = "UAGCUUAUCAGACUGAUGUUGA"
 input = tokenizer(text, return_tensors="pt")
@@ -195,8 +195,8 @@ import torch
 from multimolecule import RnaTokenizer, UtrLmForSequencePrediction


-tokenizer = RnaTokenizer.from_pretrained("multimolecule/utrlm.te_el")
-model = UtrLmForSequencePrediction.from_pretrained("multimolecule/utrlm.te_el")
+tokenizer = RnaTokenizer.from_pretrained("multimolecule/utrlm-te_el")
+model = UtrLmForSequencePrediction.from_pretrained("multimolecule/utrlm-te_el")

 text = "UAGCUUAUCAGACUGAUGUUGA"
 input = tokenizer(text, return_tensors="pt")
@@ -216,8 +216,8 @@ import torch
 from multimolecule import RnaTokenizer, UtrLmForTokenPrediction


-tokenizer = RnaTokenizer.from_pretrained("multimolecule/utrlm.te_el")
-model = UtrLmForTokenPrediction.from_pretrained("multimolecule/utrlm.te_el")
+tokenizer = RnaTokenizer.from_pretrained("multimolecule/utrlm-te_el")
+model = UtrLmForTokenPrediction.from_pretrained("multimolecule/utrlm-te_el")

 text = "UAGCUUAUCAGACUGAUGUUGA"
 input = tokenizer(text, return_tensors="pt")
@@ -237,8 +237,8 @@ import torch
 from multimolecule import RnaTokenizer, UtrLmForContactPrediction


-tokenizer = RnaTokenizer.from_pretrained("multimolecule/utrlm.te_el")
-model = UtrLmForContactPrediction.from_pretrained("multimolecule/utrlm.te_el")
+tokenizer = RnaTokenizer.from_pretrained("multimolecule/utrlm-te_el")
+model = UtrLmForContactPrediction.from_pretrained("multimolecule/utrlm-te_el")

 text = "UAGCUUAUCAGACUGAUGUUGA"
 input = tokenizer(text, return_tensors="pt")
4 changes: 2 additions & 2 deletions multimolecule/tokenisers/dna/tokenization_dna.py
@@ -98,14 +98,14 @@ def __init__(
         )
         self.replace_U_with_T = replace_U_with_T
         self.nmers = nmers
-        self.condon = codon
+        self.codon = codon

     def _tokenize(self, text: str, **kwargs):
         if self.do_upper_case:
             text = text.upper()
         if self.replace_U_with_T:
             text = text.replace("U", "T")
-        if self.condon:
+        if self.codon:
             if len(text) % 3 != 0:
                 raise ValueError(
                     f"length of input sequence must be a multiple of 3 for codon tokenization, but got {len(text)}"
4 changes: 2 additions & 2 deletions multimolecule/tokenisers/dot_bracket/tokenization_db.py
@@ -88,10 +88,10 @@ def __init__(
             **kwargs,
         )
         self.nmers = nmers
-        self.condon = codon
+        self.codon = codon

     def _tokenize(self, text: str, **kwargs):
-        if self.condon:
+        if self.codon:
             if len(text) % 3 != 0:
                 raise ValueError(
                     f"length of input sequence must be a multiple of 3 for codon tokenization, but got {len(text)}"
4 changes: 2 additions & 2 deletions multimolecule/tokenisers/rna/tokenization_rna.py
@@ -98,14 +98,14 @@ def __init__(
         )
         self.replace_T_with_U = replace_T_with_U
         self.nmers = nmers
-        self.condon = codon
+        self.codon = codon

     def _tokenize(self, text: str, **kwargs):
         if self.do_upper_case:
             text = text.upper()
         if self.replace_T_with_U:
             text = text.replace("T", "U")
-        if self.condon:
+        if self.codon:
             if len(text) % 3 != 0:
                 raise ValueError(
                     f"length of input sequence must be a multiple of 3 for codon tokenization, but got {len(text)}"
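The renamed attribute gates codon-mode tokenization: the input length must be a multiple of 3, and the sequence is then split into non-overlapping 3-mers. A minimal self-contained sketch of that behaviour; the `split_codons` helper is illustrative, not the repository's API:

```python
def split_codons(text: str) -> list[str]:
    """Split a nucleotide sequence into non-overlapping codons (3-mers)."""
    if len(text) % 3 != 0:
        raise ValueError(
            f"length of input sequence must be a multiple of 3 for codon tokenization, but got {len(text)}"
        )
    return [text[i : i + 3] for i in range(0, len(text), 3)]


print(split_codons("UAGCUUAUC"))  # ['UAG', 'CUU', 'AUC']
```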

