minor fixes
update model names to standard format
fix model_max_length incorrectly set for models using relative position embeddings
fix typo
ZhiyuanChen committed Dec 11, 2024
1 parent 9a365d0 commit 43e847e
Showing 9 changed files with 42 additions and 38 deletions.
7 changes: 6 additions & 1 deletion multimolecule/models/conversion_utils.py
@@ -41,7 +41,12 @@ def write_model(
     model.save_pretrained(output_path, safe_serialization=False)
     if tokenizer_config is None:
         tokenizer_config = get_tokenizer_config()
-    tokenizer_config["model_max_length"] = getattr(model.config, "max_position_embeddings", None)
+    if hasattr(model.config, "max_position_embeddings") and "model_max_length" not in tokenizer_config:
+        position_embedding_type = getattr(model.config, "position_embedding_type", None)
+        if position_embedding_type == "absolute":
+            tokenizer_config["model_max_length"] = model.config.max_position_embeddings
+        else:
+            tokenizer_config["model_max_length"] = None
     tokenizer = tokenizer_class_from_name(tokenizer_config["tokenizer_class"])(**tokenizer_config)
     tokenizer.save_pretrained(output_path)

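Rationale for the new guard: a model with absolute position embeddings has a hard input ceiling of `max_position_embeddings`, while relative position embeddings impose no comparable limit, so the tokenizer should not advertise a fixed `model_max_length` for them. Below is a minimal standalone sketch of the same logic; the `SimpleNamespace` configs are stand-ins for real Transformers-style configs, not part of the repository:

```python
from types import SimpleNamespace


def infer_model_max_length(config, tokenizer_config: dict) -> dict:
    """Cap model_max_length only when position embeddings are absolute."""
    if hasattr(config, "max_position_embeddings") and "model_max_length" not in tokenizer_config:
        if getattr(config, "position_embedding_type", None) == "absolute":
            tokenizer_config["model_max_length"] = config.max_position_embeddings
        else:
            # Relative PE (e.g. rotary): no hard length limit to advertise.
            tokenizer_config["model_max_length"] = None
    return tokenizer_config


absolute = SimpleNamespace(max_position_embeddings=512, position_embedding_type="absolute")
rotary = SimpleNamespace(max_position_embeddings=512, position_embedding_type="rotary")
print(infer_model_max_length(absolute, {}))  # {'model_max_length': 512}
print(infer_model_max_length(rotary, {}))    # {'model_max_length': None}
```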
20 changes: 10 additions & 10 deletions multimolecule/models/ernierna/README.ernierna.ss.md
@@ -61,7 +61,7 @@ ERNIE-RNA is a [bert](https://huggingface.co/google-bert/bert-base-uncased)-styl
 ### Variations

 - **[`multimolecule/ernierna`](https://huggingface.co/multimolecule/ernierna)**: The ERNIE-RNA model pre-trained on non-coding RNA sequences.
-- **[`multimolecule/ernierna.ss`](https://huggingface.co/multimolecule/ernierna.ss)**: The ERNIE-RNA model fine-tuned on RNA secondary structure prediction.
+- **[`multimolecule/ernierna-ss`](https://huggingface.co/multimolecule/ernierna-ss)**: The ERNIE-RNA model fine-tuned on RNA secondary structure prediction.

 ### Model Specification

@@ -93,7 +93,7 @@ You can use this model directly with a pipeline for masked language modeling:
 ```python
 >>> import multimolecule # you must import multimolecule to register models
 >>> from transformers import pipeline
->>> unmasker = pipeline("fill-mask", model="multimolecule/ernierna.ss")
+>>> unmasker = pipeline("fill-mask", model="multimolecule/ernierna-ss")
 >>> unmasker("gguc<mask>cucugguuagaccagaucugagccu")

 [{'score': 0.2066272348165512,
@@ -128,8 +128,8 @@ Here is how to use this model to get the features of a given sequence in PyTorch
 from multimolecule import RnaTokenizer, ErnieRnaModel


-tokenizer = RnaTokenizer.from_pretrained("multimolecule/ernierna.ss")
-model = ErnieRnaModel.from_pretrained("multimolecule/ernierna.ss")
+tokenizer = RnaTokenizer.from_pretrained("multimolecule/ernierna-ss")
+model = ErnieRnaModel.from_pretrained("multimolecule/ernierna-ss")

 text = "UAGCUUAUCAGACUGAUGUUGA"
 input = tokenizer(text, return_tensors="pt")
@@ -148,8 +148,8 @@ import torch
 from multimolecule import RnaTokenizer, ErnieRnaForSequencePrediction


-tokenizer = RnaTokenizer.from_pretrained("multimolecule/ernierna.ss")
-model = ErnieRnaForSequencePrediction.from_pretrained("multimolecule/ernierna.ss")
+tokenizer = RnaTokenizer.from_pretrained("multimolecule/ernierna-ss")
+model = ErnieRnaForSequencePrediction.from_pretrained("multimolecule/ernierna-ss")

 text = "UAGCUUAUCAGACUGAUGUUGA"
 input = tokenizer(text, return_tensors="pt")
@@ -169,8 +169,8 @@ import torch
 from multimolecule import RnaTokenizer, ErnieRnaForTokenPrediction


-tokenizer = RnaTokenizer.from_pretrained("multimolecule/ernierna.ss")
-model = ErnieRnaForTokenPrediction.from_pretrained("multimolecule/ernierna.ss")
+tokenizer = RnaTokenizer.from_pretrained("multimolecule/ernierna-ss")
+model = ErnieRnaForTokenPrediction.from_pretrained("multimolecule/ernierna-ss")

 text = "UAGCUUAUCAGACUGAUGUUGA"
 input = tokenizer(text, return_tensors="pt")
@@ -190,8 +190,8 @@ import torch
 from multimolecule import RnaTokenizer, ErnieRnaForContactPrediction


-tokenizer = RnaTokenizer.from_pretrained("multimolecule/ernierna.ss")
-model = ErnieRnaForContactPrediction.from_pretrained("multimolecule/ernierna.ss")
+tokenizer = RnaTokenizer.from_pretrained("multimolecule/ernierna-ss")
+model = ErnieRnaForContactPrediction.from_pretrained("multimolecule/ernierna-ss")

 text = "UAGCUUAUCAGACUGAUGUUGA"
 input = tokenizer(text, return_tensors="pt")
2 changes: 1 addition & 1 deletion multimolecule/models/ernierna/README.md
@@ -60,7 +60,7 @@ ERNIE-RNA is a [bert](https://huggingface.co/google-bert/bert-base-uncased)-styl
 ### Variations

 - **[`multimolecule/ernierna`](https://huggingface.co/multimolecule/ernierna)**: The ERNIE-RNA model pre-trained on non-coding RNA sequences.
-- **[`multimolecule/ernierna.ss`](https://huggingface.co/multimolecule/ernierna.ss)**: The ERNIE-RNA model fine-tuned on RNA secondary structure prediction.
+- **[`multimolecule/ernierna-ss`](https://huggingface.co/multimolecule/ernierna-ss)**: The ERNIE-RNA model fine-tuned on RNA secondary structure prediction.

 ### Model Specification

5 changes: 2 additions & 3 deletions multimolecule/models/ernierna/convert_checkpoint.py
@@ -23,7 +23,7 @@
 from multimolecule.models import ErnieRnaConfig as Config
 from multimolecule.models import ErnieRnaForContactClassification, ErnieRnaForPreTraining
 from multimolecule.models.conversion_utils import ConvertConfig as ConvertConfig_
-from multimolecule.models.conversion_utils import get_tokenizer_config, save_checkpoint
+from multimolecule.models.conversion_utils import save_checkpoint
 from multimolecule.tokenisers.rna.utils import convert_word_embeddings, get_alphabet

 torch.manual_seed(1016)
@@ -126,8 +126,7 @@ def convert_checkpoint(convert_config):

     model.load_state_dict(state_dict)

-    tokenizer_config = get_tokenizer_config()
-    save_checkpoint(convert_config, model, tokenizer_config=tokenizer_config)
+    save_checkpoint(convert_config, model)


 class ConvertConfig(ConvertConfig_):
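The call site can drop the explicit config because `write_model` (first hunk above) already falls back to `get_tokenizer_config()` when `tokenizer_config` is `None`. A minimal sketch of that fallback pattern, with stub functions standing in for the repository's real helpers:

```python
def get_tokenizer_config() -> dict:
    # Stub standing in for multimolecule's real helper.
    return {"tokenizer_class": "RnaTokenizer"}


def save_checkpoint(model, tokenizer_config: dict | None = None) -> dict:
    # When the caller omits the config, build the default one,
    # mirroring the fallback in conversion_utils.write_model.
    if tokenizer_config is None:
        tokenizer_config = get_tokenizer_config()
    return tokenizer_config


print(save_checkpoint(model=None))  # {'tokenizer_class': 'RnaTokenizer'}
```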
12 changes: 6 additions & 6 deletions multimolecule/models/splicebert/README.md
@@ -60,8 +60,8 @@ SpliceBERT is a [bert](https://huggingface.co/google-bert/bert-base-uncased)-sty
 ### Variations

 - **[`multimolecule/splicebert`](https://huggingface.co/multimolecule/splicebert)**: The SpliceBERT model.
-- **[`multimolecule/splicebert.510nt`](https://huggingface.co/multimolecule/splicebert.510nt)**: The intermediate SpliceBERT model.
-- **[`multimolecule/splicebert-human.510nt`](https://huggingface.co/multimolecule/splicebert-human.510nt)**: The intermediate SpliceBERT model pre-trained on human data only.
+- **[`multimolecule/splicebert.510`](https://huggingface.co/multimolecule/splicebert.510)**: The intermediate SpliceBERT model.
+- **[`multimolecule/splicebert-human.510`](https://huggingface.co/multimolecule/splicebert-human.510)**: The intermediate SpliceBERT model pre-trained on human data only.

 ### Model Specification

@@ -92,12 +92,12 @@ SpliceBERT is a [bert](https://huggingface.co/google-bert/bert-base-uncased)-styl
 <td>1024</td>
 </tr>
 <tr>
-<td>splicebert.510nt</td>
+<td>splicebert.510</td>
 <td rowspan="2">19.45</td>
 <td rowspan="2">510</td>
 </tr>
 <tr>
-<td>splicebert-human.510nt</td>
+<td>splicebert-human.510</td>
 </tr>
 </tbody>
 </table>
@@ -270,9 +270,9 @@ SpliceBERT trained model in a two-stage training process:
 1. Pre-train with sequences of a fixed length of 510 nucleotides.
 2. Pre-train with sequences of a variable length between 64 and 1024 nucleotides.

-The intermediate model after the first stage is available as `multimolecule/splicebert.510nt`.
+The intermediate model after the first stage is available as `multimolecule/splicebert.510`.

-SpliceBERT also pre-trained a model on human data only to validate the contribution of multi-species pre-training. The intermediate model after the first stage is available as `multimolecule/splicebert-human.510nt`.
+SpliceBERT also pre-trained a model on human data only to validate the contribution of multi-species pre-training. The intermediate model after the first stage is available as `multimolecule/splicebert-human.510`.

 ## Citation

22 changes: 11 additions & 11 deletions multimolecule/models/utrlm/README.md
@@ -76,8 +76,8 @@ UTR-LM is a [bert](https://huggingface.co/google-bert/bert-base-uncased)-style m

 ### Variations

-- **[`multimolecule/utrlm.te_el`](https://huggingface.co/multimolecule/utrlm.te_el)**: The UTR-LM model for Translation Efficiency of transcripts and mRNA Expression Level.
-- **[`multimolecule/utrlm.mrl`](https://huggingface.co/multimolecule/utrlm.mrl)**: The UTR-LM model for Mean Ribosome Loading.
+- **[`multimolecule/utrlm-te_el`](https://huggingface.co/multimolecule/utrlm-te_el)**: The UTR-LM model for Translation Efficiency of transcripts and mRNA Expression Level.
+- **[`multimolecule/utrlm-mrl`](https://huggingface.co/multimolecule/utrlm-mrl)**: The UTR-LM model for Mean Ribosome Loading.

 ### Model Specification

@@ -140,7 +140,7 @@ You can use this model directly with a pipeline for masked language modeling:
 ```python
 >>> import multimolecule # you must import multimolecule to register models
 >>> from transformers import pipeline
->>> unmasker = pipeline("fill-mask", model="multimolecule/utrlm.te_el")
+>>> unmasker = pipeline("fill-mask", model="multimolecule/utrlm-te_el")
 >>> unmasker("gguc<mask>cucugguuagaccagaucugagccu")

 [{'score': 0.07707168161869049,
@@ -175,8 +175,8 @@ Here is how to use this model to get the features of a given sequence in PyTorch
 from multimolecule import RnaTokenizer, UtrLmModel


-tokenizer = RnaTokenizer.from_pretrained("multimolecule/utrlm.te_el")
-model = UtrLmModel.from_pretrained("multimolecule/utrlm.te_el")
+tokenizer = RnaTokenizer.from_pretrained("multimolecule/utrlm-te_el")
+model = UtrLmModel.from_pretrained("multimolecule/utrlm-te_el")

 text = "UAGCUUAUCAGACUGAUGUUGA"
 input = tokenizer(text, return_tensors="pt")
@@ -195,8 +195,8 @@ import torch
 from multimolecule import RnaTokenizer, UtrLmForSequencePrediction


-tokenizer = RnaTokenizer.from_pretrained("multimolecule/utrlm.te_el")
-model = UtrLmForSequencePrediction.from_pretrained("multimolecule/utrlm.te_el")
+tokenizer = RnaTokenizer.from_pretrained("multimolecule/utrlm-te_el")
+model = UtrLmForSequencePrediction.from_pretrained("multimolecule/utrlm-te_el")

 text = "UAGCUUAUCAGACUGAUGUUGA"
 input = tokenizer(text, return_tensors="pt")
@@ -216,8 +216,8 @@ import torch
 from multimolecule import RnaTokenizer, UtrLmForTokenPrediction


-tokenizer = RnaTokenizer.from_pretrained("multimolecule/utrlm.te_el")
-model = UtrLmForTokenPrediction.from_pretrained("multimolecule/utrlm.te_el")
+tokenizer = RnaTokenizer.from_pretrained("multimolecule/utrlm-te_el")
+model = UtrLmForTokenPrediction.from_pretrained("multimolecule/utrlm-te_el")

 text = "UAGCUUAUCAGACUGAUGUUGA"
 input = tokenizer(text, return_tensors="pt")
@@ -237,8 +237,8 @@ import torch
 from multimolecule import RnaTokenizer, UtrLmForContactPrediction


-tokenizer = RnaTokenizer.from_pretrained("multimolecule/utrlm.te_el")
-model = UtrLmForContactPrediction.from_pretrained("multimolecule/utrlm.te_el")
+tokenizer = RnaTokenizer.from_pretrained("multimolecule/utrlm-te_el")
+model = UtrLmForContactPrediction.from_pretrained("multimolecule/utrlm-te_el")

 text = "UAGCUUAUCAGACUGAUGUUGA"
 input = tokenizer(text, return_tensors="pt")
4 changes: 2 additions & 2 deletions multimolecule/tokenisers/dna/tokenization_dna.py
@@ -98,14 +98,14 @@ def __init__(
         )
         self.replace_U_with_T = replace_U_with_T
         self.nmers = nmers
-        self.condon = codon
+        self.codon = codon

     def _tokenize(self, text: str, **kwargs):
         if self.do_upper_case:
             text = text.upper()
         if self.replace_U_with_T:
             text = text.replace("U", "T")
-        if self.condon:
+        if self.codon:
             if len(text) % 3 != 0:
                 raise ValueError(
                     f"length of input sequence must be a multiple of 3 for codon tokenization, but got {len(text)}"
4 changes: 2 additions & 2 deletions multimolecule/tokenisers/dot_bracket/tokenization_db.py
@@ -88,10 +88,10 @@ def __init__(
             **kwargs,
         )
         self.nmers = nmers
-        self.condon = codon
+        self.codon = codon

     def _tokenize(self, text: str, **kwargs):
-        if self.condon:
+        if self.codon:
             if len(text) % 3 != 0:
                 raise ValueError(
                     f"length of input sequence must be a multiple of 3 for codon tokenization, but got {len(text)}"
4 changes: 2 additions & 2 deletions multimolecule/tokenisers/rna/tokenization_rna.py
@@ -98,14 +98,14 @@ def __init__(
         )
         self.replace_T_with_U = replace_T_with_U
         self.nmers = nmers
-        self.condon = codon
+        self.codon = codon

     def _tokenize(self, text: str, **kwargs):
         if self.do_upper_case:
             text = text.upper()
         if self.replace_T_with_U:
             text = text.replace("T", "U")
-        if self.condon:
+        if self.codon:
             if len(text) % 3 != 0:
                 raise ValueError(
                     f"length of input sequence must be a multiple of 3 for codon tokenization, but got {len(text)}"
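The renamed attribute gates codon-mode tokenization: the input length must be a multiple of 3, and the sequence is then split into non-overlapping 3-mers. A minimal self-contained sketch of that behaviour; the `split_codons` helper is illustrative, not the repository's API:

```python
def split_codons(text: str) -> list[str]:
    """Split a nucleotide sequence into non-overlapping codons (3-mers)."""
    if len(text) % 3 != 0:
        raise ValueError(
            f"length of input sequence must be a multiple of 3 for codon tokenization, but got {len(text)}"
        )
    return [text[i : i + 3] for i in range(0, len(text), 3)]


print(split_codons("UAGCUUAUC"))  # ['UAG', 'CUU', 'AUC']
```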

