add support of RnaMsm

Signed-off-by: Zhiyuan Chen <[email protected]>
DLS5-Omics · Apr 2, 2024 · 5f2f9dc · 5f2f9dc
1 parent b5fc0bf
commit 5f2f9dc
Show file tree

Hide file tree

Showing 5 changed files with 1,468 additions and 1 deletion.
diff --git a/multimolecule/models/__init__.py b/multimolecule/models/__init__.py
@@ -1,10 +1,17 @@
+from ..tokenizers.rna import RnaTokenizer
 from .rnabert import (
     RnaBertConfig,
     RnaBertForMaskedLM,
     RnaBertForSequenceClassification,
     RnaBertForTokenClassification,
     RnaBertModel,
-    RnaTokenizer,
+)
+from .rnamsm import (
+    RnaMsmConfig,
+    RnaMsmForMaskedLM,
+    RnaMsmForSequenceClassification,
+    RnaMsmForTokenClassification,
+    RnaMsmModel,
 )
 
 __all__ = [
@@ -13,5 +20,10 @@
     "RnaBertForMaskedLM",
     "RnaBertForSequenceClassification",
     "RnaBertForTokenClassification",
+    "RnaMsmConfig",
+    "RnaMsmModel",
+    "RnaMsmForMaskedLM",
+    "RnaMsmForSequenceClassification",
+    "RnaMsmForTokenClassification",
     "RnaTokenizer",
 ]
diff --git a/multimolecule/models/rnamsm/__init__.py b/multimolecule/models/rnamsm/__init__.py
@@ -0,0 +1,36 @@
+from transformers import (
+    AutoConfig,
+    AutoModel,
+    AutoModelForMaskedLM,
+    AutoModelForSequenceClassification,
+    AutoModelForTokenClassification,
+    AutoModelWithLMHead,
+    AutoTokenizer,
+)
+
+from multimolecule.tokenizers.rna import RnaTokenizer
+
+from .configuration_rnamsm import RnaMsmConfig
+from .modeling_rnamsm import (
+    RnaMsmForMaskedLM,
+    RnaMsmForSequenceClassification,
+    RnaMsmForTokenClassification,
+    RnaMsmModel,
+)
+
+__all__ = [
+    "RnaMsmConfig",
+    "RnaMsmModel",
+    "RnaTokenizer",
+    "RnaMsmForMaskedLM",
+    "RnaMsmForSequenceClassification",
+    "RnaMsmForTokenClassification",
+]
+
+# Register the RnaMsm classes with the transformers Auto* factories so that the
+# "rnamsm" model type / RnaMsmConfig resolve to the implementations in this package.
+AutoConfig.register("rnamsm", RnaMsmConfig)
+AutoModel.register(RnaMsmConfig, RnaMsmModel)
+AutoModelForMaskedLM.register(RnaMsmConfig, RnaMsmForMaskedLM)
+AutoModelForSequenceClassification.register(RnaMsmConfig, RnaMsmForSequenceClassification)
+AutoModelForTokenClassification.register(RnaMsmConfig, RnaMsmForTokenClassification)
+# The LM-head factory must resolve to the masked-LM model, not the token classifier.
+AutoModelWithLMHead.register(RnaMsmConfig, RnaMsmForMaskedLM)
+AutoTokenizer.register(RnaMsmConfig, RnaTokenizer)
diff --git a/multimolecule/models/rnamsm/configuration_rnamsm.py b/multimolecule/models/rnamsm/configuration_rnamsm.py
@@ -0,0 +1,106 @@
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class RnaMsmConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`RnaMsmModel`]. It is used to instantiate a
+    RnaMsm model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the RnaMsm
+    [yikunpku/RNA-MSM](https://github.com/yikunpku/RNA-MSM) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 25):
+            Vocabulary size of the RnaMsm model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`RnaMsmModel`].
+        mask_token_id (`int`, *optional*):
+            The index of the mask token in the vocabulary. This must be included in the config because of the
+            "mask-dropout" scaling trick, which will scale the inputs depending on the number of masked tokens.
+            This is not a named argument of `__init__`; when supplied it is forwarded to [`PretrainedConfig`]
+            through `**kwargs`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            The index of the padding token in the vocabulary. This must be included in the config because certain parts
+            of the RnaMsm code use this instead of the attention mask.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The index of the beginning-of-sequence token; forwarded to [`PretrainedConfig`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+
+    The remaining arguments are stored unchanged as attributes of the same name; how they are consumed is defined
+    by the modeling code rather than by this class: `hidden_act` (defaults to `"gelu"`), `position_embedding_type`
+    (defaults to `"absolute"`), `use_cache` (defaults to `True`), `classifier_dropout` (defaults to `None`),
+    `max_tokens_per_msa` (defaults to `2**14`), `attention_type` (defaults to `"standard"`), `embed_positions_msa`
+    (defaults to `True`) and `attention_bias` (defaults to `True`).
+
+    Examples:
+
+    ```python
+    >>> from multimolecule import RnaMsmModel, RnaMsmConfig
+
+    >>> # Initializing a RnaMsm style configuration
+    >>> configuration = RnaMsmConfig()
+
+    >>> # Initializing a model from the configuration
+    >>> model = RnaMsmModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+
+    model_type = "rnamsm"
+
+    def __init__(
+        self,
+        vocab_size=25,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=1024,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        pad_token_id=0,
+        bos_token_id=1,
+        position_embedding_type="absolute",
+        use_cache=True,
+        classifier_dropout=None,
+        max_tokens_per_msa=2**14,
+        attention_type="standard",
+        embed_positions_msa=True,
+        attention_bias=True,
+        **kwargs,
+    ):
+        # Special-token ids and any extra keyword arguments are handled by the base class.
+        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.position_embedding_type = position_embedding_type
+        self.use_cache = use_cache
+        self.classifier_dropout = classifier_dropout
+        self.max_tokens_per_msa = max_tokens_per_msa
+        self.attention_type = attention_type
+        self.embed_positions_msa = embed_positions_msa
+        self.attention_bias = attention_bias
diff --git a/multimolecule/models/rnamsm/convert_checkpoint.py b/multimolecule/models/rnamsm/convert_checkpoint.py
@@ -0,0 +1,101 @@
+import os
+from typing import Optional
+
+import chanfig
+import torch
+from torch import nn
+
+from multimolecule.models import RnaMsmConfig as Config
+from multimolecule.models import RnaMsmForMaskedLM as Model
+from multimolecule.tokenizers.rna.utils import get_special_tokens_map, get_tokenizer_config, get_vocab_list
+
+try:
+    from huggingface_hub import HfApi
+except ImportError:
+    # huggingface_hub is optional: it is only required when pushing to the hub,
+    # where a missing HfApi is reported explicitly.
+    HfApi = None
+
+# Hyperparameters used to build the target config. `vocab_size` is overwritten
+# with len(vocab_list) in convert_checkpoint before the model is instantiated.
+CONFIG = {
+    "architectures": ["RnaMsmModel"],
+    "attention_probs_dropout_prob": 0.1,
+    "hidden_act": "gelu",
+    "hidden_dropout_prob": 0.1,
+    "hidden_size": 768,
+    "intermediate_size": 3072,
+    "max_position_embeddings": 1024,
+    "num_attention_heads": 12,
+    "num_hidden_layers": 10,
+    "vocab_size": 25,
+    "pad_token_id": 0,
+    "embed_positions_msa": True,
+}
+
+# Token order of the original checkpoint: position i is the row of the original
+# word-embedding matrix / lm_head bias that _convert_checkpoint reads for that token.
+original_vocab_list = ["<cls>", "<pad>", "<eos>", "<unk>", "A", "G", "C", "U", "X", "N", "-", "<mask>"]
+# Token order of the target vocabulary: a token's position here is the row it is
+# written to during conversion.
+vocab_list = get_vocab_list()
+
+
+def _convert_checkpoint(config, original_state_dict):
+    """
+    Rename the keys of an original state dict and remap vocabulary-indexed tensors to the new vocabulary.
+
+    Args:
+        config: Model config; `vocab_size`, `hidden_size` and `pad_token_id` size the new embedding.
+        original_state_dict: Mapping from original parameter names to tensors.
+
+    Returns:
+        A new dict keyed by the converted names. The word-embedding weight and `lm_head.weight` both hold the
+        remapped embedding values, and `lm_head.bias` is remapped the same way.
+    """
+    # Substring renames, applied to every key in this order.
+    renames = (
+        ("layers", "rnamsm.encoder.layer"),
+        ("msa_position_embedding", "rnamsm.embeddings.msa_embeddings"),
+        ("embed_tokens", "rnamsm.embeddings.word_embeddings"),
+        ("embed_positions", "rnamsm.embeddings.position_embeddings"),
+        ("emb_layer_norm_before", "rnamsm.embeddings.layer_norm"),
+        ("emb_layer_norm_after", "rnamsm.encoder.layer_norm"),
+    )
+    converted = {}
+    for name, tensor in original_state_dict.items():
+        for old, new in renames:
+            name = name.replace(old, new)
+        converted[name] = tensor
+
+    embedding_key = "rnamsm.embeddings.word_embeddings.weight"
+    # Rows for tokens that exist only in the new vocabulary keep nn.Embedding's
+    # initial values; their bias entries stay zero.
+    embedding = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+    bias = torch.zeros(config.vocab_size)
+    for source_row, token in enumerate(original_vocab_list):
+        target_row = vocab_list.index(token)
+        embedding.weight.data[target_row] = converted[embedding_key][source_row]
+        bias[target_row] = converted["lm_head.bias"][source_row]
+
+    converted[embedding_key] = embedding.weight.data
+    converted["lm_head.weight"] = embedding.weight.data
+    converted["lm_head.bias"] = bias
+    return converted
+
+
+def convert_checkpoint(convert_config):
+    """
+    Convert an original checkpoint into a saved `Model` directory and optionally upload it.
+
+    Builds the config from `CONFIG` (with `vocab_size` set to `len(vocab_list)`), loads the remapped weights,
+    saves the model with `safe_serialization` both enabled and disabled, writes `special_tokens_map.json` and
+    `tokenizer_config.json` next to it, and uploads the output directory when `push_to_hub` is set.
+
+    Args:
+        convert_config: Options object providing `checkpoint_path`, `output_path`, `push_to_hub`, `repo_id`
+            and `token`.
+
+    Raises:
+        ImportError: If `push_to_hub` is set but `HfApi` is unavailable.
+    """
+    model_config = Config.from_dict(chanfig.FlatDict(CONFIG))
+    model_config.vocab_size = len(vocab_list)
+    model = Model(model_config)
+
+    # NOTE: depending on the installed torch version's defaults, torch.load may unpickle arbitrary objects;
+    # only convert checkpoints from a trusted source.
+    original_weights = torch.load(convert_config.checkpoint_path, map_location=torch.device("cpu"))
+    model.load_state_dict(_convert_checkpoint(model_config, original_weights))
+
+    # Save once per serialization format, safe serialization first.
+    for use_safe_serialization in (True, False):
+        model.save_pretrained(convert_config.output_path, safe_serialization=use_safe_serialization)
+
+    special_tokens_file = os.path.join(convert_config.output_path, "special_tokens_map.json")
+    chanfig.NestedDict(get_special_tokens_map()).json(special_tokens_file)
+    tokenizer_config_file = os.path.join(convert_config.output_path, "tokenizer_config.json")
+    chanfig.NestedDict(get_tokenizer_config()).json(tokenizer_config_file)
+
+    if not convert_config.push_to_hub:
+        return
+    if HfApi is None:
+        raise ImportError("Please install huggingface_hub to push to the hub.")
+    hub = HfApi()
+    hub.create_repo(convert_config.repo_id, token=convert_config.token, exist_ok=True)
+    hub.upload_folder(
+        repo_id=convert_config.repo_id,
+        folder_path=convert_config.output_path,
+        token=convert_config.token,
+    )
+
+
+@chanfig.configclass
+class ConvertConfig:
+    # Options consumed by convert_checkpoint.
+    # Path of the original checkpoint passed to torch.load.
+    checkpoint_path: str
+    # Directory that receives the converted model and the tokenizer JSON files.
+    output_path: str = Config.model_type
+    # When true, the output directory is uploaded through huggingface_hub.
+    push_to_hub: bool = False
+    # NOTE(review): evaluated once at class creation from the default `output_path` above, so it does not
+    # appear to follow a user-supplied `output_path` -- confirm this is intended.
+    repo_id: str = "ZhiyuanChen/" + output_path
+    # Hub token forwarded to create_repo / upload_folder.
+    token: Optional[str] = None
+
+
+if __name__ == "__main__":
+    # Script entry point: build the options object, let it parse its values
+    # (presumably from the command line -- see chanfig), then run the conversion.
+    config = ConvertConfig()
+    config.parse()
+    convert_checkpoint(config)