From 1141aabee2ce912c40eb5c2ea9490d8acbe82e76 Mon Sep 17 00:00:00 2001
From: Zhiyuan Chen <this@zyc.ai>
Date: Thu, 24 Oct 2024 18:31:39 +0800
Subject: [PATCH] add RNAStrAlign & ArchiveII datasets

Signed-off-by: Zhiyuan Chen <this@zyc.ai>
---
 docs/docs/datasets/archiveii.md               |   9 ++
 docs/docs/datasets/rnastralign.md             |   9 ++
 docs/mkdocs.yml                               |   2 +
 multimolecule/datasets/README.md              |   2 +
 multimolecule/datasets/README.zh.md           |   2 +
 multimolecule/datasets/archiveii/README.md    | 108 +++++++++++++++
 multimolecule/datasets/archiveii/archiveii.py | 105 +++++++++++++++
 multimolecule/datasets/bprna_new/README.md    |   1 +
 multimolecule/datasets/bprna_spot/README.md   |   1 +
 multimolecule/datasets/rnastralign/README.md  | 110 ++++++++++++++++
 .../datasets/rnastralign/rnastralign.py       | 123 ++++++++++++++++++
 11 files changed, 472 insertions(+)
 create mode 100644 docs/docs/datasets/archiveii.md
 create mode 100644 docs/docs/datasets/rnastralign.md
 create mode 100644 multimolecule/datasets/archiveii/README.md
 create mode 100644 multimolecule/datasets/archiveii/archiveii.py
 create mode 100644 multimolecule/datasets/rnastralign/README.md
 create mode 100644 multimolecule/datasets/rnastralign/rnastralign.py

diff --git a/docs/docs/datasets/archiveii.md b/docs/docs/datasets/archiveii.md
new file mode 100644
index 00000000..14588f82
--- /dev/null
+++ b/docs/docs/datasets/archiveii.md
@@ -0,0 +1,9 @@
+---
+authors:
+  - Zhiyuan Chen
+date: 2024-05-04
+---
+
+# ArchiveII
+
+--8<-- "multimolecule/datasets/archiveii/README.md:24:"
diff --git a/docs/docs/datasets/rnastralign.md b/docs/docs/datasets/rnastralign.md
new file mode 100644
index 00000000..a08e6399
--- /dev/null
+++ b/docs/docs/datasets/rnastralign.md
@@ -0,0 +1,9 @@
+---
+authors:
+  - Zhiyuan Chen
+date: 2024-05-04
+---
+
+# RNAStrAlign
+
+--8<-- "multimolecule/datasets/rnastralign/README.md:24:"
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index 0a7cb1ca..9c8d52de 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -22,6 +22,8 @@ nav:
           - bpRNA-1m: datasets/bprna.md
           - bpRNA-spot: datasets/bprna-spot.md
           - bpRNA-new: datasets/bprna-new.md
+          - RNAStrAlign: datasets/rnastralign.md
+          - ArchiveII: datasets/archiveii.md
           - RYOS: datasets/ryos.md
           - EternaBench-CM: datasets/eternabench-cm.md
           - EternaBench-Switch: datasets/eternabench-switch.md
diff --git a/multimolecule/datasets/README.md b/multimolecule/datasets/README.md
index fda42caa..07506e7b 100644
--- a/multimolecule/datasets/README.md
+++ b/multimolecule/datasets/README.md
@@ -25,6 +25,8 @@ date: 2024-05-04
 - [EternaBench-CM](eternabench-cm)
 - [EternaBench-Switch](eternabench-switch)
 - [EternaBench-External](eternabench-external)
+- [RNAStrAlign](rnastralign)
+- [ArchiveII](archiveii)
 
 ## Usage
 
diff --git a/multimolecule/datasets/README.zh.md b/multimolecule/datasets/README.zh.md
index a38dd356..18d34840 100644
--- a/multimolecule/datasets/README.zh.md
+++ b/multimolecule/datasets/README.zh.md
@@ -25,6 +25,8 @@ date: 2024-05-04
 - [EternaBench-CM](eternabench-cm)
 - [EternaBench-Switch](eternabench-switch)
 - [EternaBench-External](eternabench-external)
+- [RNAStrAlign](rnastralign)
+- [ArchiveII](archiveii)
 
 ## 使用
 
diff --git a/multimolecule/datasets/archiveii/README.md b/multimolecule/datasets/archiveii/README.md
new file mode 100644
index 00000000..21c0eb87
--- /dev/null
+++ b/multimolecule/datasets/archiveii/README.md
@@ -0,0 +1,108 @@
+---
+language: rna
+tags:
+  - Biology
+  - RNA
+license:
+  - agpl-3.0
+size_categories:
+  - 1K<n<10K
+source_datasets:
+  - multimolecule/bprna
+  - multimolecule/pdb
+task_categories:
+  - text-generation
+  - fill-mask
+task_ids:
+  - language-modeling
+  - masked-language-modeling
+pretty_name: ArchiveII
+library_name: multimolecule
+---
+
+# ArchiveII
+
+ArchiveII is a dataset of RNA sequences and their secondary structures, widely used in RNA secondary structure prediction benchmarks.
+
+ArchiveII contains 2975 RNA samples across 10 RNA families, with sequence lengths ranging from 28 to 2968 nucleotides.
+This dataset is frequently used to evaluate RNA secondary structure prediction methods, including those that handle both pseudoknotted and non-pseudoknotted structures.
+
+It is considered complementary to the [RNAStrAlign](./rnastralign) dataset.
+
+## Disclaimer
+
+This is an UNOFFICIAL release of the ArchiveII by Mehdi Saman Booy, et al.
+
+**The team releasing ArchiveII did not write this dataset card for this dataset so this dataset card has been written by the MultiMolecule team.**
+
+## Dataset Description
+
+- **Homepage**: https://multimolecule.danling.org/datasets/archiveii
+- **datasets**: https://huggingface.co/datasets/multimolecule/archiveii
+- **Point of Contact**: [Mehdi Saman Booy](mailto:mehdi.samanbooy@aalto.fi)
+
+## Example Entry
+
+| id                  | sequence                            | secondary_structure                  | family     |
+| ------------------- | ----------------------------------- | ------------------------------------ | ---------- |
+| 16S_rRNA-A.fulgidus | AUUCUGGUUGAUCCUGCCAGAGGCCGCUGCUA... | ...(((((...(((.))))).((((((((((....  | 16S_rRNA   |
+
+## Column Description
+
+- **id**:
+    A unique identifier for each RNA entry. This ID is derived from the family and the original `.ct` file name, and serves as a reference to the specific RNA structure within the dataset.
+
+- **sequence**:
+    The nucleotide sequence of the RNA molecule, represented using the standard RNA bases:
+
+    - **A**: Adenine
+    - **C**: Cytosine
+    - **G**: Guanine
+    - **U**: Uracil
+
+- **secondary_structure**:
+    The secondary structure of the RNA represented in dot-bracket notation, using up to three types of symbols to indicate base pairing and unpaired regions, as per bpRNA's standard:
+
+    - **Dots (`.`)**: Represent unpaired nucleotides.
+    - **Parentheses (`(` and `)`)**: Represent base pairs in standard stems (page 1).
+
+- **family**:
+    The RNA family to which the sequence belongs, such as 16S rRNA, 5S rRNA, etc.
+
+## Variations
+
+In addition to the main dataset, two length-limited variants are available:
+
+- [archiveii](https://huggingface.co/datasets/multimolecule/archiveii): The main ArchiveII dataset.
+- [archiveii.512](https://huggingface.co/datasets/multimolecule/archiveii.512): ArchiveII dataset with sequences no longer than 512 nucleotides.
+- [archiveii.1024](https://huggingface.co/datasets/multimolecule/archiveii.1024): ArchiveII dataset with sequences no longer than 1024 nucleotides.
+
+## Related Datasets
+
+- [RNAStrAlign](https://huggingface.co/datasets/multimolecule/rnastralign): A database of RNA secondary structures with the same families as ArchiveII, usually used for training.
+- [bpRNA-spot](https://huggingface.co/datasets/multimolecule/bprna-spot): Another commonly used database for RNA secondary structure prediction.
+
+## License
+
+This dataset is licensed under the [AGPL-3.0 License](https://www.gnu.org/licenses/agpl-3.0.html).
+
+```spdx
+SPDX-License-Identifier: AGPL-3.0-or-later
+```
+
+## Citation
+
+```bibtex
+@article{samanbooy2022rna,
+  author    = {Saman Booy, Mehdi and Ilin, Alexander and Orponen, Pekka},
+  journal   = {BMC Bioinformatics},
+  keywords  = {Deep learning; Pseudoknotted structures; RNA structure prediction},
+  month     = feb,
+  number    = 1,
+  pages     = {58},
+  publisher = {Springer Science and Business Media LLC},
+  title     = {{RNA} secondary structure prediction with convolutional neural networks},
+  volume    = 23,
+  year      = 2022
+}
+```
diff --git a/multimolecule/datasets/archiveii/archiveii.py b/multimolecule/datasets/archiveii/archiveii.py
new file mode 100644
index 00000000..1492d4c9
--- /dev/null
+++ b/multimolecule/datasets/archiveii/archiveii.py
@@ -0,0 +1,105 @@
+# MultiMolecule
+# Copyright (C) 2024-Present  MultiMolecule
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import annotations
+
+import os
+from collections.abc import Mapping
+from pathlib import Path
+
+import torch
+from tqdm import tqdm
+
+from multimolecule.datasets.conversion_utils import ConvertConfig as ConvertConfig_
+from multimolecule.datasets.conversion_utils import save_dataset
+
+torch.manual_seed(1016)  # NOTE(review): seeds torch's RNG, but no random call is visible in this script - confirm
+
+
+def convert_ct(file) -> Mapping:
+    """Parse one ArchiveII connectivity-table (``.ct``) file into a dataset record.
+
+    The first token of the header line is the number of bases; on each following row
+    column 2 is the nucleotide and column 5 the 1-based pairing partner (<= 0: unpaired).
+
+    Returns a mapping with ``id``, ``sequence``, ``secondary_structure`` and ``family``;
+    ``family`` comes from the file-name prefix before the first underscore.
+    Raises ``ValueError`` if a partner row does not point back at the current row.
+    """
+    path = Path(file)
+    with open(path) as handle:
+        rows = [raw.strip().split() for raw in handle]
+
+    num_bases = int(rows[0][0])
+    nucleotides = []
+    brackets = ["."] * num_bases
+
+    for i in range(1, num_bases + 1):
+        fields = rows[i]
+        nucleotides.append(fields[1])
+        pair_index = int(fields[4])
+        if pair_index <= 0:
+            continue  # unpaired base keeps its "."
+        if int(rows[pair_index][4]) != i:
+            raise ValueError(
+                f"Invalid pairing at position {i}: pair_index {pair_index} does not point back correctly."
+            )
+        # Write each pair once, when visiting its lower-indexed partner.
+        if pair_index > i:
+            brackets[i - 1] = "("
+            brackets[pair_index - 1] = ")"
+
+    prefix, name = path.stem.split("_", 1)
+    renamed = {"srp": "SRP", "grp1": "group_I_intron", "grp2": "group_II_intron"}
+    family = prefix.upper() + "_rRNA" if prefix in ("5s", "16s", "23s") else renamed.get(prefix, prefix)
+
+    return {
+        "id": family + "-" + name,
+        "sequence": "".join(nucleotides),
+        "secondary_structure": "".join(brackets),
+        "family": family,
+    }
+
+
+def convert_dataset(convert_config):
+    """Convert every ``.ct`` file in ``convert_config.dataset_path`` and save the result.
+
+    Files are processed in sorted path order; when ``convert_config.max_seq_len`` is set,
+    records with longer sequences are dropped before ``save_dataset`` gets ``test.parquet``.
+    """
+    limit = convert_config.max_seq_len
+    root = convert_config.dataset_path
+    files = sorted(os.path.join(root, name) for name in os.listdir(root) if name.endswith(".ct"))
+    data = [convert_ct(file) for file in tqdm(files, total=len(files))]
+    data = data if limit is None else [d for d in data if len(d["sequence"]) <= limit]
+    save_dataset(convert_config, data, filename="test.parquet")
+
+
+class ConvertConfig(ConvertConfig_):  # conversion options for this script, on top of the shared base config
+    max_seq_len: int | None = None  # optional length cap; None means no length-based filtering
+    root: str = os.path.dirname(__file__)  # directory that contains this script
+    output_path: str = os.path.basename(os.path.dirname(__file__))  # name of that directory
+
+    def post(self):  # NOTE(review): presumably a hook the base class runs after parsing - confirm
+        if self.max_seq_len is not None:
+            self.output_path = f"{self.output_path}.{self.max_seq_len}"  # suffix the cap: ".<max_seq_len>"
+        super().post()
+
+
+if __name__ == "__main__":  # run the conversion only when this file is executed as a script
+    config = ConvertConfig()
+    config.parse()  # type: ignore[attr-defined]
+    convert_dataset(config)  # reads config.dataset_path and config.max_seq_len
diff --git a/multimolecule/datasets/bprna_new/README.md b/multimolecule/datasets/bprna_new/README.md
index 412d5f8e..b6bf70ab 100644
--- a/multimolecule/datasets/bprna_new/README.md
+++ b/multimolecule/datasets/bprna_new/README.md
@@ -42,6 +42,7 @@ This is an UNOFFICIAL release of the bpRNA-new by Kengo Sato, et al.
 
 - [bpRNA-1m](https://huggingface.co/datasets/multimolecule/bprna): A database of single molecule secondary structures annotated using bpRNA.
 - [bpRNA-spot](https://huggingface.co/datasets/multimolecule/bprna-spot): A subset of bpRNA-1m that applies [CD-HIT (CD-HIT-EST)](https://sites.google.com/view/cd-hit) to remove sequences with more than 80% sequence similarity from bpRNA-1m.
+- [ArchiveII](https://huggingface.co/datasets/multimolecule/archiveii): A database of RNA secondary structures with the same families as RNAStrAlign, usually used for testing.
 
 ## License
 
diff --git a/multimolecule/datasets/bprna_spot/README.md b/multimolecule/datasets/bprna_spot/README.md
index f6fce85a..ddbf562a 100644
--- a/multimolecule/datasets/bprna_spot/README.md
+++ b/multimolecule/datasets/bprna_spot/README.md
@@ -44,6 +44,7 @@ This is an UNOFFICIAL release of the bpRNA-spot by Jaswinder Singh, et al.
 
 - [bpRNA-1m](https://huggingface.co/datasets/multimolecule/bprna): A database of single molecule secondary structures annotated using bpRNA.
 - [bpRNA-new](https://huggingface.co/datasets/multimolecule/bprna-new): A dataset of newly discovered RNA families from Rfam 14.2, designed for cross-family validation to assess generalization capability.
+- [RNAStrAlign](https://huggingface.co/datasets/multimolecule/rnastralign): A database of RNA secondary structures with the same families as ArchiveII, usually used for training.
 
 ## License
 
diff --git a/multimolecule/datasets/rnastralign/README.md b/multimolecule/datasets/rnastralign/README.md
new file mode 100644
index 00000000..3463dd4a
--- /dev/null
+++ b/multimolecule/datasets/rnastralign/README.md
@@ -0,0 +1,110 @@
+---
+language: rna
+tags:
+  - Biology
+  - RNA
+license:
+  - agpl-3.0
+size_categories:
+  - 10K<n<100K
+source_datasets:
+  - multimolecule/bprna
+  - multimolecule/pdb
+task_categories:
+  - text-generation
+  - fill-mask
+task_ids:
+  - language-modeling
+  - masked-language-modeling
+pretty_name: RNAStrAlign
+library_name: multimolecule
+---
+
+# RNAStrAlign
+
+RNAStrAlign is a comprehensive dataset of RNA sequences and their secondary structures.
+
+RNAStrAlign aggregates data from multiple established RNA structure repositories, covering diverse RNA families such as 5S ribosomal RNA, tRNA, and group I introns.
+
+It is considered complementary to the [ArchiveII](./archiveii) dataset.
+
+## Disclaimer
+
+This is an UNOFFICIAL release of the RNAStrAlign by Zhen Tan, et al.
+
+**The team releasing RNAStrAlign did not write this dataset card for this dataset so this dataset card has been written by the MultiMolecule team.**
+
+## Dataset Description
+
+- **Homepage**: https://multimolecule.danling.org/datasets/rnastralign
+- **datasets**: https://huggingface.co/datasets/multimolecule/rnastralign
+- **Point of Contact**: [David H. Mathews](mailto:David_Mathews@urmc.rochester.edu) and [Gaurav Sharma](mailto:gaurav.sharma@rochester.edu)
+
+## Example Entry
+
+| id                               | sequence                            | secondary_structure                  | family     | subfamily      |
+| -------------------------------- | ----------------------------------- | ------------------------------------ | ---------- | -------------- |
+| 16S_rRNA-Actinobacteria-AB002635 | ACACAUGCAAGCGAACGUGAUCUCCAGCUUGC... | .(((.(((..((..((((.(((((.((....)...  | 16S_rRNA   | Actinobacteria |
+
+## Column Description
+
+- **id**:
+    A unique identifier for each RNA entry. This ID is derived from the family, the subfamily (if any), and the original `.ct` file name, and serves as a reference to the specific RNA structure within the dataset.
+
+- **sequence**:
+    The nucleotide sequence of the RNA molecule, represented using the standard RNA bases:
+
+    - **A**: Adenine
+    - **C**: Cytosine
+    - **G**: Guanine
+    - **U**: Uracil
+
+- **secondary_structure**:
+    The secondary structure of the RNA represented in dot-bracket notation, using up to three types of symbols to indicate base pairing and unpaired regions, as per bpRNA's standard:
+
+    - **Dots (`.`)**: Represent unpaired nucleotides.
+    - **Parentheses (`(` and `)`)**: Represent base pairs in standard stems (page 1).
+
+- **family**:
+    The RNA family to which the sequence belongs, such as 16S rRNA, 5S rRNA, etc.
+
+- **subfamily**:
+    A more specific subfamily within the family, such as Actinobacteria for 16S rRNA.
+
+    Not all families have subfamilies, in which case this field will be `None`.
+
+## Variations
+
+In addition to the main dataset, two length-limited variants are available:
+
+- [rnastralign](https://huggingface.co/datasets/multimolecule/rnastralign): The main RNAStrAlign dataset.
+- [rnastralign.512](https://huggingface.co/datasets/multimolecule/rnastralign.512): RNAStrAlign dataset with sequences no longer than 512 nucleotides.
+- [rnastralign.1024](https://huggingface.co/datasets/multimolecule/rnastralign.1024): RNAStrAlign dataset with sequences no longer than 1024 nucleotides.
+
+## Related Datasets
+
+- [ArchiveII](https://huggingface.co/datasets/multimolecule/archiveii): A database of RNA secondary structures with the same families as RNAStrAlign, usually used for testing.
+- [bpRNA-spot](https://huggingface.co/datasets/multimolecule/bprna-spot): Another commonly used database for RNA secondary structure prediction.
+
+## License
+
+This dataset is licensed under the [AGPL-3.0 License](https://www.gnu.org/licenses/agpl-3.0.html).
+
+```spdx
+SPDX-License-Identifier: AGPL-3.0-or-later
+```
+
+## Citation
+
+```bibtex
+@article{tan2017turbofold,
+  author   = {Tan, Zhen and Fu, Yinghan and Sharma, Gaurav and Mathews, David H},
+  journal  = {Nucleic Acids Research},
+  month    = nov,
+  number   = 20,
+  pages    = {11570--11581},
+  title    = {{TurboFold} {II}: {RNA} structural alignment and secondary structure prediction informed by multiple homologs},
+  volume   = 45,
+  year     = 2017
+}
+```
diff --git a/multimolecule/datasets/rnastralign/rnastralign.py b/multimolecule/datasets/rnastralign/rnastralign.py
new file mode 100644
index 00000000..599cdd79
--- /dev/null
+++ b/multimolecule/datasets/rnastralign/rnastralign.py
@@ -0,0 +1,123 @@
+# MultiMolecule
+# Copyright (C) 2024-Present  MultiMolecule
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import annotations
+
+import os
+from collections.abc import Mapping
+from pathlib import Path
+
+import torch
+from tqdm import tqdm
+
+from multimolecule.datasets.conversion_utils import ConvertConfig as ConvertConfig_
+from multimolecule.datasets.conversion_utils import save_dataset
+
+torch.manual_seed(1016)  # NOTE(review): seeds torch's RNG, but no random call is visible in this script - confirm
+
+
+def convert_ct(file, family: str) -> Mapping:
+    """Convert one RNAStrAlign ``.ct`` file into a dataset record.
+
+    Rows missing from a short file are re-created as unpaired ``N`` bases. ``family`` must
+    name the ``<family>_database`` component of the path, from which the id is built.
+    Raises ``ValueError`` on a misnumbered row or a pair that does not point back.
+    """
+    file = Path(file)
+    lines = file.read_text().splitlines()
+
+    num_bases = int(lines[0].strip().split()[0])
+    sequence, dot_bracket = [], ["."] * num_bases
+
+    # Rows for `N` bases are absent from short ct files, so re-create them as unpaired placeholders.
+    if len(lines) < num_bases + 1:
+        for i in range(1, num_bases + 1):
+            placeholder = f"{i} N {i - 1} {i + 1} 0 {i}"  # last column is the index again, not a literal "i"
+            if i >= len(lines):
+                lines.append(placeholder)
+            if int(lines[i].strip().split()[0]) != i:
+                lines.insert(i, placeholder)
+
+    for i in range(1, num_bases + 1):
+        line = lines[i].strip().split()
+        if int(line[0]) != i:
+            raise ValueError(f"Invalid nucleotide index at position {i}: {line[0]} does not match the expected index.")
+        sequence.append(line[1])
+        pair_index = int(line[4])
+        if pair_index > 0:
+            if int(lines[pair_index].strip().split()[4]) != i:
+                raise ValueError(
+                    f"Invalid pairing at position {i}: pair_index {pair_index} does not point back correctly."
+                )
+            if pair_index > i:  # write each pair once, from its lower-indexed base
+                dot_bracket[i - 1] = "("
+                dot_bracket[pair_index - 1] = ")"
+
+    parts = list(file.parts[file.parts.index(family + "_database") :])
+    parts[0] = parts[0][: -len("_database")]
+    parts[-1] = parts[-1][: -len(".ct")]
+
+    return {
+        "id": "-".join(parts),
+        "sequence": "".join(sequence),
+        "secondary_structure": "".join(dot_bracket),
+        "family": family,
+        "subfamily": parts[1] if len(parts) == 3 else None,
+    }
+
+
+def _convert_dataset(family_dir, max_seq_len: int | None = None):
+    """Convert the ``.ct`` files of one ``<family>_database`` directory, optionally length-filtered."""
+    family_dir = Path(family_dir)
+    family = family_dir.name[: -len("_database")]  # `name`, not `stem`: a dot in the name must survive
+    files = [os.path.join(family_dir, f) for f in os.listdir(family_dir) if f.endswith(".ct")]
+    if not files:  # sub-directories are scanned only when the top level holds no .ct file
+        subdirs = [d for d in family_dir.iterdir() if d.is_dir()]
+        files = [os.path.join(d, f) for d in subdirs for f in os.listdir(d) if f.endswith(".ct")]
+    # Order by letters, then by the number spelled by all digits (0 when there is none, which used to
+    # crash on int("")), then by the path itself so ties do not depend on directory listing order.
+    files.sort(key=lambda f: ("".join(filter(str.isalpha, f)), int("".join(filter(str.isdigit, f)) or 0), f))
+    data = [convert_ct(file, family) for file in tqdm(files, total=len(files))]
+    return data if max_seq_len is None else [d for d in data if len(d["sequence"]) <= max_seq_len]
+
+
+def convert_dataset(convert_config):
+    """Convert every ``*_database`` directory found in ``convert_config.dataset_path``.
+
+    Directories are handled in sorted path order; all records go to ``save_dataset`` as ``train.parquet``.
+    """
+    max_seq_len = convert_config.max_seq_len
+    root = convert_config.dataset_path
+    families = sorted(os.path.join(root, name) for name in os.listdir(root) if name.endswith("_database"))
+    data = [record for family in families for record in _convert_dataset(family, max_seq_len)]
+    save_dataset(convert_config, data, filename="train.parquet")
+
+
+class ConvertConfig(ConvertConfig_):  # conversion options for this script, on top of the shared base config
+    max_seq_len: int | None = None  # optional length cap; None means no length-based filtering
+    root: str = os.path.dirname(__file__)  # directory that contains this script
+    output_path: str = os.path.basename(os.path.dirname(__file__))  # name of that directory
+
+    def post(self):  # NOTE(review): presumably a hook the base class runs after parsing - confirm
+        if self.max_seq_len is not None:
+            self.output_path = f"{self.output_path}.{self.max_seq_len}"  # suffix the cap: ".<max_seq_len>"
+        super().post()
+
+
+if __name__ == "__main__":  # run the conversion only when this file is executed as a script
+    config = ConvertConfig()
+    config.parse()  # type: ignore[attr-defined]
+    convert_dataset(config)  # reads config.dataset_path and config.max_seq_len