From 1141aabee2ce912c40eb5c2ea9490d8acbe82e76 Mon Sep 17 00:00:00 2001 From: Zhiyuan Chen Date: Thu, 24 Oct 2024 18:31:39 +0800 Subject: [PATCH] add RNAstarlign & ArchiveII datasets Signed-off-by: Zhiyuan Chen --- docs/docs/datasets/archiveii.md | 9 ++ docs/docs/datasets/rnastralign.md | 9 ++ docs/mkdocs.yml | 2 + multimolecule/datasets/README.md | 2 + multimolecule/datasets/README.zh.md | 2 + multimolecule/datasets/archiveii/README.md | 108 +++++++++++++++ multimolecule/datasets/archiveii/archiveii.py | 105 +++++++++++++++ multimolecule/datasets/bprna_new/README.md | 1 + multimolecule/datasets/bprna_spot/README.md | 1 + multimolecule/datasets/rnastralign/README.md | 110 ++++++++++++++++ .../datasets/rnastralign/rnastralign.py | 123 ++++++++++++++++++ 11 files changed, 472 insertions(+) create mode 100644 docs/docs/datasets/archiveii.md create mode 100644 docs/docs/datasets/rnastralign.md create mode 100644 multimolecule/datasets/archiveii/README.md create mode 100644 multimolecule/datasets/archiveii/archiveii.py create mode 100644 multimolecule/datasets/rnastralign/README.md create mode 100644 multimolecule/datasets/rnastralign/rnastralign.py diff --git a/docs/docs/datasets/archiveii.md b/docs/docs/datasets/archiveii.md new file mode 100644 index 00000000..14588f82 --- /dev/null +++ b/docs/docs/datasets/archiveii.md @@ -0,0 +1,9 @@ +--- +authors: + - Zhiyuan Chen +date: 2024-05-04 +--- + +# ArchiveII + +--8<-- "multimolecule/datasets/archiveii/README.md:24:" diff --git a/docs/docs/datasets/rnastralign.md b/docs/docs/datasets/rnastralign.md new file mode 100644 index 00000000..a08e6399 --- /dev/null +++ b/docs/docs/datasets/rnastralign.md @@ -0,0 +1,9 @@ +--- +authors: + - Zhiyuan Chen +date: 2024-05-04 +--- + +# RNAStrAlign + +--8<-- "multimolecule/datasets/rnastralign/README.md:24:" diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 0a7cb1ca..9c8d52de 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -22,6 +22,8 @@ nav: - bpRNA-1m: datasets/bprna.md - 
bpRNA-spot: datasets/bprna-spot.md - bpRNA-new: datasets/bprna-new.md + - RNAStrAlign: datasets/rnastralign.md + - ArchiveII: datasets/archiveii.md - RYOS: datasets/ryos.md - EternaBench-CM: datasets/eternabench-cm.md - EternaBench-Switch: datasets/eternabench-switch.md diff --git a/multimolecule/datasets/README.md b/multimolecule/datasets/README.md index fda42caa..07506e7b 100644 --- a/multimolecule/datasets/README.md +++ b/multimolecule/datasets/README.md @@ -25,6 +25,8 @@ date: 2024-05-04 - [EternaBench-CM](eternabench-cm) - [EternaBench-Switch](eternabench-switch) - [EternaBench-External](eternabench-external) +- [RNAStrAlign](rnastralign) +- [ArchiveII](archiveii) ## Usage diff --git a/multimolecule/datasets/README.zh.md b/multimolecule/datasets/README.zh.md index a38dd356..18d34840 100644 --- a/multimolecule/datasets/README.zh.md +++ b/multimolecule/datasets/README.zh.md @@ -25,6 +25,8 @@ date: 2024-05-04 - [EternaBench-CM](eternabench-cm) - [EternaBench-Switch](eternabench-switch) - [EternaBench-External](eternabench-external) +- [RNAStrAlign](rnastralign) +- [ArchiveII](archiveii) ## 使用 diff --git a/multimolecule/datasets/archiveii/README.md b/multimolecule/datasets/archiveii/README.md new file mode 100644 index 00000000..21c0eb87 --- /dev/null +++ b/multimolecule/datasets/archiveii/README.md @@ -0,0 +1,108 @@ +--- +language: rna +tags: + - Biology + - RNA +license: + - agpl-3.0 +size_categories: + - 10K. 
+ +from __future__ import annotations + +import os +from collections.abc import Mapping +from pathlib import Path + +import torch +from tqdm import tqdm + +from multimolecule.datasets.conversion_utils import ConvertConfig as ConvertConfig_ +from multimolecule.datasets.conversion_utils import save_dataset + +torch.manual_seed(1016) + + +def convert_ct(file) -> Mapping: + if not isinstance(file, Path): + file = Path(file) + with open(file) as f: + lines = f.readlines() + + first_line = lines[0].strip().split() + num_bases = int(first_line[0]) + + sequence = [] + dot_bracket = ["."] * num_bases + + for i in range(1, num_bases + 1): + line = lines[i].strip().split() + sequence.append(line[1]) + pair_index = int(line[4]) + + if pair_index > 0: + if int(lines[pair_index].strip().split()[4]) != i: + raise ValueError( + f"Invalid pairing at position {i}: pair_index {pair_index} does not point back correctly." + ) + if pair_index > i: + dot_bracket[i - 1] = "(" + dot_bracket[pair_index - 1] = ")" + + family, name = file.stem.split("_", 1) + if family in ("5s", "16s", "23s"): + family = family.upper() + "_rRNA" + elif family == "srp": + family = family.upper() + elif family == "grp1": + family = "group_I_intron" + elif family == "grp2": + family = "group_II_intron" + id = family + "-" + name + + return { + "id": id, + "sequence": "".join(sequence), + "secondary_structure": "".join(dot_bracket), + "family": family, + } + + +def convert_dataset(convert_config): + max_seq_len = convert_config.max_seq_len + files = [ + os.path.join(convert_config.dataset_path, f) + for f in os.listdir(convert_config.dataset_path) + if f.endswith(".ct") + ] + files.sort() + data = [convert_ct(file) for file in tqdm(files, total=len(files))] + if max_seq_len is not None: + data = [d for d in data if len(d["sequence"]) <= max_seq_len] + save_dataset(convert_config, data, filename="test.parquet") + + +class ConvertConfig(ConvertConfig_): + max_seq_len: int | None = None + root: str = 
os.path.dirname(__file__) + output_path: str = os.path.basename(os.path.dirname(__file__)) + + def post(self): + if self.max_seq_len is not None: + self.output_path = f"{self.output_path}.{self.max_seq_len}" + super().post() + + +if __name__ == "__main__": + config = ConvertConfig() + config.parse() # type: ignore[attr-defined] + convert_dataset(config) diff --git a/multimolecule/datasets/bprna_new/README.md b/multimolecule/datasets/bprna_new/README.md index 412d5f8e..b6bf70ab 100644 --- a/multimolecule/datasets/bprna_new/README.md +++ b/multimolecule/datasets/bprna_new/README.md @@ -42,6 +42,7 @@ This is an UNOFFICIAL release of the bpRNA-new by Kengo Sato, et al. - [bpRNA-1m](https://huggingface.co/datasets/multimolecule/bprna): A database of single molecule secondary structures annotated using bpRNA. - [bpRNA-spot](https://huggingface.co/datasets/multimolecule/bprna-spot): A subset of bpRNA-1m that applies [CD-HIT (CD-HIT-EST)](https://sites.google.com/view/cd-hit) to remove sequences with more than 80% sequence similarity from bpRNA-1m. +- [ArchiveII](https://huggingface.co/datasets/multimolecule/archiveii): A database of RNA secondary structures with the same families as RNAStrAlign, usually used for testing. ## License diff --git a/multimolecule/datasets/bprna_spot/README.md b/multimolecule/datasets/bprna_spot/README.md index f6fce85a..ddbf562a 100644 --- a/multimolecule/datasets/bprna_spot/README.md +++ b/multimolecule/datasets/bprna_spot/README.md @@ -44,6 +44,7 @@ This is an UNOFFICIAL release of the bpRNA-spot by Jaswinder Singh, et al. - [bpRNA-1m](https://huggingface.co/datasets/multimolecule/bprna): A database of single molecule secondary structures annotated using bpRNA. - [bpRNA-new](https://huggingface.co/datasets/multimolecule/bprna-new): A dataset of newly discovered RNA families from Rfam 14.2, designed for cross-family validation to assess generalization capability. 
+- [RNAStrAlign](https://huggingface.co/datasets/multimolecule/rnastralign): A database of RNA secondary structures with the same families as ArchiveII, usually used for training. ## License diff --git a/multimolecule/datasets/rnastralign/README.md b/multimolecule/datasets/rnastralign/README.md new file mode 100644 index 00000000..3463dd4a --- /dev/null +++ b/multimolecule/datasets/rnastralign/README.md @@ -0,0 +1,110 @@ +--- +language: rna +tags: + - Biology + - RNA +license: + - agpl-3.0 +size_categories: + - 10K. + +from __future__ import annotations + +import os +from collections.abc import Mapping +from pathlib import Path + +import torch +from tqdm import tqdm + +from multimolecule.datasets.conversion_utils import ConvertConfig as ConvertConfig_ +from multimolecule.datasets.conversion_utils import save_dataset + +torch.manual_seed(1016) + + +def convert_ct(file, family: str) -> Mapping: + if not isinstance(file, Path): + file = Path(file) + with open(file) as f: + lines = f.read().splitlines() + + first_line = lines[0].strip().split() + num_bases = int(first_line[0]) + + sequence = [] + dot_bracket = ["."] * num_bases + + # `N` does not exist in the ct files, so we need to add it + if len(lines) < num_bases + 1: + for i in range(1, num_bases + 1): + if i >= len(lines): + lines.append(f"{i} N {i-1} {i+1} 0 i") # noqa: E226 + if int(lines[i].strip().split()[0]) != i: + lines.insert(i, f"{i} N {i-1} {i+1} 0 i") # noqa: E226 + + for i in range(1, num_bases + 1): + line = lines[i].strip().split() + if int(line[0]) != i: + raise ValueError(f"Invalid nucleotide index at position {i}: {line[0]} does not match the expected index.") + sequence.append(line[1]) + pair_index = int(line[4]) + + if pair_index > 0: + if int(lines[pair_index].strip().split()[4]) != i: + raise ValueError( + f"Invalid pairing at position {i}: pair_index {pair_index} does not point back correctly." 
+ ) + if pair_index > i: + dot_bracket[i - 1] = "(" + dot_bracket[pair_index - 1] = ")" + + parts = list(file.parts) + parts = parts[parts.index(family + "_database") :] + parts[0] = parts[0][:-9] + parts[-1] = parts[-1][:-3] + + return { + "id": "-".join(parts), + "sequence": "".join(sequence), + "secondary_structure": "".join(dot_bracket), + "family": family, + "subfamily": parts[1] if len(parts) == 3 else None, + } + + +def _convert_dataset(family_dir, max_seq_len: int | None = None): + family_dir = Path(family_dir) + family = family_dir.stem[:-9] + files = [os.path.join(family_dir, f) for f in os.listdir(family_dir) if f.endswith(".ct")] + if not files: + for subdir in family_dir.iterdir(): + if subdir.is_dir(): + files.extend([os.path.join(subdir, f) for f in os.listdir(subdir) if f.endswith(".ct")]) + files.sort(key=lambda f: ("".join(filter(str.isalpha, f)), int("".join(filter(str.isdigit, f))))) + data = [convert_ct(file, family) for file in tqdm(files, total=len(files))] + if max_seq_len is not None: + data = [d for d in data if len(d["sequence"]) <= max_seq_len] + return data + + +def convert_dataset(convert_config): + max_seq_len = convert_config.max_seq_len + families = [ + os.path.join(convert_config.dataset_path, f) + for f in os.listdir(convert_config.dataset_path) + if f.endswith("_database") + ] + families.sort() + data = [i for family in families for i in _convert_dataset(family, max_seq_len)] + save_dataset(convert_config, data, filename="train.parquet") + + +class ConvertConfig(ConvertConfig_): + max_seq_len: int | None = None + root: str = os.path.dirname(__file__) + output_path: str = os.path.basename(os.path.dirname(__file__)) + + def post(self): + if self.max_seq_len is not None: + self.output_path = f"{self.output_path}.{self.max_seq_len}" + super().post() + + +if __name__ == "__main__": + config = ConvertConfig() + config.parse() # type: ignore[attr-defined] + convert_dataset(config)