Skip to content

Commit

Permalink
add Rivas dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
ZhiyuanChen committed Nov 17, 2024
1 parent 1141aab commit 35733a4
Show file tree
Hide file tree
Showing 5 changed files with 206 additions and 13 deletions.
9 changes: 9 additions & 0 deletions docs/docs/datasets/rivas.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
authors:
- Zhiyuan Chen
date: 2024-05-04
---

# RIVAS

--8<-- "multimolecule/datasets/rivas/README.md:21:"
1 change: 1 addition & 0 deletions docs/mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ nav:
- RNA:
- RNAcentral: datasets/rnacentral.md
- Rfam: datasets/rfam.md
- RIVAS: datasets/rivas.md
- bpRNA-1m: datasets/bprna.md
- bpRNA-spot: datasets/bprna-spot.md
- bpRNA-new: datasets/bprna-new.md
Expand Down
48 changes: 35 additions & 13 deletions multimolecule/datasets/bprna_new/bprna_new.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import os
from collections import namedtuple
from collections.abc import Mapping
from pathlib import Path

import torch
Expand All @@ -30,20 +31,41 @@
# Lightweight record type with the fields ``seq``, ``ss_label``, ``length``, ``name`` and ``pairs``.
RNA_SS_data = namedtuple("RNA_SS_data", "seq ss_label length name pairs")


def convert_bpseq(file) -> Mapping:
    """Convert a BPSEQ file into an ``id`` / ``sequence`` / dot-bracket record.

    Every non-blank line is expected to hold three whitespace-separated
    fields: the 1-based index of the base, the nucleotide letter, and the
    1-based index of its pairing partner (``0`` when the base is unpaired).

    Args:
        file: Location of the BPSEQ file, as a ``str`` or ``Path``.

    Returns:
        A dict with three keys: ``id`` (the file stem up to the first ``-``),
        ``sequence`` (the nucleotides joined in file order) and
        ``secondary_structure`` (dot-bracket string using ``(``, ``)``, ``.``).

    Raises:
        ValueError: If a base names an earlier base as its partner but that
            earlier base did not name it back, or if an index field is not
            an integer.
    """
    if not isinstance(file, Path):
        file = Path(file)
    with open(file) as f:
        # Blank lines (typically trailing newlines) carry no base; dropping
        # them here keeps ``num_bases`` accurate and avoids an IndexError
        # when the fields are unpacked below.
        lines = [line for line in f.read().splitlines() if line.strip()]

    num_bases = len(lines)
    sequence = []
    dot_bracket = ["."] * num_bases
    # pairs[i] is the 0-based partner recorded for base i, or -1 if none yet.
    pairs = [-1] * num_bases

    for line in lines:
        parts = line.split()
        # The file is 1-based; everything below works with 0-based indices,
        # so an unpaired base (partner column 0) becomes -1.
        index = int(parts[0]) - 1
        base = parts[1]
        paired_index = int(parts[2]) - 1

        sequence.append(base)

        if paired_index >= 0:
            if paired_index > index:
                # Brackets are written once, from the 5' side of the pair.
                dot_bracket[index] = "("
                dot_bracket[paired_index] = ")"
            elif pairs[paired_index] != index:
                # The partner was seen earlier and must already point back here.
                raise ValueError(
                    f"Inconsistent pairing: Base {index} is paired with {paired_index}, "
                    f"but {paired_index} is not paired with {index}."
                )
            pairs[index] = paired_index

    return {
        "id": file.stem.split("-")[0],
        "sequence": "".join(sequence),
        "secondary_structure": "".join(dot_bracket),
    }


def convert_dataset(convert_config):
Expand Down
101 changes: 101 additions & 0 deletions multimolecule/datasets/rivas/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
---
language: rna
tags:
- Biology
- RNA
license:
- agpl-3.0
size_categories:
- 1K<n<10K
task_categories:
- text-generation
- fill-mask
task_ids:
- language-modeling
- masked-language-modeling
pretty_name: RIVAS
library_name: multimolecule
---

# RIVAS

The RIVAS dataset is a curated collection of RNA sequences and their secondary structures, designed for training and evaluating RNA secondary structure prediction methods.
The dataset combines sequences from published studies and databases like Rfam, covering diverse RNA families such as tRNA, SRP RNA, and ribozymes.
The secondary structure data is obtained from experimentally verified structures and consensus structures from Rfam alignments, ensuring high-quality annotations for model training and evaluation.

## Disclaimer

This is an UNOFFICIAL release of the RIVAS dataset by Elena Rivas, et al.

**The team releasing RIVAS did not write this dataset card for this dataset so this dataset card has been written by the MultiMolecule team.**

## Dataset Description

- **Homepage**: https://multimolecule.danling.org/datasets/rivas
- **Point of Contact**: [Elena Rivas](mailto:[email protected])

## Example Entry

| id | sequence | secondary_structure |
| ----------------------- | ----------------------------------- | ----------------------------------- |
| AACY020454584.1_604-676 | ACUGGUUGCGGCCAGUAUAAAUAGUCUUUAAG... | ((((........)))).........((........ |

## Column Description

The converted dataset consists of the following columns, each providing specific information about the RNA secondary structures, consistent with the bpRNA standard:

- **id**:
A unique identifier for each RNA entry. This ID is derived from the original `.sta` file name and serves as a reference to the specific RNA structure within the dataset.

- **sequence**:
The nucleotide sequence of the RNA molecule, represented using the standard RNA bases:

- **A**: Adenine
- **C**: Cytosine
- **G**: Guanine
- **U**: Uracil

- **secondary_structure**:
The secondary structure of the RNA, represented in dot-bracket notation. Dots mark unpaired nucleotides, and up to three types of brackets indicate base pairs, as per bpRNA's standard:

- **Dots (`.`)**: Represent unpaired nucleotides.
- **Parentheses (`(` and `)`)**: Represent base pairs in standard stems (page 1).
- **Square Brackets (`[` and `]`)**: Represent base pairs in pseudoknots (page 2).
- **Curly Braces (`{` and `}`)**: Represent base pairs in additional pseudoknots (page 3).

## Variations

This dataset is available in three variants:

- [RIVAS](https://huggingface.co/datasets/multimolecule/rivas): Includes TrainSetA (3166 sequences) for training, TestSetA (697 sequences) for validation and TestSetB (430 sequences) for testing.
- [RIVAS-A](https://huggingface.co/datasets/multimolecule/rivas-a): Includes TrainSetA (3166 sequences) and TestSetA (697 sequences), emphasizing sequence diversity while minimizing overlap between training and test sets. Suitable for evaluating RNA secondary structure prediction models on diverse RNA families.
- [RIVAS-B](https://huggingface.co/datasets/multimolecule/rivas-b): Consists of TrainSetB (1094 sequences) and TestSetB (430 sequences) derived from Rfam alignments, offering additional structural diversity and RNA types not present in RIVAS-A. Designed for testing the generalization capability of models trained on different types of RNA structures.

## Related Datasets

- [bpRNA-spot](https://huggingface.co/datasets/multimolecule/bprna-spot): A subset of bpRNA-1m that applies [CD-HIT (CD-HIT-EST)](https://sites.google.com/view/cd-hit) to remove sequences with more than 80% sequence similarity from bpRNA-1m.
- [RNAStrAlign](https://huggingface.co/datasets/multimolecule/rnastralign): A database of RNA secondary structures with the same families as ArchiveII, usually used for training.

## License

This dataset is licensed under the [AGPL-3.0 License](https://www.gnu.org/licenses/agpl-3.0.html).

```spdx
SPDX-License-Identifier: AGPL-3.0-or-later
```

## Citation

```bibtex
@article{rivas2012a,
author = {Rivas, Elena and Lang, Raymond and Eddy, Sean R},
journal = {RNA},
month = feb,
number = 2,
pages = {193--212},
publisher = {Cold Spring Harbor Laboratory},
title = {A range of complex probabilistic models for {RNA} secondary structure prediction that includes the nearest-neighbor model and more},
volume = 18,
year = 2012
}
```
60 changes: 60 additions & 0 deletions multimolecule/datasets/rivas/rivas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# MultiMolecule
# Copyright (C) 2024-Present MultiMolecule

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

import os

import torch
from tqdm import tqdm

from multimolecule.datasets.bprna_new.bprna_new import convert_bpseq
from multimolecule.datasets.conversion_utils import ConvertConfig as ConvertConfig_
from multimolecule.datasets.conversion_utils import get_files, save_dataset

# Fix torch's global RNG seed at import time.
# NOTE(review): nothing visible in this module draws random numbers; presumably
# kept for reproducibility of helpers such as save_dataset — confirm.
torch.manual_seed(1016)


def _convert_dataset(root):
    """Convert every file that ``get_files`` finds under ``root``.

    Args:
        root: Directory handed to ``get_files``.

    Returns:
        A list with one ``convert_bpseq`` record per file, in the order
        ``get_files`` returned them.
    """
    bpseq_files = get_files(root)
    records = []
    # tqdm only wraps the iteration to show progress; it does not alter it.
    for path in tqdm(bpseq_files, total=len(bpseq_files)):
        records.append(convert_bpseq(path))
    return records


def convert_dataset(convert_config):
    """Convert the four RIVAS splits and save them as three dataset variants.

    The splits are read from the ``TrainSetA``, ``TrainSetB``, ``TestSetA``
    and ``TestSetB`` sub-directories of ``convert_config.dataset_path`` and
    saved, in this order, as:

    * the base variant (train=TrainSetA, validation=TestSetA, test=TestSetB),
    * the ``-a`` variant (train=TrainSetA, test=TestSetA),
    * the ``-b`` variant (train=TrainSetB, test=TestSetB).

    For the suffixed variants ``output_path`` and ``repo_id`` on
    ``convert_config`` are temporarily extended with the suffix; both are
    restored to their original values before returning, even on failure, so
    the caller's config is not left pointing at the ``-b`` variant.

    Args:
        convert_config: Conversion config providing ``dataset_path``,
            ``output_path`` and ``repo_id``; passed through to ``save_dataset``.
    """
    root = convert_config.dataset_path
    train_a = _convert_dataset(os.path.join(root, "TrainSetA"))
    train_b = _convert_dataset(os.path.join(root, "TrainSetB"))
    test_a = _convert_dataset(os.path.join(root, "TestSetA"))
    test_b = _convert_dataset(os.path.join(root, "TestSetB"))

    output_path, repo_id = convert_config.output_path, convert_config.repo_id
    variants = (
        ("", {"train": train_a, "validation": test_a, "test": test_b}),
        ("-a", {"train": train_a, "test": test_a}),
        ("-b", {"train": train_b, "test": test_b}),
    )
    try:
        for suffix, splits in variants:
            # The base variant is saved with the config exactly as received.
            if suffix:
                convert_config.output_path = output_path + suffix
                convert_config.repo_id = repo_id + suffix
            save_dataset(convert_config, splits)
    finally:
        # Undo the temporary suffixes so the caller's config is unchanged.
        convert_config.output_path = output_path
        convert_config.repo_id = repo_id


class ConvertConfig(ConvertConfig_):
    """Conversion config for this dataset, overriding two defaults of the base config."""

    # Directory that contains this module.
    root: str = os.path.dirname(__file__)
    # Name of the directory that contains this module, used as the default output path.
    output_path: str = os.path.basename(os.path.dirname(__file__))


# Script entry point: build the config, let it parse its inputs, then convert.
if __name__ == "__main__":
    config = ConvertConfig()
    # parse() comes from the base config class (not visible here); presumably it
    # reads command-line overrides — confirm against ConvertConfig_.
    config.parse()  # type: ignore[attr-defined]
    convert_dataset(config)

0 comments on commit 35733a4

Please sign in to comment.