From 580bb015ff519539ab35f3e60a7f19741090eb90 Mon Sep 17 00:00:00 2001 From: Zhiyuan Chen Date: Thu, 19 Sep 2024 22:47:53 +0800 Subject: [PATCH] add RYOS dataset Signed-off-by: Zhiyuan Chen --- .pre-commit-config.yaml | 2 +- docs/docs/datasets/ryos.md | 9 ++ docs/mkdocs.yml | 1 + multimolecule/datasets/conversion_utils.py | 2 +- multimolecule/datasets/ryos/README.md | 135 +++++++++++++++++++++ multimolecule/datasets/ryos/ryos.py | 84 +++++++++++++ 6 files changed, 231 insertions(+), 2 deletions(-) create mode 100644 docs/docs/datasets/ryos.md create mode 100644 multimolecule/datasets/ryos/README.md create mode 100644 multimolecule/datasets/ryos/ryos.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9a38cf54..15232962 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -45,7 +45,7 @@ repos: hooks: - id: prettier files: multimolecule - exclude: multimolecule/datasets/bprna/README.md + exclude: multimolecule/datasets/.*/README.md - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.5.0 hooks: diff --git a/docs/docs/datasets/ryos.md b/docs/docs/datasets/ryos.md new file mode 100644 index 00000000..1e1ebd33 --- /dev/null +++ b/docs/docs/datasets/ryos.md @@ -0,0 +1,9 @@ +--- +authors: + - Zhiyuan Chen +date: 2024-05-04 +--- + +# RYOS + +--8<-- "multimolecule/datasets/ryos/README.md:21:" diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 53875f32..5cef7518 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -22,6 +22,7 @@ nav: - bpRNA-1m: datasets/bprna.md - bpRNA-spot: datasets/bprna-spot.md - bpRNA-new: datasets/bprna-new.md + - RYOS: datasets/ryos.md - module: - module/index.md - heads: module/heads.md diff --git a/multimolecule/datasets/conversion_utils.py b/multimolecule/datasets/conversion_utils.py index 7bf90012..c27b9d44 100644 --- a/multimolecule/datasets/conversion_utils.py +++ b/multimolecule/datasets/conversion_utils.py @@ -50,7 +50,7 @@ def write_data( elif isinstance(data, dict): data = 
Table.from_pydict(data) elif isinstance(data, DataFrame): - data = Table.from_pandas(data) + data = Table.from_pandas(data, preserve_index=False) if not isinstance(data, Table): raise ValueError("Data must be a list, dict, pandas DataFrame, or pyarrow Table.") diff --git a/multimolecule/datasets/ryos/README.md b/multimolecule/datasets/ryos/README.md new file mode 100644 index 00000000..339822b9 --- /dev/null +++ b/multimolecule/datasets/ryos/README.md @@ -0,0 +1,135 @@ +--- +language: rna +tags: + - Biology + - RNA +license: + - agpl-3.0 +size_categories: + - 1K Machine learning has been at the forefront of the movement for free and open access to research. +> +> We see no role for closed access or author-fee publication in the future of machine learning research and believe the adoption of these journals as an outlet of record for the machine learning community would be a retrograde step. + +The MultiMolecule team is committed to the principles of open access and open science. + +We do NOT endorse the publication of manuscripts in Closed Access / Author-Fee journals and encourage the community to support Open Access journals and conferences. + +Please consider signing the [Statement on Nature Machine Intelligence](https://openaccess.engineering.oregonstate.edu). + +## Disclaimer + +This is an UNOFFICIAL release of the [RYOS](https://www.kaggle.com/competitions/stanford-covid-vaccine) by Hannah K. Wayment-Steele, et al. 
+ +**The team releasing RYOS did not write this dataset card for this dataset so this dataset card has been written by the MultiMolecule team.** + +## Dataset Description + +- **Homepage**: https://multimolecule.danling.org/datasets/ryos +- **Point of Contact**: [Rhiju Das](https://biochemistry.stanford.edu/people/rhiju-das/) +- **Kaggle Challenge**: https://www.kaggle.com/competitions/stanford-covid-vaccine +- **Eterna Round 1**: https://eternagame.org/labs/9830365 +- **Eterna Round 2**: https://eternagame.org/labs/10207059 + +## Example Entry + +| id | design | sequence | secondary_structure | reactivity | errors_reactivity | signal_to_noise_reactivity | deg_pH10 | errors_deg_pH10 | signal_to_noise_deg_pH10 | deg_50C | errors_deg_50C | signal_to_noise_deg_50C | deg_Mg_pH10 | errors_deg_Mg_pH10 | signal_to_noise_deg_Mg_pH10 | deg_Mg_50C | errors_deg_Mg_50C | signal_to_noise_deg_Mg_50C | SN_filter | +| ------- | ------- | ------------- | ------------------- | ----------------------------- | ---------------------------- | -------------------------- | ----------------------------- | ---------------------------- | ------------------------ | --------------------------- | ---------------------------- | ----------------------- | ----------------------------- | ---------------------------- | --------------------------- | --------------------------- | ---------------------------- | -------------------------- | --------- | +| 9830366 | testing | GGAAAUUUGC... | .......(((... | [0.4167, 1.5941, 1.2359, ...] | [0.1689, 0.2323, 0.193, ...] | 5.326 | [1.5966, 2.6482, 1.3761, ...] | [0.3058, 0.3294, 0.233, ...] | 4.198 | [0.7885, 1.93, 2.0423, ...] | [0.2773, 0.328, 0.3048, ...] | 3.746 | [1.5966, 2.6482, 1.3761, ...] | [0.3058, 0.3294, 0.233, ...] | 4.198 | [0.7885, 1.93, 2.0423, ...] | [0.2773, 0.328, 0.3048, ...] | 3.746 | True | + +## Column Description + +- **id**: + A unique identifier for each RNA sequence entry. 
+ +- **design**: + The name given to each RNA design by contributors, used for easy reference. + +- **sequence**: + The nucleotide sequence of the RNA molecule, represented using the standard RNA bases: + + - **A**: Adenine + - **C**: Cytosine + - **G**: Guanine + - **U**: Uracil + +- **secondary_structure**: + The secondary structure of the RNA represented in dot-bracket notation, using dots for unpaired nucleotides and up to three types of brackets for base pairs, as per bpRNA's standard: + + - **Dots (`.`)**: Represent unpaired nucleotides. + - **Parentheses (`(` and `)`)**: Represent base pairs in standard stems (page 1). + - **Square Brackets (`[` and `]`)**: Represent base pairs in pseudoknots (page 2). + - **Curly Braces (`{` and `}`)**: Represent base pairs in additional pseudoknots (page 3). + +- **reactivity**: + A list of floating-point values that provide an estimate of the likelihood of the RNA backbone being cut at each nucleotide position. + These values help determine the stability of the RNA structure under various experimental conditions. + +- **deg_pH10** and **deg_Mg_pH10**: + Arrays of degradation rates observed under two conditions: incubation at pH 10 without and with magnesium, respectively. + These values provide insight into how different conditions affect the stability of RNA molecules. + +- **deg_50C** and **deg_Mg_50C**: + Arrays of degradation rates after incubation at 50°C, without and with magnesium, respectively. + These values capture how RNA sequences respond to elevated temperatures, which is relevant for storage and transportation conditions. + +- **errors\_\* Columns**: + Arrays of floating-point numbers indicating the experimental errors corresponding to the measurements in the **reactivity** and **deg\_\*** columns. + These values help quantify the uncertainty in the degradation rates and reactivity measurements. 
+ +- **SN_filter**: + A filter applied to the dataset based on the signal-to-noise ratio, indicating whether a specific sequence meets the dataset’s quality criteria. + + If the SN_filter is `True`, the sequence meets the quality criteria; otherwise, it does not. + +Note that due to technical limitations, the ground truth measurements are not available for the final bases of each RNA sequence, resulting in a shorter length for the provided labels compared to the full sequence. + +## Variations + +This dataset is available in two subsets: + +- [RYOS-1](https://huggingface.co/datasets/multimolecule/ryos-1): The RYOS dataset from round 1 of the Eterna RYOS lab. The sequence length for RYOS-1 is 107, and the label length is 68. +- [RYOS-2](https://huggingface.co/datasets/multimolecule/ryos-2): The RYOS dataset from round 2 of the Eterna RYOS lab. The sequence length for RYOS-2 is 130, and the label length is 102. + +## License + +This dataset is licensed under the [AGPL-3.0 License](https://www.gnu.org/licenses/agpl-3.0.html). 
+ +```spdx +SPDX-License-Identifier: AGPL-3.0-or-later +``` + +## Citation + +```bibtex +@article{waymentsteele2021deep, + author = {Wayment-Steele, Hannah K and Kladwang, Wipapat and Watkins, Andrew M and Kim, Do Soon and Tunguz, Bojan and Reade, Walter and Demkin, Maggie and Romano, Jonathan and Wellington-Oguri, Roger and Nicol, John J and Gao, Jiayang and Onodera, Kazuki and Fujikawa, Kazuki and Mao, Hanfei and Vandewiele, Gilles and Tinti, Michele and Steenwinckel, Bram and Ito, Takuya and Noumi, Taiga and He, Shujun and Ishi, Keiichiro and Lee, Youhan and {\"O}zt{\"u}rk, Fatih and Chiu, Anthony and {\"O}zt{\"u}rk, Emin and Amer, Karim and Fares, Mohamed and Participants, Eterna and Das, Rhiju}, + journal = {ArXiv}, + month = oct, + title = {Deep learning models for predicting {RNA} degradation via dual crowdsourcing}, + year = 2021 +} +``` diff --git a/multimolecule/datasets/ryos/ryos.py b/multimolecule/datasets/ryos/ryos.py new file mode 100644 index 00000000..bc89a793 --- /dev/null +++ b/multimolecule/datasets/ryos/ryos.py @@ -0,0 +1,84 @@ +# MultiMolecule +# Copyright (C) 2024-Present MultiMolecule + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+
+from __future__ import annotations
+
+import os
+
+import danling as dl
+import torch
+
+from multimolecule.datasets.conversion_utils import ConvertConfig as ConvertConfig_
+from multimolecule.datasets.conversion_utils import save_dataset
+
+# Seed torch's global RNG at import time.
+torch.manual_seed(1016)
+
+# Columns written to every split, in output order (names are those used after renaming).
+cols = [
+    "id",
+    "design",
+    "sequence",
+    "secondary_structure",
+    "reactivity",
+    "errors_reactivity",
+    "signal_to_noise_reactivity",
+    "deg_pH10",
+    "errors_deg_pH10",
+    "signal_to_noise_deg_pH10",
+    "deg_50C",
+    "errors_deg_50C",
+    "signal_to_noise_deg_50C",
+    "deg_Mg_pH10",
+    "errors_deg_Mg_pH10",
+    "signal_to_noise_deg_Mg_pH10",
+    "deg_Mg_50C",
+    "errors_deg_Mg_50C",
+    "signal_to_noise_deg_Mg_50C",
+    "SN_filter",
+]
+
+
+def convert_dataset(convert_config: ConvertConfig_) -> None:
+    """
+    Split the RYOS table by round and save each round as its own dataset.
+
+    The table loaded from `convert_config.dataset_path` must provide the columns `ID`, `design_name`,
+    `structure`, `RYOS` and `split`, plus every name in `cols` that is not produced by the renaming.
+
+    - Rows with `RYOS == 1` are saved with `train` (`public_train`), `validation` (`public_test`)
+      and `test` (`private_test`) splits.
+    - Rows with `RYOS == 2` are saved with `train` (everything except `private_test`) and `test`
+      (`private_test`) splits.
+
+    Args:
+        convert_config: Conversion settings. `repo_id` and `output_path` are suffixed with `-1` / `-2`
+            while the corresponding round is saved and are restored before returning, so the same
+            config object can be passed again without accumulating suffixes.
+    """
+    df = dl.load_pandas(convert_config.dataset_path)
+    # Explicit item access: unambiguous column assignment, unlike attribute-style `df.SN_filter = ...`.
+    df["SN_filter"] = df["SN_filter"].astype(bool)
+    df = df.rename(columns={"ID": "id", "design_name": "design", "structure": "secondary_structure"})
+    df = df.sort_values("id")
+    ryos1 = df[df["RYOS"] == 1]
+    ryos2 = df[df["RYOS"] == 2]
+    data1 = {
+        "train": ryos1[ryos1["split"] == "public_train"][cols],
+        "validation": ryos1[ryos1["split"] == "public_test"][cols],
+        "test": ryos1[ryos1["split"] == "private_test"][cols],
+    }
+    data2 = {
+        "train": ryos2[ryos2["split"] != "private_test"][cols],
+        "test": ryos2[ryos2["split"] == "private_test"][cols],
+    }
+    repo_id, output_path = convert_config.repo_id, convert_config.output_path
+    try:
+        for suffix, data in (("-1", data1), ("-2", data2)):
+            convert_config.repo_id, convert_config.output_path = repo_id + suffix, output_path + suffix
+            save_dataset(convert_config, data)
+    finally:
+        # Undo the temporary suffixes even if saving fails, leaving the caller's config untouched.
+        convert_config.repo_id, convert_config.output_path = repo_id, output_path
+
+
+class ConvertConfig(ConvertConfig_):
+    # Directory containing this script.
+    root: str = os.path.dirname(__file__)
+    # Name of the directory containing this script; `convert_dataset` appends `-1` / `-2` to it.
+    output_path: str = os.path.basename(os.path.dirname(__file__))
+
+
+if __name__ == "__main__":
+    config = ConvertConfig()
+    config.parse()  # type: ignore[attr-defined]
+    convert_dataset(config)