From 50f4bc45fe62a7296b81361d0a09d293cb2ddabb Mon Sep 17 00:00:00 2001 From: Zhiyuan Chen Date: Tue, 8 Oct 2024 21:56:48 +0800 Subject: [PATCH] add EternaBench dataset Signed-off-by: Zhiyuan Chen --- docs/docs/datasets/eternabench.md | 9 ++ docs/mkdocs.yml | 1 + multimolecule/datasets/eternabench/README.md | 97 +++++++++++++++++++ .../datasets/eternabench/eternabench.py | 60 ++++++++++++ 4 files changed, 167 insertions(+) create mode 100644 docs/docs/datasets/eternabench.md create mode 100644 multimolecule/datasets/eternabench/README.md create mode 100644 multimolecule/datasets/eternabench/eternabench.py diff --git a/docs/docs/datasets/eternabench.md b/docs/docs/datasets/eternabench.md new file mode 100644 index 00000000..833d4b2f --- /dev/null +++ b/docs/docs/datasets/eternabench.md @@ -0,0 +1,9 @@ +--- +authors: + - Zhiyuan Chen +date: 2024-05-04 +--- + +# EternaBench + +--8<-- "multimolecule/datasets/eternabench/README.md:21:" diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 5cef7518..e6bd4a30 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -23,6 +23,7 @@ nav: - bpRNA-spot: datasets/bprna-spot.md - bpRNA-new: datasets/bprna-new.md - RYOS: datasets/ryos.md + - EternaBench: datasets/eternabench.md - module: - module/index.md - heads: module/heads.md diff --git a/multimolecule/datasets/eternabench/README.md b/multimolecule/datasets/eternabench/README.md new file mode 100644 index 00000000..8ea810e8 --- /dev/null +++ b/multimolecule/datasets/eternabench/README.md @@ -0,0 +1,97 @@ +--- +language: rna +tags: + - Biology + - RNA +license: + - agpl-3.0 +size_categories: + - 1K. 
from __future__ import annotations

import os

import danling as dl
import pandas as pd
import torch

from multimolecule.datasets.conversion_utils import ConvertConfig as ConvertConfig_
from multimolecule.datasets.conversion_utils import save_dataset

# Seed torch's global RNG at import time.
# NOTE(review): nothing in this script visibly draws random numbers; presumably
# kept for parity with the other dataset converters -- confirm before removing.
torch.manual_seed(1016)

# Columns retained from the raw tables, in output order.
cols = [
    "ID",
    "design_name",
    "sequence",
    "structure",
    "reactivity",
    "errors",
    "signal_to_noise",
]


def convert_dataset_(df: pd.DataFrame) -> pd.DataFrame:
    """Select the exported columns and make ``signal_to_noise`` numeric.

    Args:
        df: Raw table. It must contain every column listed in ``cols`` and its
            ``signal_to_noise`` column must hold strings.

    Returns:
        A new frame restricted to ``cols`` whose ``signal_to_noise`` column is
        the float parsed from the text after the last ``":"`` of each entry.
        The input frame is left untouched.

    Raises:
        KeyError: If any column in ``cols`` is missing from ``df``.
    """
    # Work on an explicit copy: assigning a column on the bare ``df[cols]``
    # selection is the chained-assignment pattern that makes pandas emit
    # SettingWithCopyWarning when copy-on-write is not enabled.
    df = df[cols].copy()
    # Keep only the piece after the final ":" and cast it to float.
    df["signal_to_noise"] = df["signal_to_noise"].str.split(":").str[-1].astype(float)
    return df


def convert_dataset(convert_config: ConvertConfig) -> None:
    """Load the train and test tables, normalise them, and save both splits.

    Args:
        convert_config: Configuration providing ``train_path`` and
            ``test_path``; it is also forwarded unchanged to ``save_dataset``.
    """
    train = dl.load_pandas(convert_config.train_path)
    test = dl.load_pandas(convert_config.test_path)
    save_dataset(convert_config, {"train": convert_dataset_(train), "test": convert_dataset_(test)})


class ConvertConfig(ConvertConfig_):
    """Conversion settings for this dataset; ``root`` defaults to this file's directory."""

    root: str = os.path.dirname(__file__)


if __name__ == "__main__":
    config = ConvertConfig()
    # NOTE(review): presumably fills the config from command-line arguments -- confirm.
    config.parse()  # type: ignore[attr-defined]
    convert_dataset(config)