from __future__ import annotations

import os

import danling as dl
import pandas as pd
import torch

from multimolecule.datasets.conversion_utils import ConvertConfig as ConvertConfig_
from multimolecule.datasets.conversion_utils import save_dataset

# Seeds torch's global RNG as a module-level side effect (runs on import).
# NOTE(review): presumably for reproducibility of anything random done by
# `save_dataset` -- confirm against conversion_utils.
torch.manual_seed(1016)

# Columns kept from the raw tables, in the order they are exported.
cols = [
    "ID",
    "design_name",
    "sequence",
    "structure",
    "reactivity",
    "errors",
    "signal_to_noise",
]


def convert_dataset_(df: pd.DataFrame) -> pd.DataFrame:
    """Select the exported columns and parse ``signal_to_noise`` into floats.

    Every ``signal_to_noise`` entry is split on ``":"`` and the last piece is
    cast to ``float``; an entry of the form ``"x:5.2"`` becomes ``5.2``.

    Args:
        df: Raw table containing at least the columns listed in ``cols``.
            ``signal_to_noise`` must hold strings.

    Returns:
        A new DataFrame restricted to ``cols`` with a float
        ``signal_to_noise`` column. The input frame is not modified.

    Raises:
        KeyError: If any column in ``cols`` is missing from ``df``.
        AttributeError: If ``signal_to_noise`` does not hold string values.
        ValueError: If the text after the last ``":"`` is not a valid float.
    """
    # Work on an explicit copy: assigning into the bare result of `df[cols]`
    # raises SettingWithCopyWarning on pandas without Copy-on-Write.
    df = df[cols].copy()
    df["signal_to_noise"] = df["signal_to_noise"].str.split(":").str[-1].astype(float)
    return df


def convert_dataset(convert_config: ConvertConfig) -> None:
    """Load the train and test tables, convert them, and save both splits.

    Args:
        convert_config: Configuration object. This function reads its
            ``train_path`` and ``test_path`` attributes and forwards the
            whole object to ``save_dataset``.
            NOTE(review): those attributes are not declared in this file;
            presumably they come from the base config or the command line.
    """
    train = dl.load_pandas(convert_config.train_path)
    test = dl.load_pandas(convert_config.test_path)
    save_dataset(convert_config, {"train": convert_dataset_(train), "test": convert_dataset_(test)})


class ConvertConfig(ConvertConfig_):
    # Conversion settings for this dataset; everything except `root` is
    # inherited from the shared `ConvertConfig_` base.

    # Directory containing this script.
    root: str = os.path.dirname(__file__)


if __name__ == "__main__":
    config = ConvertConfig()
    config.parse()  # type: ignore[attr-defined]
    convert_dataset(config)