From 84a494c1070bfbcfec66accc4eca33ee925220c7 Mon Sep 17 00:00:00 2001 From: Zhiyuan Chen Date: Tue, 8 Oct 2024 21:56:48 +0800 Subject: [PATCH] add EternaBench dataset Signed-off-by: Zhiyuan Chen --- docs/docs/datasets/eternabench-cm.md | 9 ++ docs/docs/datasets/eternabench-external.md | 9 ++ docs/docs/datasets/eternabench-switch.md | 9 ++ docs/mkdocs.yml | 3 + .../datasets/eternabench_cm/README.md | 110 +++++++++++++ .../datasets/eternabench_cm/eternabench_cm.py | 63 ++++++++ .../datasets/eternabench_external/README.md | 117 ++++++++++++++ .../eternabench_external.py | 69 +++++++++ .../datasets/eternabench_switch/README.md | 144 ++++++++++++++++++ .../eternabench_switch/eternabench_switch.py | 94 ++++++++++++ 10 files changed, 627 insertions(+) create mode 100644 docs/docs/datasets/eternabench-cm.md create mode 100644 docs/docs/datasets/eternabench-external.md create mode 100644 docs/docs/datasets/eternabench-switch.md create mode 100644 multimolecule/datasets/eternabench_cm/README.md create mode 100644 multimolecule/datasets/eternabench_cm/eternabench_cm.py create mode 100644 multimolecule/datasets/eternabench_external/README.md create mode 100644 multimolecule/datasets/eternabench_external/eternabench_external.py create mode 100644 multimolecule/datasets/eternabench_switch/README.md create mode 100644 multimolecule/datasets/eternabench_switch/eternabench_switch.py diff --git a/docs/docs/datasets/eternabench-cm.md b/docs/docs/datasets/eternabench-cm.md new file mode 100644 index 00000000..f46f9071 --- /dev/null +++ b/docs/docs/datasets/eternabench-cm.md @@ -0,0 +1,9 @@ +--- +authors: + - Zhiyuan Chen +date: 2024-05-04 +--- + +# EternaBench-CM + +--8<-- "multimolecule/datasets/eternabench_cm/README.md:21:" diff --git a/docs/docs/datasets/eternabench-external.md b/docs/docs/datasets/eternabench-external.md new file mode 100644 index 00000000..a039a807 --- /dev/null +++ b/docs/docs/datasets/eternabench-external.md @@ -0,0 +1,9 @@ +--- +authors: + - Zhiyuan Chen 
+date: 2024-05-04 +--- + +# EternaBench-External + +--8<-- "multimolecule/datasets/eternabench_external/README.md:21:" diff --git a/docs/docs/datasets/eternabench-switch.md b/docs/docs/datasets/eternabench-switch.md new file mode 100644 index 00000000..b8aa3f72 --- /dev/null +++ b/docs/docs/datasets/eternabench-switch.md @@ -0,0 +1,9 @@ +--- +authors: + - Zhiyuan Chen +date: 2024-05-04 +--- + +# EternaBench-Switch + +--8<-- "multimolecule/datasets/eternabench_switch/README.md:21:" diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 5cef7518..0a7cb1ca 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -23,6 +23,9 @@ nav: - bpRNA-spot: datasets/bprna-spot.md - bpRNA-new: datasets/bprna-new.md - RYOS: datasets/ryos.md + - EternaBench-CM: datasets/eternabench-cm.md + - EternaBench-Switch: datasets/eternabench-switch.md + - EternaBench-External: datasets/eternabench-external.md - module: - module/index.md - heads: module/heads.md diff --git a/multimolecule/datasets/eternabench_cm/README.md b/multimolecule/datasets/eternabench_cm/README.md new file mode 100644 index 00000000..aeed1fe8 --- /dev/null +++ b/multimolecule/datasets/eternabench_cm/README.md @@ -0,0 +1,110 @@ +--- +language: rna +tags: + - Biology + - RNA +license: + - agpl-3.0 +size_categories: + - 1K. 
from __future__ import annotations

import os

import danling as dl
import pandas as pd
import torch

from multimolecule.datasets.conversion_utils import ConvertConfig as ConvertConfig_
from multimolecule.datasets.conversion_utils import save_dataset

torch.manual_seed(1016)

# Columns kept in the converted dataset, in output order.
cols = [
    "id",
    "design",
    "sequence",
    "secondary_structure",
    "reactivity",
    "errors",
    "signal_to_noise",
]


def convert_dataset_(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize one raw EternaBench-CM table.

    The input frame is left untouched; a new frame is returned.

    Args:
        df: Raw table. Must provide the columns ``ID``, ``design_name``,
            ``structure``, ``sequence``, ``reactivity``, ``errors`` and a
            string-valued ``signal_to_noise`` column.

    Returns:
        A frame sorted by ``id`` that contains exactly the columns in ``cols``.
    """
    # Keep only the text after the last ":" and parse it as a float.
    # `assign` builds a new frame, so the caller's DataFrame is not modified.
    signal_to_noise = df["signal_to_noise"].str.split(":").str[-1].astype(float)
    converted = df.assign(signal_to_noise=signal_to_noise)
    converted = converted.rename(
        columns={"ID": "id", "design_name": "design", "structure": "secondary_structure"}
    )
    return converted.sort_values("id")[cols]


def convert_dataset(convert_config):
    """Load the train and test tables, normalize both and save them as one dataset.

    Args:
        convert_config: Configuration object providing ``train_path`` and
            ``test_path``; it is also forwarded unchanged to ``save_dataset``.
    """
    train = dl.load_pandas(convert_config.train_path)
    test = dl.load_pandas(convert_config.test_path)
    save_dataset(convert_config, {"train": convert_dataset_(train), "test": convert_dataset_(test)})


class ConvertConfig(ConvertConfig_):
    """Conversion settings with defaults derived from this module's location."""

    # Directory that contains this script.
    root: str = os.path.dirname(__file__)
    # Name of that directory with underscores replaced by hyphens.
    output_path: str = os.path.basename(os.path.dirname(__file__)).replace("_", "-")


if __name__ == "__main__":
    config = ConvertConfig()
    config.parse()  # type: ignore[attr-defined]
    convert_dataset(config)

# NOTE(review): patch fragment that shared this physical line with the module above; kept verbatim.
# diff --git a/multimolecule/datasets/eternabench_external/README.md b/multimolecule/datasets/eternabench_external/README.md new file mode 100644 index 00000000..c865d65a --- /dev/null +++ b/multimolecule/datasets/eternabench_external/README.md @@ -0,0 +1,117 @@ +--- +language: rna +tags: + - Biology + - RNA +license: + - agpl-3.0 +size_categories: + - 1K.
from __future__ import annotations

import os
from pathlib import Path

import danling as dl
import pandas as pd
import torch

from multimolecule.datasets.conversion_utils import ConvertConfig as ConvertConfig_
from multimolecule.datasets.conversion_utils import save_dataset

torch.manual_seed(1016)


# Columns kept in the converted dataset, in output order.
cols = ["name", "sequence", "reactivity", "seqpos", "class", "dataset"]


def convert_dataset_(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize one raw EternaBench-External table.

    The input frame is left untouched; a new frame is returned.

    Args:
        df: Raw table. Must provide the columns ``name``, ``sequence``,
            ``reactivity``, ``seqpos``, ``orig_seqpos``, ``Class`` and
            ``Dataset``.

    Returns:
        A frame sorted by ``name`` that contains exactly the columns in
        ``cols``; its ``seqpos`` column holds the values of the input's
        ``orig_seqpos`` column.

    Raises:
        KeyError: If ``df`` has no ``seqpos`` column to discard.
    """
    # Discard the raw `seqpos` so that `orig_seqpos` can take over its name.
    # A non-inplace drop keeps the caller's DataFrame intact, so the function
    # can safely be called more than once on the same input.
    converted = df.drop(columns="seqpos")
    converted = converted.rename(
        columns={
            "Class": "class",
            "Dataset": "dataset",
            "orig_seqpos": "seqpos",
        }
    )
    return converted.sort_values("name")[cols]


def convert_dataset(convert_config):
    """Load the table at ``dataset_path``, normalize it and save it as the ``test`` split.

    Args:
        convert_config: Configuration object providing ``dataset_path``; it is
            also forwarded unchanged to ``save_dataset``.
    """
    raw = dl.load_pandas(convert_config.dataset_path)
    save_dataset(convert_config, {"test": convert_dataset_(raw)})


class ConvertConfig(ConvertConfig_):
    """Conversion settings whose default output path is derived from the dataset file name."""

    # Directory that contains this script.
    root: str = os.path.dirname(__file__)

    def post(self):
        """Fill in ``output_path`` when it is unset, then defer to the parent ``post``."""
        if not self.output_path:
            dataset_name = Path(self.dataset_path).stem
            # Third "_"-separated token of the file stem with its first six
            # characters removed. Raises IndexError when the stem has fewer
            # than three tokens.
            # NOTE(review): presumably that token is a fixed six-character
            # prefix followed by the sequence length -- confirm against the
            # actual dataset file names.
            seq_length = dataset_name.split("_")[2][6:]
            self.output_path = os.path.basename(os.path.dirname(__file__)).replace("_", "-") + f".{seq_length}"
        super().post()


if __name__ == "__main__":
    config = ConvertConfig()
    config.parse()  # type: ignore[attr-defined]
    convert_dataset(config)

# NOTE(review): patch fragment that shared this physical line with the module above; kept verbatim.
# diff --git a/multimolecule/datasets/eternabench_switch/README.md b/multimolecule/datasets/eternabench_switch/README.md new file mode 100644 index 00000000..0081cdb9 --- /dev/null +++ b/multimolecule/datasets/eternabench_switch/README.md @@ -0,0 +1,144 @@ +--- +language: rna +tags: + - Biology + - RNA +license: + - agpl-3.0 +size_categories: + - 1K.
from __future__ import annotations

import os

import danling as dl
import pandas as pd
import torch

from multimolecule.datasets.conversion_utils import ConvertConfig as ConvertConfig_
from multimolecule.datasets.conversion_utils import save_dataset

torch.manual_seed(1016)

# Columns kept in the converted dataset, in output order.
cols = [
    "id",
    "design",
    "sequence",
    "activation_ratio",
    "ligand",
    "switch",
    "kd_off",
    "kd_on",
    "kd_fmn",
    "kd_no_fmn",
    "min_kd_val",
    "ms2_aptamer",
    "lig_aptamer",
    "ms2_lig_aptamer",
    "log_kd_nolig",
    "log_kd_lig",
    "log_kd_nolig_scaled",
    "log_kd_lig_scaled",
    "log_AR",
    "folding_subscore",
    "num_clusters",
]


def convert_dataset_(df: pd.DataFrame):
    """Normalize one raw EternaBench-Switch table.

    Args:
        df: Raw table carrying the source column names listed in the rename
            map below, plus the columns of ``cols`` that are not renamed.

    Returns:
        A new frame sorted by ``id`` that contains exactly the columns in
        ``cols``; ``df`` itself is not modified.
    """
    # Source column name -> output column name. Columns missing from this map
    # keep their original names.
    renames = {
        "index": "id",
        "Design": "design",
        "Activation Ratio": "activation_ratio",
        "Folding_Subscore": "folding_subscore",
        "KDOFF": "kd_off",
        "KDON": "kd_on",
        "KDFMN": "kd_fmn",
        "KDnoFMN": "kd_no_fmn",
        "NumberOfClusters": "num_clusters",
        "logkd_nolig": "log_kd_nolig",
        "logkd_lig": "log_kd_lig",
        "logkd_nolig_scaled": "log_kd_nolig_scaled",
        "logkd_lig_scaled": "log_kd_lig_scaled",
        "MS2_aptamer": "ms2_aptamer",
        "MS2_lig_aptamer": "ms2_lig_aptamer",
    }
    return df.rename(columns=renames).sort_values("id")[cols]


def convert_dataset(convert_config):
    """Load the train and test tables, normalize both and save them as one dataset.

    Args:
        convert_config: Configuration object providing ``train_path`` and
            ``test_path``; it is also forwarded unchanged to ``save_dataset``.
    """
    train_split = convert_dataset_(dl.load_pandas(convert_config.train_path))
    test_split = convert_dataset_(dl.load_pandas(convert_config.test_path))
    save_dataset(convert_config, {"train": train_split, "test": test_split})


class ConvertConfig(ConvertConfig_):
    """Conversion settings with defaults derived from this module's location."""

    # Directory that contains this script.
    root: str = os.path.dirname(__file__)
    # Name of that directory with underscores replaced by hyphens.
    output_path: str = os.path.basename(os.path.dirname(__file__)).replace("_", "-")


if __name__ == "__main__":
    config = ConvertConfig()
    config.parse()  # type: ignore[attr-defined]
    convert_dataset(config)