From e50da4a8db4f70f5cee7f34a431645299c985469 Mon Sep 17 00:00:00 2001 From: Zhiyuan Chen Date: Thu, 2 May 2024 22:15:18 +0800 Subject: [PATCH] add Dataset Signed-off-by: Zhiyuan Chen --- multimolecule/__init__.py | 8 +- multimolecule/data/__init__.py | 4 + multimolecule/data/dataset.py | 162 ++++++++++++++++++++++ multimolecule/data/utils.py | 2 + pyproject.toml | 2 + tests/data/datasets/rna/5utr.csv | 68 +++++++++ tests/data/datasets/rna/modification.json | 1 + tests/data/datasets/rna/ncrna.csv | 68 +++++++++ tests/data/datasets/rna/rnaswitches.csv | 68 +++++++++ tests/data/test_dataset.py | 18 +++ 10 files changed, 399 insertions(+), 2 deletions(-) create mode 100644 multimolecule/data/__init__.py create mode 100644 multimolecule/data/dataset.py create mode 100644 multimolecule/data/utils.py create mode 100644 tests/data/datasets/rna/5utr.csv create mode 100644 tests/data/datasets/rna/modification.json create mode 100644 tests/data/datasets/rna/ncrna.csv create mode 100644 tests/data/datasets/rna/rnaswitches.csv create mode 100644 tests/data/test_dataset.py diff --git a/multimolecule/__init__.py b/multimolecule/__init__.py index 5f606a89..b92ed53d 100644 --- a/multimolecule/__init__.py +++ b/multimolecule/__init__.py @@ -1,4 +1,5 @@ -from . import models, tokenisers +from . import data, models, tokenisers +from .data import Dataset, PandasDataset from .downstream.crispr_off_target import ( RnaBertForCrisprOffTarget, RnaFmForCrisprOffTarget, @@ -54,9 +55,12 @@ from .tokenisers import RnaTokenizer __all__ = [ - "models", + "data", + "Dataset", + "PandasDataset", "tokenisers", "RnaTokenizer", + "models", "RnaBertConfig", "RnaBertModel", "RnaBertForMaskedLM", diff --git a/multimolecule/data/__init__.py b/multimolecule/data/__init__.py new file mode 100644 index 00000000..500f7b59 --- /dev/null +++ b/multimolecule/data/__init__.py @@ -0,0 +1,4 @@ +from .dataset import Dataset, PandasDataset +from .utils import no_collate + +__all__ = ["Dataset", "PandasDataset", "no_collate"] diff --git a/multimolecule/data/dataset.py b/multimolecule/data/dataset.py new file mode 100644 index 00000000..eeeff065 --- /dev/null +++ b/multimolecule/data/dataset.py @@ -0,0 +1,162 @@ +from __future__ import annotations + +from collections import OrderedDict +from collections.abc import Mapping, Sequence + +import danling as dl +import datasets +import torch +from chanfig import FlatDict +from danling import NestedTensor +from pandas import DataFrame +from tokenizers import Tokenizer +from torch import Tensor +from transformers import AutoTokenizer, PreTrainedTokenizerBase + + +class Dataset(datasets.Dataset): + + data_cols: Sequence + feature_cols: Sequence + label_cols: Sequence + tokenizer: PreTrainedTokenizerBase | Tokenizer + sequence_cols: Sequence + rename_sequence: bool + preprocess: bool + + def post( + self, + tokenizer: Tokenizer | PreTrainedTokenizerBase | None = None, + pretrained: str | None = None, + feature_cols: Sequence | None = None, + label_cols: Sequence | None = None, + preprocess: bool = True, + rename_sequence: bool | None = None, + ): + self.sequence_cols = [k for k, v in self.features.items() if v.dtype == "string"] + + data_cols = list(self._info.features.keys()) + if label_cols is None: + label_cols = [i for i in data_cols if i not in feature_cols] if feature_cols is not None else ["label"] + if feature_cols is None: + feature_cols = [i for i in data_cols if i not in label_cols] + missing_feature_cols = set(feature_cols).difference(data_cols) + if missing_feature_cols: + raise ValueError(f"{missing_feature_cols} are specified in feature_cols, but not found in dataset.") + missing_label_cols = set(label_cols).difference(data_cols) + if missing_label_cols: + raise ValueError(f"{missing_label_cols} are specified in label_cols, but not found in dataset.") + self.feature_cols = list(feature_cols) + self.label_cols = list(label_cols) + self.data_cols = self.feature_cols + self.label_cols + + if tokenizer is None: + tokenizer = AutoTokenizer.from_pretrained(pretrained) + if tokenizer is None: # Actually means both tokenizer and pretrained is None + raise ValueError("Either tokenizer or pretrained must be specified") + self.tokenizer = tokenizer + + self.preprocess = preprocess + if self.preprocess: + self.update(self.map(self.tokenization)) + self.set_transform(self.torch_transform) + else: + self.set_transform(self.tokenize_transform) + + if rename_sequence is None: + rename_sequence = len(self.sequence_cols) == 1 + self.rename_sequence = rename_sequence + if self.rename_sequence: + sequence_col = self.sequence_cols[0] + self.update(self.rename_column(sequence_col, "input_ids")) + self.sequence_cols = ("input_ids",) + self.feature_cols = ["input_ids" if i == sequence_col else i for i in self.feature_cols] + self.label_cols = ["input_ids" if i == sequence_col else i for i in self.label_cols] + self.data_cols = ["input_ids" if i == sequence_col else i for i in self.data_cols] + + def update(self, dataset: datasets.Dataset): + # pylint: disable=W0212 + # Why datasets won't support in-place changes? + # It's just impossible to extend. + self._format_columns = dataset._format_columns + self._data = dataset._data + self._info = dataset._info + self._fingerprint = dataset._fingerprint + + def tokenize(self, string: str) -> Tensor: + return self.tokenizer(string, return_attention_mask=False)["input_ids"] + + def tokenization(self, data: Mapping[str, str]) -> Mapping[str, Tensor]: + return {col: self.tokenize(data[col]) for col in self.sequence_cols} + + def torch_transform(self, batch: Mapping) -> Mapping: + return { + k: ((dl.PNTensor(v) if len(v) == 1 else NestedTensor(v)) if k in self.sequence_cols else torch.tensor(v)) + for k, v in batch.items() + } + + def tokenize_transform(self, batch: Mapping) -> Mapping: + return { + k: ( + (dl.PNTensor(self.tokenize(v)) if len(v) == 1 else NestedTensor(self.tokenize(v))) + if k in self.sequence_cols + else torch.tensor(v) + ) + for k, v in batch.items() + } + + def __getitem__(self, key: int | slice | str | Sequence[int]) -> OrderedDict: + batch = self._getitem(key) + input = FlatDict({col: batch[col] for col in self.feature_cols}) + target = FlatDict({col: batch[col] for col in self.label_cols}) + return OrderedDict(input=input, target=target) + + def __getitems__(self, keys: Sequence) -> Sequence: # type: ignore[return-value] + return self.__getitem__(keys) + + # def __getitems__(self, keys: Sequence) -> Sequence: + # # I have NO idea why they want to de-collate batches and then re-collate. + # batch = self._getitem(keys) + # return [ + # OrderedDict( + # input={ + # col: batch[col][index] if not isinstance(batch[col], NestedTensor) else batch[col]._storage[index] + # for col in self.feature_cols + # }, + # target={ + # col: batch[col][index] if not isinstance(batch[col], NestedTensor) else batch[col]._storage[index] + # for col in self.label_cols + # }, + # ) + # for index in range(len(batch[next(iter(batch))])) + # ] + + +class PandasDataset(Dataset): + + def __init__( + self, + dataframe: DataFrame | str, + split: str, + tokenizer: Tokenizer | PreTrainedTokenizerBase | None = None, + pretrained: str | None = None, + feature_cols: Sequence | None = None, + label_cols: Sequence | None = None, + preprocess: bool = True, + rename_sequence: bool | None = None, + ): + if isinstance(dataframe, str): + dataframe = dl.load_pandas(dataframe) + if isinstance(dataframe, dict): + dataframe = DataFrame.from_dict(dataframe) + dataframe = dataframe.loc[:, ~dataframe.columns.str.contains("^Unnamed")] + table = datasets.table.InMemoryTable.from_pandas(dataframe, preserve_index=False) + super().__init__(table, split=split) + self.post( + tokenizer=tokenizer, + pretrained=pretrained, + feature_cols=feature_cols, + label_cols=label_cols, + preprocess=preprocess, + rename_sequence=rename_sequence, + ) diff --git a/multimolecule/data/utils.py b/multimolecule/data/utils.py new file mode 100644 index 00000000..958d87b7 --- /dev/null +++ b/multimolecule/data/utils.py @@ -0,0 +1,2 @@ +def no_collate(batch): + return batch diff --git a/pyproject.toml b/pyproject.toml index 47724647..1d264f6c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,8 @@ dynamic = [ dependencies = [ "chanfig>=0.0.99", "danling", + "datasets", + "tokenizers", "transformers", ] [project.urls] diff --git a/tests/data/datasets/rna/5utr.csv b/tests/data/datasets/rna/5utr.csv new file mode 100644 index 00000000..a1563d18 --- /dev/null +++ b/tests/data/datasets/rna/5utr.csv @@ -0,0 +1,68 @@ +,seq,label +0,ACCAACATGTAATTTCCACTCTTGA,-1.7812239923226838 +1,TGGTAAAATCTAGGGTTTTTTATAA,-0.275642799200671 +2,CAAAAAGTAGACGCAACATGAAAAA,-1.1482258696972798 +3,TGGCCTCGTGGATAGGACATTTGGA,-1.163470221256106 +4,TAATCGGTTCTAAATACGATTAGTG,0.629766441913879 +5,TAAAGAAGAGGTTGATGAGAAACCG,-0.0566107803048019 +6,AAGCGGTGAATAACACACAGTAAAG,0.4379278257655004 +7,ATCTCTCTAGTACAGATTGTCAATA,0.6259435716867096 +8,AGACAGCTAAAACCCTACAAAATAA,0.4289960198091346 +9,CCTTCGACGACCCACGTCCGCCTTA,0.0669630515711629 +10,CGTTGATCATGGATACTTTTTTACA,-1.4310155058894878 +11,GTACGCAAACCATCTCTCGATTTCT,0.3852448242284625 +12,GTTACCCCCTACTCCAGCTCATACT,0.1064178675873167 +13,TCCAATCTTTTGCACCACCCCTAGG,0.1675831224155592 +14,CTCCCTCAACAGGTGCCTCACGCTG,0.4482936797119086 +15,AGTAATGAGTTTCGGCATTTCAAAG,0.4487089779004714 +16,AGGATTGTGTCGCCAGTTCCACTGA,0.2266517419179321 +17,TAATATCATATAGTTCTTCTCCCCT,0.0910128870657181 +18,TAGAATCGGAAGGAATAGGATTCTA,0.6830431344740635 +19,GATGCTTGCACTCGAGGTCCGTGCA,0.7779586432740309 +20,GACACCACGTAAAATCCTAATCAAA,0.7227716269767893 +21,TCTATGACTCGTTCGCGTAGAATCA,-0.9032312196091278 +22,CAAAATGATAAGATGGACCAAAGAT,0.0447788302836263 +23,TGCATGATCTGTAGCATTTGCTGCT,-0.2827116234508076 +24,GCATGACCAGCCTGTTTAGATAGAA,-0.8848997916225679 +25,AGAAAGATAACAAACCACCCGTATG,0.5311009416510439 +26,GACCCCTTTACGCAACCTATTGAAC,0.7381938795002485 +27,GCCCCTACACTCTGTTTTTTGATCC,0.4626627635505125 +28,GGATAAATAAATCTGAGATCAGAAA,0.5764032765766933 +29,CCCTGTTGCCAGCCGCATAATCATC,0.5072462083719866 +30,GCAGCACGCTTACAGTCCCTCAGAC,0.5885318168197583 +31,CTTTTTCCTTACTCGTGATACTATC,0.3549980256557335 +32,GTAAACCCAGATCTAGTTTGACTGT,0.4340458948389251 +33,CACGCTGCACACCGAACAGCCCAAA,-0.0060640299062117 +34,ACTCCGACACCATCTTCATTACAAT,0.40927053992064 +35,TACATGGAACTGTCCCTTCTTACCG,-0.8422048835483932 +36,GACCCTCCTATTATCAACCAAGATA,0.2085787716855296 +37,AGAGTGAGAGCGCGACAAATCACTG,0.677525749419415 +38,CCGATTGGCGCCCTTTGGCCGGGAG,0.0662045936850974 +39,GAGATGAGAAGTCGTGCGAAATAAC,-1.5323635165013456 +40,GTCCTCGCGACAACTGTCCCAAACC,0.2904891214718897 +41,TTCTGATCGGTGTTCCTCCGTTCTG,0.4886688832278358 +42,TCTAGTCGTTTCTAGCATAGACTATA,0.6682424782790564 +43,GCAATGCATCCATTCCAATGCCTACT,-1.1913139865591946 +44,ACCTTGCCGCATCCCACTTGCCTGCA,0.458991405155542 +45,CAAACTGGGCCCATTTCTATACCAAT,0.1781652824101883 +46,ACGAGAGTAACAGATCCAACCTAAA,0.6019488593566376 +47,CCTACGCGGGATGCTCTTTTTTATAG,-1.1687525556467426 +48,GATCCAGGAGACAGAAACCATCTACC,0.4738692979644047 +49,TCGCAAAGAAGAACCTATTTTAAGA,0.7018982636372705 +50,TATAATTACGCTTTTCCGTGTATGG,-0.3720488657064282 +51,TCAATTACAGCTCGACTTCCATGATC,0.2754572607942968 +52,AAGCCGTTCTTTAAATCCACACATTT,0.2832481855967742 +53,AGTCCATCCTCGCGGCCTCACACCA,-0.1678433893986053 +54,AGTCCCGTCCTACACGCTCGGTCCG,0.3135193265556327 +55,CCCCATATCCGATTATCTGCTGGAC,0.5673113165112577 +56,CGTAGTGGCGCAGGACCGTCAATTA,0.3736517875688682 +57,CTCTGCTATGCCCCACCACTCAACA,0.5126163959293235 +58,ATCCACCAATCCCTACATTCATCTTC,0.5112259267226038 +59,GAGAGTGTCGCCGAAGCACAAGCCGA,0.4693891586433297 +60,CCTGTCGATCTAGGTCCTATTGTCCG,0.6496399244427643 +61,ATTTCTAACTTCTTCTGGCAACGACA,0.5061690522661538 +62,ATATACGGCAACACGCCCGAACCAGA,0.2119265981391584 +63,CCTCGTTAATCCTTCCCTTGTCTCCC,0.1640642263497583 +64,TCCCCGCCACGCCCGGTATCCGACTA,-0.0315210929562356 +65,GAAACTCGTGTTTATTCTCGTCGAT,0.7040646602119047 +66,AAGAAAACATACAAGTCTGTTCACT,0.6293633976161706 diff --git a/tests/data/datasets/rna/modification.json b/tests/data/datasets/rna/modification.json new file mode 100644 index 00000000..420dffd0 --- /dev/null +++ b/tests/data/datasets/rna/modification.json @@ -0,0 +1 @@ +{"sequence":{"0":"TTGCCACACTGCTGGACGCCTGCAAGGCCAAGGGTACGGAGGTCATCATCATCACCACCGATACCTCGCCCTCAGGCACCAAGAAGACCCGGCAGTATCTC","1":"TTTGAAAAAATATTAGCAATGTGAGGACACTTAAGCAGTTTTGTCAATTCAGCTGAATCCAGCCTCATAGCAAAATCTGGTCTTAAATTCCCTCATCGTGC","2":"AGAAACATTCAACCTCCCTTCTTTTTATTCCAGTTGTCCTTTTCTCTGACACTTGCATCAATTTTCTGATTGCCTAGGCTCTTAATATTGCTTTCTGTTCA","3":"TTAGTTTTACTATGGAATCATAATAACCCACATAGAAGACTGATATTAAGAGCACAGAAGAAATAGTCCCAATGTTTATGTCATTTAATTTGAAAAATTTC","4":"CAACAGAAGTTTCTCATCTATAATCAGTAGCACTAAACTCTTGGTTTGAAAAATATTTAGTATGGGTAATACTTGGAGTATCAGTTTTCATTAAAATGTAC","5":"AATGTGTTTGTGTGTGTCTCTCACACACACACATAACATGTACATACCTGAAACTCATACTGCAATTGCAACACATCTTAAGTTTTTCCTTTTAAACATAC","6":"AAGTAGAAGACAACAGTACTCTTTTTTTTTTGAAATGGAGTCTCACTCTCACCCAGGCTGGAGTGCAATGGTGTGATCTCGGCCCACTGCACTCCAGCCTG","7":"GCCACCACACGTGGCCACAGTTTGGGCTTTTGAAAAAAGTTAGGTGGAGGAAGAGAGGTATGAGTACTCTAGTTTTCACTGCAGTATCCCATTTGTGTGTG","8":"AGGCTGTTTTAGCTTAAGTAAAATTTAAAAATTAGTTCCTTAGTCACATTAGCCACATTTAATGTGTTCTATAGCCACGTGTGACTGGTGGCTAACATATT","9":"GCAAGTGGTGTTTGGTTACATGAATAAGTTCTTTAGTGGGGATTTCTGAAATTTTGGTGCACCCATCACCTGAGCAGTGTACACTGTATCCAATGTGTAGT","10":"TTCTCAGGATATGTTATAGGATTCTTCTGACCACTAGAGTAGAGTGAACGATATGTTTTAATGTTCAGAAGTCACTATGGAGTAAACCAAATATATATAGG","11":"TTTTCCAGGATTTCATGAAACAAAGAGTTAAGAACTACAGTAGTGGAGCAATATTCATGGTGCTTTTTCTTTTTCTTTTGAAATAATTAAAAACTTACAGA","12":"GTATTGTCGTCTCACTCTATTATCAGCCTACCTCCGGTGGCCCTTGGGGCATGTGGCTGGGCCCAGGGTGATTCATCTAGAGCCAGCTCAGGTGGCAGTGA","13":"GGTTTTTTTTTTTTTTTTTTAGTCCATCCATTCTTTGATTTAATTTGGCAAACCCACATTAGATAATTTAGCAGAAGAGGAATTATATCTTCATCCTATTA","14":"AAGAAACCTGAACCAAGGCCTTGGGTATCAGATTGGCTGGATAAGGAGGGATGAGCACAGAAGGAAGGACAAAGATAATACCTTTTTCAAGATGAGCCTGT","15":"CTCATTTTGTAAGGAGACACTTAGATGCATTTCTGAAAAAAACAAAACAAAACAAAACAAAACAAAAAACACTTTGGGCTTTCTCTGTATTCTTCAAGCAT","16":"GTAAGTGAGATTACTTTATTTATTTCTTTTTCAGATTGTTCACTGTTGGCATATAGAAATGCTACTAATTGTTGTATGTTGATTTTGTATCCTGAAACTTT","17":"CATGCCTGTAATCCCACCTACTCAGGAGGCTGACGCAGGAGAATTGCTTGAATCCGGGAGGTGGAGGTTGCAGTGAGCCAAGATCACGCCACTGCACTCCA","18":"GGGTCCAGCCCAGGCTGTTTGGTCCCAGAGCCTGTGCTCTTGTCCATTATACTGGTGGTATTGCCCCTGGCATTGACAAAGTGGGAAAAGATGACTAACCT","19":"TGGCTCACACCTGTAATCCCTGCACTTTGGGAGGCCAAGGTGAGCAGATCACTTGAGGTCAGGAGTCTTGAGACCAGCCTGGCCAACATGGTGAAGCCCTA","20":"ATATGAATGATTTGTCATTTATGTCTAATCACTAAGTAAAAATATCAATTATGATTACTTTTTAAGTTTTATTGATGCATAATTATACATATTTATGGGGT","21":"ACATCAAAAAGTTTGAAAGAGCACAAATAGACAACCAAGGGTCACACGTCATGGAACTGGAGAAACAAGAACAATAGAAACCCAAACCTAGCAGAAGAAAA","22":"CTATCAGAAATAATGAAAAAACTCACCTTTGGGATTTTCATTAGTTTGGCAATCACTTCTCCTTTTGAAAGATTGGTGGACTGTACATTTTATTATTATTA","23":"TGATCTTATTTGTTTCTGTGTCTTGAAATAGTTTGCTGTTTTGTCATCTTAGAAATTGATTCATTATTAACTCATTTATTCTCAACTATGCTAAAAAAAAG","24":"AAACCACAAAGATGGGGAGAAACCAGAGCAGAAAAGCTGAAAAGTTCAAAAAACCAGAGCACCTCTTCTCCTCCAAAGGATTGCAGTTCCTCACTGCAAAG","25":"CTACACAAGGTATTTCACAATATCCTTAGGAATTACTGAGTTTTAGAGTGACAGAATAATTACCAATTATTCTGATAGTAAATTTGTAGGTACATTATAAT","26":"GCTACCTCTACTTTTAACATATTTTAGGCATTAGGACTTGCTTAGCCTTTAATACACAGGAATATTAACTAAAATGCACATATAAAACAATTGGTTAGACA","27":"CCTGGATCTAAAAGTGTTTTTATTTTTTGTGCCCACATCTGTAGTCATGGATTTGATGTATATATTTAATAACATTCAGTGATTTATTTTTCGGTTCACCT","28":"TCTGAAGTCATAGTCCCTTGGTTTTCCCTGACCTGCCTGCTACTGCGCCCACTTGCAGCAGCACCTCCGTTGCCCAGTGAAGCATGCTGCCCTGGTCTTAC","29":"CTGGGGCGGGCGGGTCAGTTGAGGCCAGGAGTTCGAGACCAGCCTGGCCAACGTGGCAAAACCCTGTCTCTACTAAAAATACAAAAAAGTTAGCTAGGCGT","30":"AATATTGCATGGGCCATACTTATATTTTTAAAATATTCATTGTTTATCAGAATTCAAATTTAACTGGGCATCCTGTATTTTTATTAGCTAAATCTGGCAAC","31":"GACTAGCTGCAGAAAGTGACATTTACACTGGGACAGGAGTCAAAGAGTATATTGATGCAAAGGAAAGACCATGAATTAGACCTGAGTTCAAATCCTAGCCG","32":"AGAAAAAGACAGAGGTTTATAGAAGTTTTTTCCACAAAATTTATTTGTGCATTAATCGATAGGCAACATAGTGTAAAACATAGCTAGCTGAATATTCAGAA","33":"TGCCACTATTGGGGTAACCCACCCCCAATATTACAACATAGGTTCTTTCTATTTTCCATAAGTGTTGGCTGGCTGAGAAATAAAGAGAAAGAGTACAAAGA","34":"TGGAAGGAAGAATTGCTTTTCTGAGGTCAATGCTCAGCTTGGCTGTTGGCAAGTCAACCTTTAGGAATCTGTGTATTCAGGGTATAGCAGTGGAAGTATAG","35":"AAAATCAGCAGCTAGTATTTGCAAATGGTGTTTGTATTTACTCTTGAAATACATGGTTTTGTGCTGGAGATTTGGAGTAAGGAAACTTAGGCACTATAGTC","36":"TCCACTTGCTGCATTATTTTTTTCTTTCTTTTTTTTTGCTGATTATTTTTATATGAATGTTAAATGATAAAGTCTTCTACATCATATCCCATTTAAGCTGC","37":"TGTTTTTACATTGAAAGTAGACAAATAGTTTTGTCATCTGTTTCTCATCCATTTCTAATATTTAAATATAATAAAGTCTAATTGAATACAAAAACAAACAA","38":"AAAGGATGACGAAGTGTAGAGAAGAGGCCAGCCATAGGAAAAGGGGAGTCACTTATGGGAAGGTGACTAGGAAATGTGTGATATACAGGGGTTGTTAGTAA","39":"GGGCCGTCCTGAACACTGCCACCTCTGAGCGTTGGCATCCATCTGCTAGGATTAGCATTGGAGCTTTTTTTGAAGGTATTTTGAAGTCTAATGGGAGAGGA","40":"TCCCCAGGCTGGAGTGCAATGGCACAATCACAGCATACCTCCCAGGCTCAAGCAATCCTCCCACCTCAGCCTTTTGAGTAGCTGGGACCAGAAGCACGTGC","41":"ATTATGGCCCAGCCTATACCCAGAAGAGAGGACTTAACTTGTGCTCCATGAACCACTGTGTCTGGGACACTGAGTAACCTAAGAATTTTCTTTGATATGAC","42":"TCAGCCTCCCGAGTAGGTAGGATTACAGGCATGCGCCACCATGACCGATTAATTTTGTATTTTTGGTAGAGACGGGGTTTCACCATGTTGGTCAGGCTGGT","43":"AAATTCATTTTTTCAATCATTTAAGGAACTTAGATATAAAATACACCTTTAATTCACCTTTGGAAATTTTTTACAAAGTGTTTTATTTGCAAATGACAGTG","44":"ATTAGTTATTTCAGTGTTTATTTCATTTGATGAAGAAACGTTTGCATATGAATGTTGGGAATTCTAGCAGGTCCTGCCTCAATGTGAAGAGGCATTTTTTT","45":"CAGGTGCCTGCCACCATGCCTGGCTTATTTTTGTATTTTTAGTAGAGACAAGGTTTCACCAGGTTGGCCACTCCTGGTCTTGAACTCCTGACCTCAGGTGA","46":"TTTTTTTTTTTTTTTTTTTTACTGTGTCCCAGGCTTAAGAAAAAAGTGATACATGATGTGGGATTAAAATCAAGAACATCATTGAACTTCACCTTCCCTCC","47":"CGGGAGGCACGGGCCCTTCGGGGATGACGTCACGGGCGGGGGCCCCGGACACGCGAGCCTTGCGCCCCACAGACGGCGGCGCAGCCCGCCGCCCTTTTCGA","48":"TGAGGCTTAAGTGATCCTCCCACCTTAGCCTCCTAAGTAGCTGGGAGTACAAATGCACACCACCACACCTGGCTAATTTTTGTATTTTTTGTTTTGCCATG","49":"ACTCATAGCTCTATGTCTCTTATAGTTCTTAGCACAATATCTTGGCCTAGATGAAGTACATAATAATTATATGTAGGGTTGTGGAAAGCAGTGCTGGCTTT","50":"GGCTCCTTCGGAGGCAGAATATGTCAACTCGTTGGCTTCTCACAAAATCAAGTGAGTCAGAAACCTGAATGGGGTTTCGGCTGGTCTCACCTAATTAACTT","51":"TATCTACCACCTGGATTCTACAACTGACATTTTATTATACCTAGTTTTTTACATGTCTGTCCATCTGTCTCATCCATAGATCCATTTTATTTCTTTATACA","52":"CATGTATGTATACTTAACTAAGTTAATAAAAACTGTCCTATTTCTCCTGGACATTAGAGAGATCTCAGAACTCTTTAACTCCGTGTACCCACCTCCTGACT","53":"GGAGCTGGTTCAGGAGATCACACAACATTTATTCTTCTTACAGGTACATCAGTCAAGGCTACCCCCCAGTTCTGAGAGAACTTGCCCAGGAGTGGTTGCAG","54":"TTCCTGGTTGGTTGAATCACTGGATGCGGTACCCACGGATGCAGAGAGTGACTGTACAGAAAAAAAGCATCTATTGCCTTTCCAGGCCAAGCTTTCTGTCT","55":"ACATTTTAGAAAATAAAATGCACCGAACAAACATGGGGTGTTCCTACCGCAGCATGGGAAAGGCGAGGCGCCATCCCACCAAGGCGGGTGTGGTTTTGAGC","56":"GAACGAAAAGAGGAAGTAGTGAGTGAAAAGGAAAGAAGAAAACATTAAGAAGTAGAGGAAAAAGAATTAAGTCGATTAGATGCAATGAGGGAAGAGGAAAA","57":"GAGAAACAGTGACAAATTCTGAGGGGAGCCTACAGTGTATAGTGTTGTGTATAGTGTGTATAGTATATAGTGGTTGTGTATAGTGGCCTCTGCCTTTTACC","58":"CCTTGCCAATCCCCATGAAAATGTTCAGTTATGTCAAAAGCAAGGCAAAAACAGTCTCTTGGCTATACAAGGGTAGCTGTTTTATTTGACTAAAATTTAGC","59":"ATTGTAGTGCAAAGCAGCCACAGACAAAATTTAAATGAATGAACCTGGCCATATTCCAATAAAATGAATTTGAATTTCAAATAATTTTTATGTGTCATAAA","60":"TGAGAAGAAAGAAAGAAAGAAAAAGAGGGGGGGGAGGGAGAGAGAGAGAGAGAAAGGAAGGAAGGAGAAAGAAGAAAGGGAGAGGGAGAGAGAGAGAGAGG","61":"AGTACTTTCAACACTGCATGGCACATAGTAAGGGCACAATAAATGTTAATAATTATGATGGTGGTCATGATGATGATGATCATATGCTTATCTTCCATCCC","62":"GACTCTGTCACCCCCCGCCCCCTGGAAAAAATGCGTTTTTTGACTTAATGATATTTTCAATTGTGATGGGTTAATTGAGATATCACCCCACTGTAAGTTTA","63":"CATATCTCATATTTACAGATTCCTTCAGGGTAAGAAAACTTATGTCTTCTAGGGAAACCACTCCTTTTAAATCTATGTGATTTATCCTATAAGCCACTTAA","64":"AATTTAAAAAGTGTTAAGCACCATAGATGTGCATTTTTAGGAATAAGATGAGTTATTCACTGAAGAAGAGCTCTGCAGGAAGGTGAAAGCTCTCCTTTAAA","65":"ATGGGTTTTGGATTTAATGGGGCATTGGGGGAGTGAGAGGGCATCTGCAGAAAAGAGCCATCCAGGCTGCAGAACTCTTGTTTCCAGCAAATAGTCCATTG","66":"AGATACCAGGAATGACCTGATTCAGGCTAGTAAGTGACGTTTGCCTAGAGATCAGTCTAACTGGGGCTCAAGATATGGCCTAGCTGTGAAACAACAGATGA"},"label":{"0":[1,0,0,0,0,0,0,0,0,0,0,0],"1":[1,0,0,0,0,0,0,0,0,0,0,0],"2":[1,0,0,0,0,0,0,0,0,0,0,0],"3":[1,0,0,0,0,0,0,0,0,0,0,0],"4":[1,0,0,0,0,0,0,0,0,0,0,0],"5":[1,0,0,0,0,0,0,0,0,0,0,0],"6":[1,0,0,0,0,0,0,0,0,0,0,0],"7":[1,0,0,0,0,0,0,0,0,0,0,0],"8":[1,0,0,0,0,0,0,0,0,0,0,0],"9":[1,0,0,0,0,0,0,0,0,0,0,0],"10":[1,0,0,0,0,0,0,0,0,0,0,0],"11":[1,0,0,0,0,0,0,0,0,0,0,0],"12":[1,0,0,0,0,0,0,0,0,0,0,0],"13":[1,0,0,0,0,0,0,0,0,0,0,0],"14":[1,0,0,0,0,0,0,0,0,0,0,0],"15":[1,0,0,0,0,0,0,0,0,0,0,0],"16":[1,0,0,0,0,0,0,0,0,0,0,0],"17":[1,0,0,0,0,0,0,0,0,0,0,0],"18":[1,0,0,0,0,0,0,0,0,0,0,0],"19":[1,0,0,0,0,0,0,0,0,0,0,0],"20":[1,0,0,0,0,0,0,0,0,0,0,0],"21":[1,0,0,0,0,0,0,0,0,0,0,0],"22":[1,0,0,0,0,0,0,0,0,0,0,0],"23":[1,0,0,0,0,0,0,0,0,0,0,0],"24":[1,0,0,0,0,0,0,0,0,0,0,0],"25":[1,0,0,0,0,0,0,0,0,0,0,0],"26":[1,0,0,0,0,0,0,0,0,0,0,0],"27":[1,0,0,0,0,0,0,0,0,0,0,0],"28":[1,0,0,0,0,0,0,0,0,0,0,0],"29":[1,0,0,0,0,0,0,0,0,0,0,0],"30":[1,0,0,0,0,0,0,0,0,0,0,0],"31":[1,0,0,0,0,0,0,0,0,0,0,0],"32":[1,0,0,0,0,0,0,0,0,0,0,0],"33":[1,0,0,0,0,0,0,0,0,0,0,0],"34":[1,0,0,0,0,0,0,0,0,0,0,0],"35":[1,0,0,0,0,0,0,0,0,0,0,0],"36":[1,0,0,0,0,0,0,0,0,0,0,0],"37":[1,0,0,0,0,0,0,0,0,0,0,0],"38":[1,0,0,0,0,0,0,0,0,0,0,0],"39":[1,0,0,0,0,0,0,0,0,0,0,0],"40":[1,0,0,0,0,0,0,0,0,0,0,0],"41":[1,0,0,0,0,0,0,0,0,0,0,0],"42":[1,0,0,0,0,0,0,0,0,0,0,0],"43":[1,0,0,0,0,0,0,0,0,0,0,0],"44":[1,0,0,0,0,0,0,0,0,0,0,0],"45":[1,0,0,0,0,0,0,0,0,0,0,0],"46":[1,0,0,0,0,0,0,0,0,0,0,0],"47":[1,0,0,0,0,0,0,0,0,0,0,0],"48":[1,0,0,0,0,0,0,0,0,0,0,0],"49":[1,0,0,0,0,0,0,0,0,0,0,0],"50":[1,0,0,0,0,0,0,0,0,0,0,0],"51":[1,0,0,0,0,0,0,0,0,0,0,0],"52":[1,0,0,0,0,0,0,0,0,0,0,0],"53":[1,0,0,0,0,0,0,0,0,0,0,0],"54":[1,0,0,0,0,0,0,0,0,0,0,0],"55":[1,0,0,0,0,0,0,0,0,0,0,0],"56":[1,0,0,0,0,0,0,0,0,0,0,0],"57":[1,0,0,0,0,0,0,0,0,0,0,0],"58":[1,0,0,0,0,0,0,0,0,0,0,0],"59":[1,0,0,0,0,0,0,0,0,0,0,0],"60":[1,0,0,0,0,0,0,0,0,0,0,0],"61":[1,0,0,0,0,0,0,0,0,0,0,0],"62":[1,0,0,0,0,0,0,0,0,0,0,0],"63":[1,0,0,0,0,0,0,0,0,0,0,0],"64":[1,0,0,0,0,0,0,0,0,0,0,0],"65":[1,0,0,0,0,0,0,0,0,0,0,0],"66":[1,0,0,0,0,0,0,0,0,0,0,0]}} \ No newline at end of file diff --git a/tests/data/datasets/rna/ncrna.csv b/tests/data/datasets/rna/ncrna.csv new file mode 100644 index 00000000..0ee1e4b9 --- /dev/null +++ b/tests/data/datasets/rna/ncrna.csv @@ -0,0 +1,68 @@ +,sequence,label +0,AAGAATCATAGAGGTTGCTGCTGACATTAGTCGCTTGCGGAGTGCTTACAGGCACAATGAAACAGGTTAAAGGGGATGTAGCCGAAGCATTGGTGCTGTGACACCAAGTAGCTGGGCCAATCGGTGAAAAATCGTTGGACTGTCGCGTAAGCGGGGTGCTATGTTATAG,8 +1,AGCTCTCTGGAGAACGCGAGTATGCGTACCGAAGAAGAAACTTATCAAATACGGAAGTTAAGAATGATAATTGAAGTAAAACATAAGGAAACTTTCAGGTTCGAGGACAGAGATGAACAA,8 +2,TAGAGACATTTGGGGTGCTTTAGGCTGAGATAATACCCATTGAACCTCTGATACAGTTAAGACTGGCGAAGGGAAATGTGAAACGTTTTTT,8 +3,GCGAAAGTTGCAAGAGAGCACCCGCGCACAACCGGCATGCCGGACGTTGCCACGGGGTCGCCGAAGGGGCAAGGCCGCAAGGCTGAAACTCTCAGGCATCAGGATTGCAACCGGACG,8 +4,ATACAAGCCTCGGGGTGGTCGAAAGGCCTGAGATTCACCAGTTGGTGAAGACCCGATGAACCTGATCCAGATAATACTGGCGTAGGGACAGGCGGCAAACCCTTCT,8 +5,TATAATCACTAGGCTAACGGTTCTAGCGGGGATCAGCTGCTAGAAGCAATGGGGAAAGTGCAGTGTAAGTCTGCCGCTGTCCCGCAACTGTAATGAGACTATTGGTCTCTGAGTCAGGACGCCCACCGTTATCCCGTCAAAACAC,8 +6,AAACAAAATAAGAATTTAGGTGCCTCTTTTACAGAGGATAATAGGGAAGTACGGTTAAAATCCGTCGCAGCCCCCGCTACTGTGTGCATGGACAAAACTTCAGTATCCACCATTTGTTTGGGAGAGAGGAAGTAGATTGATGTGTGAGCCAGGAGACCTGCCTATATTCATGAAATTACA,8 +7,CGCTGAGTCATATTCCGAGCAAGGCGCATCTAAGGCAGACGCTATGAGCCCTTGCCTGACGGATTCCGTCACGGGGTATGCCGGGAAACGGGCATGCCTGCTTTTGTAAAGGAGTATATCACATGAAGAT,8 +8,TCTGTGACCGCCGTATATGGTGTCCGAGGCCGTCCATGCGGCCAGGATCGGCGAGGGAACCCGGTGGAAATCCGGGGCTGACGCGCAACGGTAACGAACCCTTTGGTTCGGAGTCCGACTACTCGTCGATCGAGTTGTGACGATC,8 +9,GCTTCCGGCTGGGGGAGCCTGCGGGCTGAGAAAGGGTGCAAGGACCCTGACCCCTAGGGCCGCCGTGCGGCCCTGCACCTGATGCAGATAGTGCTGTCGAAGGGAAGCTGGCAACGTGCGCT,8 +10,CCTAGATTGGCCGGTGACGGTTCTCCTCACGGAGATCAAAAGGGAACGTGGTGCGAGGTTTTCCCGATACCGGGATATTCTCAACGCCGCGGCTGCCCCCGCAACTGTAAGCGGTGAATCTTTCGTCATATGCCACTGGGAATCTCGGTCCTGGGAAGGCGACGCAAGGTAACGACCCGCGAGCCAGGAGACCTGCCGTCAACCGTGGTCACACG,8 +11,GCGCGGGCTGAAATCCCAGGCGGTTGCGCCGCTGTAGCGAGCGACGAAAACTCCATTGAAGCCACTGCTGACACGAGCCGGACAGCGATCCAGCCACGGCGGAAAGGCGGGGTGAGTAGGATGACCTCGGAGCCAGAAGACCTGCTTTTTCGCCCCTCATTATC,8 +12,TCCAACTCATCGGGGAGCAAACCCATCAGGTATTGATGTTATCAATGCCAAAGGGCGCTGAGACCTAATTGTTTAACATTGTTGAGCAGACAGGGACCCGTATTACCTGAACCAGATAATGCTGGCGTAGGAATTGAGTCTGGATATTGG,8 +13,GTCCTGCTAGAGGGAGAGCGATGGCGCAGAGGATGACGTGTTTTCCTATTGTGCCTCCACCGAAGGAGCAAGCGCCCTTGTAAACTCTCAGGTCAATGTACCTCTGGTAAGTG,8 +14,TTAACCGAAACTAGTACCGGTGCCCCTCGCACGAGGGGTGAAACGGGAATGCGGTGCGGGGTCTCGTTGCCCCAATGCCGCGGCTGCCCTCGCAACTGTGGGCGGATCGGAGCGTCCTCGAATGCCACTGGCCGGCTTGGCCGGGAAGGCGGACGCGCCGGATATCCGCGAGCCAGGAGACCGGCCGGTACAAGTGTGCAACTC,8 +15,AGTTATTACTCGTTAAACAGTAACCTGTTGGGAAACTCCGGTGAAACTCCGGGGCTGTGCGGCAACTGTAAGTCATTCACGAGCAAATTGACAAGCCAGAAGGCCAACTGGTTACGCACCACTCTAT,8 +16,TATTTTTGTGCATGCAAAAGTGTCCCGAACGCCTTCCTGCGGGAAGCGTGCGACGGATGAAAAGGGAATGCAGTGAGAATCTGCAACAATACCCATTGCTGTGAGTTCCTGCTCCGGAAGGAGTAAAAGGGGTTTCGCAATCAATTGCCACTGGCTTGCCGAAGTCGGGAAGGCGCGGAACCGGAACGAGTCAGAAGACCTGCTTTGTGCATAGATACCGAT,8 +17,CTAACCCTTCACTTTGGTGGTTTATTGCGCTCACAACTCAATAGCGGCAGTGACTGGGGAAGTCGGTGAAAATCCGACGCTGACCCGCAACCGTGAACATCGCAAGATGTAAGTCGGATTACCACCACCGCCGTATTCCTTACC,8 +18,CTTTATTCACAGGGGAGCCACGTGCTGAGAGGAAGCTTTTGTTTCGACCCTTTCACCTGATGCGGATAATGCCGACGTAGGAAGTGCTTGAATAAACAAT,8 +19,ATAGGAAGGCGTTCTTCAAGCAAATTGCTTTCAAACAGGCGTTTGAAACGGGAAGTCGGGTACGAATCCCGCGCGGTGCCGCCGCTGTAAGGGCGGAGCGTTCGCCACGGCCACTGGAGACGGGAAGGCGGCGAAACGCGAGGATGCCCAAGCCAGAAGACCGGCTTGAGGAAGGCTGATGATC,8 +20,TCTGGAGAGAGGCCCGGCAGGGTCCGCCGAAGGGGAAATCCGGCTTTTTGCGGAAACGCCAGCCGGGTGATGCTCTCAGGCACCAAGACAGA,8 +21,AATTGTATTTTTGCCCAGCTTAAAATGGAATCAGGTGGGAATCCTGAACTGTCGCGCAACTGTGATGAATTTTTCTTCTAAGTCAGGTCTTTTTTAAGTATTAAATGCTTTCGCGATTGAGGCTAAACCTTCATTCATCGAAGCATG,8 +22,AGGTGCTTTTAAAAATAAGTAAATTAAAGGCTGAGAATAAACTCTTTGAACCTGCTTTCATGGTAATGCATGCGTAGGAAGGTATATTTATCTAATT,8 +23,TGGCCGGACAGCGCCGATTTGATCCGGCTTGCGGGCGCTCTAGAAATGCCGCTAAAGAGGTACACCCGATCAATCC,8 +24,CCGCTCTCGACGGGGTGCCGCCCAGGCGGCTGAGAGGAGCTTCGGCTCCGACCCGCTGAACCTGATCCGGATCATGCCGGCGGAGGGATTTCGGGCAGCAGCACCTC,8 +25,CATAAAATAATTTATATGACTCATATAATCTAGAGAATATGGCTTTAGAAGTTTCTACCGTGTCGCCATAAACGACACGACTATGAGTAACAATCCAATACAT,8 +26,GGTGGGGATAGAGGTCGCGGCTTTTATATCATCCTGTACGAGATTCAGAGAAAATCAATGACTCCAGGTAAGGAAAATCCGCCGAAGCTCATATCGCGTCTCTGTGCGATTGGGTTGGGGCTGTACTCGAAAGGGGCAGAACTGTCACGTTTGCATGCCAAGCAAACGTGGTGAGCTATCTTCAGT,8 +27,GATGACCACAAGGGGAGCATTAAAGCTGAGAGTGAGCGGTTTCGTTCTGACCCTTTGAACCTGTTAGTTAACGCTGGCGTAGGGATGTGGCAAAGTCAAATG,8 +28,ATGCGCATTCTCAATTTAGGTGCCTCAAACAAGCAAATTATTATTAACGCTTGTTTATAAGAGGATAATCGGGAAGTTGGTGCCATGTTTATGAAATCCAACGCTGCCCCCGCAACGGTAATAGCGTCACTCTTTAATAATGTCTGACAGCCTAAGCCCGGAGACCGGCCTTAATTATTTATTAGCTA,8 +29,ATATTTCGTCTGGGGAGCTGGGGTGATAGCCCGGCTGAGAGTTAGCAGCTATTTCTGCTATGACCCATAAACCTGATCTGGATAATGCCGGCGTAGGGAGTACATTGGACTTTAAGT,8 +30,CTATCAGCCTATCGCTATGGTGCCTTCAGCCTCCTCTTGCCGGGATGCTGCAAGGCTAAGAGGGAAAACGGTTAAAAGCCGGTGCTGCCCCCGCAACTGTGCGCTGTGAGGTTTTTTGGAAACATGACGCCACTGAATACCCCTTCGGGTGTTCGGGAAGGTATCCATCACACCGTTGACCAGCAAGCCAGGAGACCTGCCATTGCAGAAAAGTCACGG,8 +31,ATAAAAATAATGGAATTAGGTGCTACGGCTTAATAGAGGAAGTCAGGTGAAAGACCTACACAGCGCCCCGCTACTGTAAGACGGATGCCTCATCATATCCACTGGTTTTTTACCGGGAAGGGATGACAAGCAGAGGAAGTCGAGTCAGGAGACTTACCTAATTTTAGTTTTCTTTT,8 +32,CTCCAATGCCCGGGGAGCAACTTGCTGAGAGGTCGCCCCGGCGACGACCCGTGAACCTGATCTGGGTAATTCCAGCGGAGGGATAGAATGAATTTGGAAAA,8 +33,ACAATCAACCCTGTAGTTGATTTACCTCTCAAAAATGTATGAGAGCGTGGGAACTGTGGTGTGATTCCACGGCTGGCGCGCAGCGGTAATGGTGACAACGCAGATTATCCACTGAGCAACCCAGAGATGTTTGAAAAAGCACCGTCTGGAACTTGGGAAGGTTCTGCAGCGGATGACCCGAGCCCGAAGACCGATCAACTGCGGATGTGATCCA,8 +34,TTCCCCAAGGAGCGTTGCAGCGGGCTGGGCGACCTCCCGTCAGGCTTGGGGCTTTTCTCTTGGCAACGACGCTCGCCTGTC,8 +35,AAAATTCATTTATGCAAAGGAAACCCGTGGAATTCGGGTACAGTCCCCGCTACTGTAATTGCAGAATGCTAGTTTATGTCATACCATTTAATC,8 +36,TTAAATATCGATGTGTAAGGTGTCGGACATACGTGTCCGGTGAAAAGGGAAGCAGGTGTGAATCCTGCACGAACTCGTCACCGTATTTTGCAAGCAGGGGAATGATCCACTGGGAAACCGGGAAGGAGACCGATGCGCTGGAGTCAAATCCTGAAGCAATGAGCCGGGAGACCTGCCTTTCGCAGTACAGGAACG,8 +37,CTTACGATAATAGACTGTGGTTTAGGGAATTTGGTGTGAATCCAAGACTGACGCGCAACGGTAAAAGTCCGATACCTGCCACGCATCCAATCGCTTCG,8 +38,ACAAAACCGCATAGTTCTTTTCGTATAATTCCGGGAATTGGCCCGGAAGTCTCTACGAGGTCACCGTAAATGATCTGGCTACGAAAGGAAGAAACGAACGT,8 +39,GCGCAAACCGAGGGGTGGCCCAGGCCTGAGACGCCTGCTCTTCACGACTTTCTTACCTAAGACAGTGAAGCCACGCAGCGAACCCTTTGAACCTGATCCGGGCAATACCGGCGTAGGAATGGTTGTAGCTTTGGCA,8 +40,TACCAGCCGATTGGAGAGAGGCCCTAAGCGCCCGTTCATCATCTGCTACGCTTCAGCAAGCGCGCGTCAGATTATGTAACCGGCCCTTAGCGCCCACCGAAGGGGCAAGTGACATCCGCAAAGCCCGCACACAGGCCGCGCCCGTCATCTAACTCTCAGGTAAAAGGACAAGGGGAGAGGC,8 +41,ACTTTGTAGTAGCTTTTGGGTTCTTCCGCATGCGAAGATGAAAAGGGATAAAGGTGTAAGGCCTTTGCTGTTCCCGCAACTGTAAGTACTGCAACGCTCAATGCTTCAACCACTGTCCGCCTCGGCGTATGGGAAGGTTTGTTGAGTAGTACGAGTCAGGAGACCTGCCTAAGAGATTCATGCTGTG,8 +42,ACCGCTTGTGGGGCTTCGGGTGCCCGGCGCGTCTGCGCCGGGAGAATTGGAAAGTTCGGTCTGAAACCGACGCTGGCCCGCAACCGTGAAAGGGGACGAATCCGCGTAGCCACTGCGGCAGAGCCGTGGGAAGGTGCGGAGGAAAGTGATCCTTAAGGCCGGGACGCCCAGCCCGAAAGCCTGTATCCTCA,8 +43,AGCCCAGCAACTCCTATCCCTACCTCCTCGTGGTACTGGCCGGCTGCGAAAGGCCTGGAAAAGTTTCAGAAAATGGAGTCGCTAAAACCGAAGG,8 +44,TATTGCCGCTTAAAAGGGGAAGCTGGTGAAAATCCAGGACGGTCCCGCCACTGTGACACGAGATGCAACATTTATTCGTGAAGTCAGATTGCCTGCCTGTTTGTACTGCTTGACTCTACGTGGATATAGAGAGGAGTGGATA,8 +45,ATTTTAGTTAGAAATGTTGAAATATGTCCTGTATTTGGGTACCTGGGGCATATGGAGTTATTGGTACAACCGGCTGCCTTTTTT,8 +46,CGCAGAGTAGGAACCTGCGCGTGGGGCGTCATGCCCCGAAGTGCCTGACCAACGGGGAGTTGTGGCAGGACGAAGCGGCCGTAAGGCCCGCGGTGCCGGTTCCGCATCCCACTGCT,8 +47,AATAAATTAAGAAATTTTACAGATATATCGCTGGAAAATGGCCAGCAGTCTCTACCGCGCCCCAAAAGTCGTGACTATCCGTGAATGTACGCTTTTT,8 +48,AATAGGCCTGGGGAGCTGGACATTTGGCCGGCTGAGAGGAGACGCCTCACGTCTCGACCCAATAACCTGATCTGGATAATGCCAGCGTAGGAATCGGCAATGGAGCGATG,8 +49,CATAACTATAGAGGAGCTTAAAGCAAGAGTAGATGATCTATAACAGGCATGTGATCATTGAAAGGGTATTTAAGCCGAAACCTAAACAATAGGCATGTTGTTAGGTTGGTAGTAATTCATAACAGTTTACTACTCTTGGATTTAAATCCTTGGAGCTATAGCTATA,8 +50,CAACTTTAAACATGATAGCCACGTATAATCATAGGAATATGGCCTATAAGTCTCTACCGGGTTACCGTAAATAGCCTGACTACGCGGATGATACAATAAGAC,8 +51,ATGAATCACAAGGGGAGTCCGGCACGGGCTGAGAGGAAGCAGGATGCGCTTCGACCCTCGAACCTGTAAGTTCAAACTTGCGTAGGGATGTGAACCGGATCAATC,8 +52,CGCCCTCCCGCAGGAGAGAGGGGCCCATGCCGCAACCGGCGCCGGGCCTCCGCCGAAGGCGCAAGCTCCCATAATCGCTCAGGCACCAGGAACTGCGGGTTTCGT,8 +53,CCTGTACTGCACGGGTTGGGTATCCGCAAGGATTTAATAGAGAAGCCGGAGCGTTTCCGGCGCGGTCACGCCACCGTATTGGGGAGCGAATTTGCAGAAAAGCGCGATGTTTTCGCCGAAAGGTGAAAATGTGCGTCTTTCGATAGAGCCACTGAAGCTGTTGCTTTGGGAAGGTGCAGAGGAGCTGTGAACCTGAGCCGGGAGAACTGCCCTACCGACAGTCACCGTT,8 +54,CTGGGCCACTCTGGAAAGCGGATTAATGTCCCGCCGACGGTGAAAGCGCTACCCGCGCGAAACTCTCAGGCTAATGACAGAGGGGGAGCC,8 +55,CAGGGTCTTCGGCGCTCCCCGCACGGTATCTGAGTCAATCGGCATCAGGAGCTCGGCTCCAGCCACTGCTTCGCGCTCGCAATGAGCATCGAGGTGGGAAGGCGATGTCGCTATGCCATTCGTGGCCACTCAGGAGCCCGGAGACCGGCTTCGACGAACACGATCGCT,8 +56,TATCTTTATACTGTAGTAGGTTCTTTGTAGTGAAGATGAAAAGGGAATAAGGTGAAAAACCTTTGCTGTACCCGCAACTGTGAGCTGTTGATTTAAGAATCTTTCTTTACACCCTCTATGCCACTGTGGATGATTCCATGGGAAGGCTGGTAAGGAAAAGTGAGCCAGGAGACCTGCCTATTACTATTTTGGGAAC,8 +57,ATGAATGTTACAGGAAAAGGATTGTTGCAATCCGCCGAAGGGGCAAGGGCTTAAAACTCCCATTTAAGTCCAAATCTCTCAGGTATACTGAAAAGTATGA,8 +58,CACATCGACCAGAGGTCGCGCTCAACAAGAGTAGCAATCATTAGCCAGAAAGAGGTGTTAAGTGATTTGCAAAAGGGGAGAGCGCCGAAGTTTGTAAGTTATTCTTTCTAGCTTACAAGCTGGGACTTGGCTTAATAAGCCAAGGACTGTCGCAGCAATTTTGGCAATTTGCTGCGGAGTGCTATCGACATG,8 +59,CTGAACAAAACACTCTAGCCTCTGCGCCTAAGTTGAAAAATAAGGTGCTTTGGCCGCAACGGGTTACTCCGTTGCCAGGGCGAAGGAAGAAATGATCTCGCCTCCCGTATTTGGAAAGGTGTCTATGGCTTTACAAC,8 +60,GATAATCCAAGTCGTCGAGGTTCTCCGGTTCCCATTGATCCGGAGCTAAGAGGGAAGCCGGTGCAAATGCCGGCTCTGCCCCCGCAACTGTGAGCGGCGAGCCGCTGTCCGACGATGTCGCTGAAGCCTGCACGGCTTCGGGAAGGCCGGACAGCAGCGATGACCAGCAAGCCAGGAGACCGGCCCCGACAATATATTGGTCC,8 +61,GAGTTTGAAAATATTAAAGGTTGTGCTATAATCAATTTAGCAGGTGACGGGAAGTCCGGTGAAAATCCGGCGCTGCCCCCGCAACTGTGAGCTGGACGAAAGCCCTTTCGTGCCACTACCGAGCATCTAAAGTATTTTAGGTGCATGGTGGGAAGGCGGGCGAGTAGGATGAAGGCGAGCCAGGAGACCGGCCTGCAAAAAAGACCTACGG,8 +62,AACGCAGAAAAAGGGAGCCTGCAAAAGCAGGCTGAGAGTGGGCTGTTTGCCCAGACCTGTGACCTGATTTGGACAATGCCAACGTAGGGAGATACTGCTGCTTCAAAG,8 +63,TGCCCATCCCCGGGGAGCCAGACCGGCTGAGAGGGGTGGGATAGTCTCCACCCGACCCGTTGAACCTGATCCCGTTAGCGCGGGCGGAGGGAAGGGTGGTCGTAACGCC,8 +64,TATTACCTTAAAGCGCCGGGACCCCTTTTGGGGTTGACGAGGTGGAGGTTTATCGAGGTATCGGCGGATGCCTCCCGGCTTTTCGCGGCCGTTAAAGGCGTCGGACAAAACCGAAGAGCAATCTTCGGGACAAAGGGCGCCGGGCGAATAA,8 +65,CGAATAGGCCACTGCCCGGCTGCTATCCGGTGAGCGTGTTACAATGCGCTACCGTTGTCAACAGCCAGGCGGGAAGGCGGAGGGGGCGATGATCCGCAAGCCAGGAGACCTGCCACTGCAATACAGCTCAAT,8 +66,ATAATCGTAAAGCAATAAGGTCTCAATTTAAAGATTGAGTGAAAAGGGAAGTCTGGTGAAAAACCAGCGCGGTCCCGCCACTGTAAGCGCTTTGTGCGTAAGTCAGGAGACCTGCCTTATTGTTAGCTGTGAAA,8 diff --git a/tests/data/datasets/rna/rnaswitches.csv b/tests/data/datasets/rna/rnaswitches.csv new file mode 100644 index 00000000..dd9c2d4b --- /dev/null +++ b/tests/data/datasets/rna/rnaswitches.csv @@ -0,0 +1,68 @@ +,sequence,ON,OFF,ON_OFF +0,GGGAAAATATTATTTGGTGTCTATAACGGAACAAACCAAACACACAAACGCACTGTTCCGTTATAGACACCAAATAATATTTTAACAGAGGAGAAAAATAATGTTTGGTGTCAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,1.0,0.4620369374752044,0.9885286688804626 +1,GGGTTCAAGAACATGTGCAAACTCAAGCCCCTCAACCAAACACACAAACGCACGAGGGGCTTGAGTTTGCACATGTTCTTGAAAACAGAGGAGATTCAAGATGATGTGCAAAAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.675412654876709,0.8580208420753479,0.4585525095462799 +2,GGGCCTGGGTGTGACAAAAGGTTTACAGAATATAACCAAACACACAAACGCACATATTCTGTAAACCTTTTGTCACACCCAGGAACAGAGGAGACCTGGGATGGACAAAAGGAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.2197960019111633,0.1567209362983703,0.3681486248970032 +3,GGGAAGTTCTGAACGTCGAAAAGAAAAGTCTCGAACCAAACACACAAACGCACCGAGACTTTTCTTTTCGACGTTCAGAACTTAACAGAGGAGAAAGTTCATGACGTCGAAAAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.6605789661407471,0.3252573609352112,0.8617132306098938 +4,GGGTTCAAGGTCAATGATGGGAGAACGAGAAGGAACCAAACACACAAACGCACCCTTCTCGTTCTCCCATCATTGACCTTGAAAACAGAGGAGATTCAAGATGAATGATGGGAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.5832574367523193,0.9439947009086608,0.0462763123214244 +5,GGGGCCATCCATCTCCTGACAAACAAATCTTGTAACCAAACACACAAACGCACACAAGATTTGTTTGTCAGGAGATGGATGGCAACAGAGGAGAGCCATCATGCTCCTGACAAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.0,0.2370066344738006,0.0963056087493896 +6,GGGTTGTCTTTTATAGTATATACGTTATTGTCAAACCAAACACACAAACGCACTGACAATAACGTATATACTATAAAAGACAAAACAGAGGAGATTGTCTATGATAGTATATAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.560741662979126,0.7762762904167175,0.4170002937316894 +7,GGGACGCCTTCTGTCTGAGCACCCAGAATTAGAAACCAAACACACAAACGCACTCTAATTCTGGGTGCTCAGACAGAAGGCGTAACAGAGGAGAACGCCTATGGTCTGAGCAAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.1110742762684822,0.2085105180740356,0.2309499531984329 +8,GGGAGCAGTACATGTTGCAAGTGGATTCATAGAAACCAAACACACAAACGCACTCTATGAATCCACTTGCAACATGTACTGCTAACAGAGGAGAAGCAGTATGTGTTGCAAGAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.2960817813873291,0.3393393456935882,0.4432339668273926 +9,GGGACAAAAATTTAAAATGAGTATCCGTATAAAAACCAAACACACAAACGCACTTTATACGGATACTCATTTTAAATTTTTGTAACAGAGGAGAACAAAAATGTAAAATGAGAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.1700725853443145,0.2634412050247192,0.2821010649204254 +10,GGGCCGGTACCAATCAAAGTCTGCCCTCGCTCAAACCAAACACACAAACGCACTGAGCGAGGGCAGACTTTGATTGGTACCGGAACAGAGGAGACCGGTAATGATCAAAGTCAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.3607242405414581,0.0,0.5826334357261658 +11,GGGTATTCCTGGAGATATCCTATCTATTATTGAAACCAAACACACAAACGCACTCAATAATAGATAGGATATCTCCAGGAATAAACAGAGGAGATATTCCATGAGATATCCTAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.7569754719734192,0.5706626176834106,0.8431193232536316 +12,GGGTAGAGAAATATCTATAATTTGTGAGTTTTCAACCAAACACACAAACGCACGAAAACTCACAAATTATAGATATTTCTCTAAACAGAGGAGATAGAGAATGATCTATAATAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.8380022644996643,0.5865103006362915,0.9328819513320924 +13,GGGAACCCTTGGCCACAGGATAAAATATTCAAAAACCAAACACACAAACGCACTTTGAATATTTTATCCTGTGGCCAAGGGTTAACAGAGGAGAAACCCTATGCCACAGGATAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.2963091135025024,0.4075919091701507,0.4223058819770813 +14,GGGAATGGTTCAATGTAAGGAACGGATATGGTTAACCAAACACACAAACGCACAACCATATCCGTTCCTTACATTGAACCATTAACAGAGGAGAAATGGTATGATGTAAGGAAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.0593205615878105,0.0,0.1957155168056488 +15,GGGTGGAACTAAAAAACGAAATATATTTGTAGTAACCAAACACACAAACGCACACTACAAATATATTTCGTTTTTTAGTTCCAAACAGAGGAGATGGAACATGAAAACGAAAAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.9099984169006348,0.6977009177207947,0.9349337816238404 +16,GGGAAGAAGCCGAAAAATCTCCAAAAGGAAAGGAACCAAACACACAAACGCACCCTTTCCTTTTGGAGATTTTTCGGCTTCTTAACAGAGGAGAAAGAAGATGAAAAATCTCAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,1.0,0.6716539859771729,0.9519994258880616 +17,GGGGCTTGGATGCGAGACATACTACCTAAGGTTAACCAAACACACAAACGCACAACCTTAGGTAGTATGTCTCGCATCCAAGCAACAGAGGAGAGCTTGGATGCGAGACATAAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.3445769250392914,0.5669493079185486,0.3838712871074676 +18,GGGTCTGTTCTAGGAGAGTAATGGGTTATTGTGAACCAAACACACAAACGCACCACAATAACCCATTACTCTCCTAGAACAGAAACAGAGGAGATCTGTTATGGGAGAGTAAAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.1029859483242034,0.1934356093406677,0.2238156646490097 +19,GGGGACATTTCTGAGAATGACGACTTACGTCAAAACCAAACACACAAACGCACTTGACGTAAGTCGTCATTCTCAGAAATGTCAACAGAGGAGAGACATTATGGAGAATGACAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,1.0,0.4565882086753845,0.9891074299812316 +20,GGGACTGTTTAAATTTGTTCCATCTGCAGAGGAAACCAAACACACAAACGCACTCCTCTGCAGATGGAACAAATTTAAACAGTAACAGAGGAGAACTGTTATGATTTGTTCCAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.6259385943412781,0.2437523156404495,0.8435568809509277 +21,GGGATCTAGTTGGTTCTCTAGATACTCTACCATAACCAAACACACAAACGCACATGGTAGAGTATCTAGAGAACCAACTAGATAACAGAGGAGAATCTAGATGGTTCTCTAGAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.0331300199031829,0.0,0.1660409271717071 +22,GGGTGCCCAGTTGCACTTACTACATGCCCGAGTAACCAAACACACAAACGCACACTCGGGCATGTAGTAAGTGCAACTGGGCAAACAGAGGAGATGCCCAATGGCACTTACTAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.2669273912906647,0.4750176072120666,0.3426825702190399 +23,GGGAACGAATGTGGGAAGAGCTTCAGTCAGCATAACCAAACACACAAACGCACATGCTGACTGAAGCTCTTCCCACATTCGTTAACAGAGGAGAAACGAAATGGGGAAGAGCAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,1.0,0.0,1.0 +24,GGGAAGACTATGGATTTGGAATTTTCACGACAAAACCAAACACACAAACGCACTTGTCGTGAAAATTCCAAATCCATAGTCTTAACAGAGGAGAAAGACTATGGATTTGGAAAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.856990396976471,0.8083176612854004,0.8311961889266968 +25,GGGAAATACAGATACTCGCCAAGTACTCAGCTGAACCAAACACACAAACGCACCAGCTGAGTACTTGGCGAGTATCTGTATTTAACAGAGGAGAAAATACATGTACTCGCCAAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.3623946905136108,0.4536762833595276,0.4898713827133178 +26,GGGCTAATTCCTTTGCTATTTCTATGGACTTATAACCAAACACACAAACGCACATAAGTCCATAGAAATAGCAAAGGAATTAGAACAGAGGAGACTAATTATGTTGCTATTTAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.7369530200958252,0.7762762904167175,0.6373268365859985 +27,GGGCTATGAAGAACATTAAGTGGATTTTTCCTTAACCAAACACACAAACGCACAAGGAAAAATCCACTTAATGTTCTTCATAGAACAGAGGAGACTATGAATGACATTAAGTAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.6791964769363403,0.578372061252594,0.7810665369033813 +28,GGGTCAATTGAAAAAATAAAGAAAGGGAAAACTAACCAAACACACAAACGCACAGTTTTCCCTTTCTTTATTTTTTCAATTGAAACAGAGGAGATCAATTATGAAAATAAAGAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.8527029752731323,0.9927114844322203,0.0520883239805698 +29,GGGTTCTACATCCATCTTCTCTAGAAAAACTATAACCAAACACACAAACGCACATAGTTTTTCTAGAGAAGATGGATGTAGAAAACAGAGGAGATTCTACATGCATCTTCTCAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.0448745563626289,0.3973135650157928,0.0944852977991104 +30,GGGTCCCAGTTTGCCTAAACATAATGTTACCATAACCAAACACACAAACGCACATGGTAACATTATGTTTAGGCAAACTGGGAAACAGAGGAGATCCCAGATGGCCTAAACAAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.1753646731376648,0.2347807139158249,0.2961656749248504 +31,GGGGGTAGTTGATGGATCTGTGATGTATGCAATAACCAAACACACAAACGCACATTGCATACATCACAGATCCATCAACTACCAACAGAGGAGAGGTAGTATGTGGATCTGTAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.3843843936920166,0.1509048640727996,0.5927194356918335 +32,GGGTGCTTACCCAACTCAAAGAAGAAAGAGCACAACCAAACACACAAACGCACGTGCTCTTTCTTCTTTGAGTTGGGTAAGCAAACAGAGGAGATGCTTAATGAACTCAAAGAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.7450301051139832,0.8471434116363525,0.5600634813308716 +33,GGGAAAATATATAAAGAAATAAATGTCGAGCTTAACCAAACACACAAACGCACAAGCTCGACATTTATTTCTTTATATATTTTAACAGAGGAGAAAAATAATGAAAGAAATAAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,1.0,0.993942141532898,0.1346538066864013 +34,GGGAGCTATTACACACACTGGACTGGTAGTCTGAACCAAACACACAAACGCACCAGACTACCAGTCCAGTGTGTGTAATAGCTAACAGAGGAGAAGCTATATGACACACTGGAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.835422158241272,0.7213507294654846,0.839072048664093 +35,GGGATCCCAACCCAAACTTTTTACAACAGGAGGAACCAAACACACAAACGCACCCTCCTGTTGTAAAAAGTTTGGGTTGGGATAACAGAGGAGAATCCCAATGCAAACTTTTAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.5069350600242615,0.628690242767334,0.4989488124847412 +36,GGGAATTTCGTTAAAACATTCAAATGTTGTCAAAACCAAACACACAAACGCACTTGACAACATTTGAATGTTTTAACGAAATTAACAGAGGAGAAATTTCATGAAAACATTCAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,1.0,0.965579867362976,0.5386534333229065 +37,GGGAAGCTAAAAATAGAGCAAAAAGGAAACCTCAACCAAACACACAAACGCACGAGGTTTCCTTTTTGCTCTATTTTTAGCTTAACAGAGGAGAAAGCTAATGATAGAGCAAAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,1.0,0.9529529809951782,0.6071071028709412 +38,GGGTTCTTTCTCCAAAGTGCCGGGCACGGTGTCAACCAAACACACAAACGCACGACACCGTGCCCGGCACTTTGGAGAAAGAAAACAGAGGAGATTCTTTATGCAAAGTGCCAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.5656057000160217,0.6893579959869385,0.5194949507713318 +39,GGGGAAACAGGAAACGCTGTATATGGAAAAGTTAACCAAACACACAAACGCACAACTTTTCCATATACAGCGTTTCCTGTTTCAACAGAGGAGAGAAACAATGAACGCTGTAAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.8108108043670654,0.6653422117233276,0.8508844971656799 +40,GGGAGGAATGTCCTGGTTCTCACAAATCCTCATAACCAAACACACAAACGCACATGAGGATTTGTGAGAACCAGGACATTCCTAACAGAGGAGAAGGAATATGCTGGTTCTCAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.4706798791885376,0.4164164066314697,0.6361076831817627 +41,GGGTTCGAAGACTTCACCACCATGCAGAAGCAGAACCAAACACACAAACGCACCTGCTTCTGCATGGTGGTGAAGTCTTCGAAAACAGAGGAGATTCGAAATGTTCACCACCAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.185580626130104,0.5595619678497314,0.0992536693811416 +42,GGGAAAAAGAAAACAAAGAACTTACCCTTGAAAAACCAAACACACAAACGCACTTTCAAGGGTAAGTTCTTTGTTTTCTTTTTAACAGAGGAGAAAAAAGATGACAAAGAACAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,1.0,0.8885096311569214,0.7806172966957092 +43,GGGGAGGAAAAAGTGAAAACCTTGAAAGCTCAGAACCAAACACACAAACGCACCTGAGCTTTCAAGGTTTTCACTTTTTCCTCAACAGAGGAGAGAGGAAATGGTGAAAACCAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.3442373871803283,0.2012012004852295,0.5398310422897339 +44,GGGAAATGATGAAGATGGTAACCTGTCTGAAGAAACCAAACACACAAACGCACTCTTCAGACAGGTTACCATCTTCATCATTTAACAGAGGAGAAAATGAATGAGATGGTAAAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.3273273408412933,0.2706179022789001,0.5019986629486084 +45,GGGTTTCCATTTTTTAATGGGGTTGTGGATGAGAACCAAACACACAAACGCACCTCATCCACAACCCCATTAAAAAATGGAAAAACAGAGGAGATTTCCAATGTTTAATGGGAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.3680118918418884,0.6296296119689941,0.3493013978004455 +46,GGGTTACCACGTGACACTTCCTCCTGTATGACAAACCAAACACACAAACGCACTGTCATACAGGAGGAAGTGTCACGTGGTAAAACAGAGGAGATTACCAATGGACACTTCCAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.6726977825164795,0.6418948769569397,0.7238343358039856 +47,GGGATACTGGTCTAATTAAGGGAGCCGAGTCTAAACCAAACACACAAACGCACTAGACTCGGCTCCCTTAATTAGACCAGTATAACAGAGGAGAATACTGATGTAATTAAGGAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.3449516594409942,0.7762762904167175,0.0828780606389045 +48,GGGCTACGTATTTGGCAGAGTTATCCATTATTTAACCAAACACACAAACGCACAAATAATGGATAACTCTGCCAAATACGTAGAACAGAGGAGACTACGTATGTGGCAGAGTAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.1729362308979034,0.3563250005245209,0.2619076371192932 +49,GGGAAACACGTGTGCAAACTGAACCTGTAACATAACCAAACACACAAACGCACATGTTACAGGTTCAGTTTGCACACGTGTTTAACAGAGGAGAAAACACATGTGCAAACTGAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.684684693813324,0.8724708557128906,0.4269039034843445 +50,GGGATGTACACCAAAAGCAGCCACCTCAAGGCCAACCAAACACACAAACGCACGGCCTTGAGGTGGCTGCTTTTGGTGTACATAACAGAGGAGAATGTACATGAAAAGCAGCAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,1.0,0.9893017411231996,0.2314978986978531 +51,GGGAATTGAGAATAAGTATGATCTGGAACCCCGAACCAAACACACAAACGCACCGGGGTTCCAGATCATACTTATTCTCAATTAACAGAGGAGAAATTGAATGTAAGTATGAAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.0678510591387748,0.2376275658607483,0.173897847533226 +52,GGGATAACTGAACGATGGTTTTCACATTTAACAAACCAAACACACAAACGCACTGTTAAATGTGAAAACCATCGTTCAGTTATAACAGAGGAGAATAACTATGCGATGGTTTAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,1.0,0.4796638488769531,0.987185537815094 +53,GGGTTTATTATATTCAGTAACCAAATACATATTAACCAAACACACAAACGCACAATATGTATTTGGTTACTGAATATAATAAAAACAGAGGAGATTTATTATGTTCAGTAACAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.3053889572620392,0.8195104002952576,0.0507906228303909 +54,GGGTAGCTAAAGAAAAAGCACTGAGGCCTGTGGAACCAAACACACAAACGCACCCACAGGCCTCAGTGCTTTTTCTTTAGCTAAACAGAGGAGATAGCTAATGAAAAAGCACAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,1.0,0.7457597851753235,0.9204674363136292 +55,GGGTTCAGGAAAAAATTTATTTTGCACTTCAACAACCAAACACACAAACGCACGTTGAAGTGCAAAATAAATTTTTTCCTGAAAACAGAGGAGATTCAGGATGAAATTTATTAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.6301235556602478,0.672412097454071,0.6474031805992126 +56,GGGGCCTGCCCCAGCAGCAACCACAGCAGCAACAACCAAACACACAAACGCACGTTGCTGCTGTGGTTGCTGCTGGGGCAGGCAACAGAGGAGAGCCTGCATGAGCAGCAACAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.3843843936920166,0.0,0.6071071028709412 +57,GGGGCCAAATTATGATGATGAGATTGCTTATGGAACCAAACACACAAACGCACCCATAAGCAATCTCATCATCATAATTTGGCAACAGAGGAGAGCCAAAATGTGATGATGAAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.0695280656218528,0.3513513505458832,0.13991679251194 +58,GGGGAAATTCTTCTTCAGTAAAATTAAGGAAGCAACCAAACACACAAACGCACGCTTCCTTAATTTTACTGAAGAAGAATTTCAACAGAGGAGAGAAATTATGCTTCAGTAAAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.0467082634568214,0.2872872948646545,0.1381275653839111 +59,GGGAACGAGATGGGTTGGCTAGATAAGACCAAGAACCAAACACACAAACGCACCTTGGTCTTATCTAGCCAACCCATCTCGTTAACAGAGGAGAAACGAGATGGGTTGGCTAAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.4714882969856262,0.6811115145683289,0.3834249675273895 +60,GGGTGGATGGACAGCACCAAGGCCACAAGGTATAACCAAACACACAAACGCACATACCTTGTGGCCTTGGTGCTGTCCATCCAAACAGAGGAGATGGATGATGAGCACCAAGAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.5529120564460754,0.0,0.7957950830459595 +61,GGGGTTGTGAACAAGGTGTGAAGAGCCTATTGAAACCAAACACACAAACGCACTCAATAGGCTCTTCACACCTTGTTCACAACAACAGAGGAGAGTTGTGATGAAGGTGTGAAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.1011011004447937,0.1536536514759063,0.2300414443016052 +62,GGGTTTTCACCCTTCTGTTCGACAATGAATGGTAACCAAACACACAAACGCACACCATTCATTGTCGAACAGAAGGGTGAAAAAACAGAGGAGATTTTCAATGTTCTGTTCGAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.4014644920825958,0.7299181222915649,0.1940822601318359 +63,GGGAAGAGTCTTAGTGAGAACCCTGCTAACCGCAACCAAACACACAAACGCACGCGGTTAGCAGGGTTCTCACTAAGACTCTTAACAGAGGAGAAAGAGTATGAGTGAGAACAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,1.0,0.7762762904167175,0.9114114046096802 +64,GGGTTTATGTGTGAAACATGTGGCAAGAGTTTTAACCAAACACACAAACGCACAAAACTCTTGCCACATGTTTCACACATAAAAACAGAGGAGATTTATGATGGAAACATGTAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.9011594653129578,0.7411996126174927,0.8974321484565735 +65,GGGTCCATGGACAACACTCCCCACACGCCAACCAACCAAACACACAAACGCACGGTTGGCGTGTGGGGAGTGTTGTCCATGGAAACAGAGGAGATCCATGATGAACACTCCCAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.4209796488285064,0.0,0.6409544944763184 +66,GGGATCCAGACTCCTATGCTTTCCCAAGAACAGAACCAAACACACAAACGCACCTGTTCTTGGGAAAGCATAGGAGTCTGGATAACAGAGGAGAATCCAGATGCCTATGCTTAACCTGGCGGCAGCGCAAAAGATGCGTAAAGGAGAA,0.5878926515579224,0.1944195181131363,0.8202183246612549 diff --git a/tests/data/test_dataset.py b/tests/data/test_dataset.py new file mode 100644 index 00000000..d13ed261 --- /dev/null +++ b/tests/data/test_dataset.py @@ -0,0 +1,18 @@ +import os + +import danling as dl +import pytest + +from multimolecule import PandasDataset + + +class TestPandasDataset: + + @pytest.mark.parametrize("preprocess", [True, False]) + @pytest.mark.parametrize("file", ["rna/5utr.csv", "rna/modification.json", "rna/ncrna.csv", "rna/rnaswitches.csv"]) + def test_constructor(self, file: str, preprocess: bool): + omics = file.split("/")[0] + file = os.path.join("tests", "data", "datasets", file) + dataset = PandasDataset(file, split="train", pretrained=f"multimolecule/{omics}", preprocess=preprocess) + elem = dataset[0] + assert isinstance(elem["input_ids"], dl.PNTensor)