Skip to content

Commit

Permalink
add Dataset
Browse files Browse the repository at this point in the history
Signed-off-by: Zhiyuan Chen <[email protected]>
  • Loading branch information
ZhiyuanChen committed May 3, 2024
1 parent c4ce48c commit ec7661e
Show file tree
Hide file tree
Showing 10 changed files with 403 additions and 2 deletions.
8 changes: 6 additions & 2 deletions multimolecule/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from . import models, tokenisers
from . import data, models, tokenisers
from .data import Dataset, PandasDataset
from .downstream.crispr_off_target import (
RnaBertForCrisprOffTarget,
RnaFmForCrisprOffTarget,
Expand Down Expand Up @@ -54,9 +55,12 @@
from .tokenisers import RnaTokenizer

__all__ = [
"models",
"data",
"Dataset",
"PandasDataset",
"tokenisers",
"RnaTokenizer",
"models",
"RnaBertConfig",
"RnaBertModel",
"RnaBertForMaskedLM",
Expand Down
4 changes: 4 additions & 0 deletions multimolecule/data/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .dataset import Dataset, PandasDataset
from .utils import no_collate

__all__ = ["Dataset", "PandasDataset", "no_collate"]
166 changes: 166 additions & 0 deletions multimolecule/data/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
from __future__ import annotations

from collections import OrderedDict
from collections.abc import Mapping, Sequence

import danling as dl
import datasets
import torch
from chanfig import FlatDict
from danling import NestedTensor
from pandas import DataFrame
from tokenizers import Tokenizer
from torch import Tensor
from transformers import AutoTokenizer, PreTrainedTokenizerBase


class Dataset(datasets.Dataset):
    """A ``datasets.Dataset`` that tokenizes string columns and yields input/target pairs.

    Call :meth:`post` after the underlying dataset has been constructed; it
    resolves the feature/label columns, sets up the tokenizer and installs the
    transform applied when items are fetched.

    Items are returned by :meth:`__getitem__` as an ``OrderedDict`` with two
    ``FlatDict`` entries: ``input`` (the feature columns) and ``target`` (the
    label columns).
    """

    # Feature columns followed by label columns.
    data_cols: Sequence
    # Columns exposed under the ``input`` entry of an item.
    feature_cols: Sequence
    # Columns exposed under the ``target`` entry of an item.
    label_cols: Sequence
    # Tokenizer applied to every column listed in ``sequence_cols``.
    tokenizer: PreTrainedTokenizerBase | Tokenizer
    # Columns whose feature dtype is "string"; these are the ones tokenized.
    sequence_cols: Sequence
    # Whether the single sequence column has been renamed to "input_ids".
    rename_sequence: bool
    # Whether tokenization is done once up front (True) or on every access (False).
    preprocess: bool

    def post(
        self,
        tokenizer: Tokenizer | PreTrainedTokenizerBase | None = None,
        pretrained: str | None = None,
        feature_cols: Sequence | None = None,
        label_cols: Sequence | None = None,
        preprocess: bool = True,
        rename_sequence: bool | None = None,
    ):
        """Finish initialisation: resolve columns, tokenizer and transform.

        Args:
            tokenizer: Tokenizer to use. When ``None``, one is loaded with
                ``AutoTokenizer.from_pretrained(pretrained)``.
            pretrained: Name or path passed to ``AutoTokenizer.from_pretrained``
                when ``tokenizer`` is not given.
            feature_cols: Columns to expose as inputs. Defaults to every column
                that is not a label column.
            label_cols: Columns to expose as targets. Defaults to every column
                not in ``feature_cols`` when that is given, else ``["label"]``.
            preprocess: If true, tokenize the whole dataset now via ``map`` and
                only convert to tensors on access; otherwise tokenize on access.
            rename_sequence: Whether to rename the first sequence column to
                ``"input_ids"``. Defaults to true only when there is exactly
                one sequence column.

        Raises:
            ValueError: If a requested feature or label column is not in the
                dataset, or if neither ``tokenizer`` nor ``pretrained`` is given.
        """
        self.sequence_cols = [name for name, feature in self.features.items() if feature.dtype == "string"]

        data_cols = list(self._info.features.keys())
        if label_cols is None:
            label_cols = [i for i in data_cols if i not in feature_cols] if feature_cols is not None else ["label"]
        if feature_cols is None:
            feature_cols = [i for i in data_cols if i not in label_cols]
        missing_feature_cols = set(feature_cols).difference(data_cols)
        if missing_feature_cols:
            raise ValueError(f"{missing_feature_cols} are specified in feature_cols, but not found in dataset.")
        missing_label_cols = set(label_cols).difference(data_cols)
        if missing_label_cols:
            raise ValueError(f"{missing_label_cols} are specified in label_cols, but not found in dataset.")
        self.feature_cols = list(feature_cols)
        self.label_cols = list(label_cols)
        self.data_cols = self.feature_cols + self.label_cols

        if tokenizer is None:
            # Validate *before* loading: AutoTokenizer.from_pretrained is not
            # expected to return None, so checking its result afterwards would
            # never surface this error when both arguments are missing.
            if pretrained is None:
                raise ValueError("Either tokenizer or pretrained must be specified")
            tokenizer = AutoTokenizer.from_pretrained(pretrained)
        self.tokenizer = tokenizer

        self.preprocess = preprocess
        if self.preprocess:
            # Tokenize once, then only build tensors when items are fetched.
            self.update(self.map(self.tokenization))
            self.set_transform(self.torch_transform)
        else:
            # Keep raw strings stored and tokenize every time items are fetched.
            self.set_transform(self.tokenize_transform)

        if rename_sequence is None:
            rename_sequence = len(self.sequence_cols) == 1
        self.rename_sequence = rename_sequence
        if self.rename_sequence:
            # NOTE(review): only the first sequence column is renamed and
            # ``sequence_cols`` is reduced to it; confirm this is intended when
            # rename_sequence=True is forced on a multi-sequence dataset.
            sequence_col = self.sequence_cols[0]
            self.update(self.rename_column(sequence_col, "input_ids"))
            self.sequence_cols = ("input_ids",)
            self.feature_cols = ["input_ids" if i == sequence_col else i for i in self.feature_cols]
            self.label_cols = ["input_ids" if i == sequence_col else i for i in self.label_cols]
            self.data_cols = ["input_ids" if i == sequence_col else i for i in self.data_cols]

    def update(self, dataset: datasets.Dataset):
        """Adopt the internal state of ``dataset`` in place.

        ``map`` and ``rename_column`` results are passed here so that this
        instance takes over their format columns, data, info and fingerprint
        instead of being replaced by a new object.
        """
        # pylint: disable=W0212
        # Private attributes are copied directly because the results of
        # ``map``/``rename_column`` are separate dataset objects.
        self._format_columns = dataset._format_columns
        self._data = dataset._data
        self._info = dataset._info
        self._fingerprint = dataset._fingerprint

    def tokenize(self, string: str) -> Tensor:
        """Return the ``input_ids`` entry produced by the tokenizer for ``string``."""
        return self.tokenizer(string, return_attention_mask=False)["input_ids"]

    def tokenization(self, data: Mapping[str, str]) -> Mapping[str, Tensor]:
        """Tokenize every sequence column of ``data``; used as the ``map`` function."""
        return {col: self.tokenize(data[col]) for col in self.sequence_cols}

    def torch_transform(self, batch: Mapping) -> Mapping:
        """Convert an already-tokenized batch to tensors.

        Sequence columns become a ``PNTensor`` when the batch holds a single
        entry and a ``NestedTensor`` otherwise, both cast to ``long``; every
        other column goes through ``torch.tensor``.
        """
        return {
            k: (
                (dl.PNTensor(v) if len(v) == 1 else NestedTensor(v)).long()
                if k in self.sequence_cols
                else torch.tensor(v)
            )
            for k, v in batch.items()
        }

    def tokenize_transform(self, batch: Mapping) -> Mapping:
        """Tokenize a raw batch and convert it to tensors.

        Same output layout as :meth:`torch_transform`, but sequence columns are
        tokenized here first because they are still stored as strings.
        """
        return {
            k: (
                (dl.PNTensor(self.tokenize(v)) if len(v) == 1 else NestedTensor(self.tokenize(v))).long()
                if k in self.sequence_cols
                else torch.tensor(v)
            )
            for k, v in batch.items()
        }

    def __getitem__(self, key: int | slice | str | Sequence[int]) -> OrderedDict:
        """Fetch ``key`` and split the result into ``input`` and ``target`` entries."""
        batch = self._getitem(key)
        # Local names avoid shadowing the ``input`` builtin; the emitted keys
        # remain "input" and "target".
        inputs = FlatDict({col: batch[col] for col in self.feature_cols})
        targets = FlatDict({col: batch[col] for col in self.label_cols})
        return OrderedDict(input=inputs, target=targets)

    def __getitems__(self, keys: Sequence) -> Sequence:  # type: ignore[return-value]
        """Fetch several keys at once as a single batch via :meth:`__getitem__`."""
        return self.__getitem__(keys)

# def __getitems__(self, keys: Sequence) -> Sequence:
# # I have NO idea why they want to de-collate batches and then re-collate.
# batch = self._getitem(keys)
# return [
# OrderedDict(
# input={
# col: batch[col][index] if not isinstance(batch[col], NestedTensor) else batch[col]._storage[index]
# for col in self.feature_cols
# },
# target={
# col: batch[col][index] if not isinstance(batch[col], NestedTensor) else batch[col]._storage[index]
# for col in self.label_cols
# },
# )
# for index in range(len(batch[next(iter(batch))]))
# ]


class PandasDataset(Dataset):
    """Dataset built from a pandas ``DataFrame``, a path to one, or a ``dict``."""

    def __init__(
        self,
        dataframe: DataFrame | str,
        split: str,
        tokenizer: Tokenizer | PreTrainedTokenizerBase | None = None,
        pretrained: str | None = None,
        feature_cols: Sequence | None = None,
        label_cols: Sequence | None = None,
        preprocess: bool = True,
        rename_sequence: bool | None = None,
    ):
        """Load the data into an in-memory Arrow table and run ``post``.

        Args:
            dataframe: A ``DataFrame``, a string handed to ``dl.load_pandas``,
                or a ``dict`` converted with ``DataFrame.from_dict``.
            split: Split name forwarded to the parent constructor.
            tokenizer: Forwarded to ``post``.
            pretrained: Forwarded to ``post``.
            feature_cols: Forwarded to ``post``.
            label_cols: Forwarded to ``post``.
            preprocess: Forwarded to ``post``.
            rename_sequence: Forwarded to ``post``.
        """
        # Normalise the accepted input kinds to a DataFrame.
        if isinstance(dataframe, str):
            dataframe = dl.load_pandas(dataframe)
        if isinstance(dataframe, dict):
            dataframe = DataFrame.from_dict(dataframe)

        # Drop columns whose name starts with "Unnamed".
        unnamed_mask = dataframe.columns.str.contains("^Unnamed")
        dataframe = dataframe.loc[:, ~unnamed_mask]

        arrow_table = datasets.table.InMemoryTable.from_pandas(dataframe, preserve_index=False)
        super().__init__(arrow_table, split=split)

        post_options = {
            "tokenizer": tokenizer,
            "pretrained": pretrained,
            "feature_cols": feature_cols,
            "label_cols": label_cols,
            "preprocess": preprocess,
            "rename_sequence": rename_sequence,
        }
        self.post(**post_options)
2 changes: 2 additions & 0 deletions multimolecule/data/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
def no_collate(batch):
    """Return ``batch`` unchanged; a collate function that does no collation."""
    return batch
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ dynamic = [
dependencies = [
"chanfig>=0.0.99",
"danling",
"datasets",
"tokenizers",
"transformers",
]
[project.urls]
Expand Down
68 changes: 68 additions & 0 deletions tests/data/datasets/rna/5utr.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
,seq,label
0,ACCAACATGTAATTTCCACTCTTGA,-1.7812239923226838
1,TGGTAAAATCTAGGGTTTTTTATAA,-0.275642799200671
2,CAAAAAGTAGACGCAACATGAAAAA,-1.1482258696972798
3,TGGCCTCGTGGATAGGACATTTGGA,-1.163470221256106
4,TAATCGGTTCTAAATACGATTAGTG,0.629766441913879
5,TAAAGAAGAGGTTGATGAGAAACCG,-0.0566107803048019
6,AAGCGGTGAATAACACACAGTAAAG,0.4379278257655004
7,ATCTCTCTAGTACAGATTGTCAATA,0.6259435716867096
8,AGACAGCTAAAACCCTACAAAATAA,0.4289960198091346
9,CCTTCGACGACCCACGTCCGCCTTA,0.0669630515711629
10,CGTTGATCATGGATACTTTTTTACA,-1.4310155058894878
11,GTACGCAAACCATCTCTCGATTTCT,0.3852448242284625
12,GTTACCCCCTACTCCAGCTCATACT,0.1064178675873167
13,TCCAATCTTTTGCACCACCCCTAGG,0.1675831224155592
14,CTCCCTCAACAGGTGCCTCACGCTG,0.4482936797119086
15,AGTAATGAGTTTCGGCATTTCAAAG,0.4487089779004714
16,AGGATTGTGTCGCCAGTTCCACTGA,0.2266517419179321
17,TAATATCATATAGTTCTTCTCCCCT,0.0910128870657181
18,TAGAATCGGAAGGAATAGGATTCTA,0.6830431344740635
19,GATGCTTGCACTCGAGGTCCGTGCA,0.7779586432740309
20,GACACCACGTAAAATCCTAATCAAA,0.7227716269767893
21,TCTATGACTCGTTCGCGTAGAATCA,-0.9032312196091278
22,CAAAATGATAAGATGGACCAAAGAT,0.0447788302836263
23,TGCATGATCTGTAGCATTTGCTGCT,-0.2827116234508076
24,GCATGACCAGCCTGTTTAGATAGAA,-0.8848997916225679
25,AGAAAGATAACAAACCACCCGTATG,0.5311009416510439
26,GACCCCTTTACGCAACCTATTGAAC,0.7381938795002485
27,GCCCCTACACTCTGTTTTTTGATCC,0.4626627635505125
28,GGATAAATAAATCTGAGATCAGAAA,0.5764032765766933
29,CCCTGTTGCCAGCCGCATAATCATC,0.5072462083719866
30,GCAGCACGCTTACAGTCCCTCAGAC,0.5885318168197583
31,CTTTTTCCTTACTCGTGATACTATC,0.3549980256557335
32,GTAAACCCAGATCTAGTTTGACTGT,0.4340458948389251
33,CACGCTGCACACCGAACAGCCCAAA,-0.0060640299062117
34,ACTCCGACACCATCTTCATTACAAT,0.40927053992064
35,TACATGGAACTGTCCCTTCTTACCG,-0.8422048835483932
36,GACCCTCCTATTATCAACCAAGATA,0.2085787716855296
37,AGAGTGAGAGCGCGACAAATCACTG,0.677525749419415
38,CCGATTGGCGCCCTTTGGCCGGGAG,0.0662045936850974
39,GAGATGAGAAGTCGTGCGAAATAAC,-1.5323635165013456
40,GTCCTCGCGACAACTGTCCCAAACC,0.2904891214718897
41,TTCTGATCGGTGTTCCTCCGTTCTG,0.4886688832278358
42,TCTAGTCGTTTCTAGCATAGACTATA,0.6682424782790564
43,GCAATGCATCCATTCCAATGCCTACT,-1.1913139865591946
44,ACCTTGCCGCATCCCACTTGCCTGCA,0.458991405155542
45,CAAACTGGGCCCATTTCTATACCAAT,0.1781652824101883
46,ACGAGAGTAACAGATCCAACCTAAA,0.6019488593566376
47,CCTACGCGGGATGCTCTTTTTTATAG,-1.1687525556467426
48,GATCCAGGAGACAGAAACCATCTACC,0.4738692979644047
49,TCGCAAAGAAGAACCTATTTTAAGA,0.7018982636372705
50,TATAATTACGCTTTTCCGTGTATGG,-0.3720488657064282
51,TCAATTACAGCTCGACTTCCATGATC,0.2754572607942968
52,AAGCCGTTCTTTAAATCCACACATTT,0.2832481855967742
53,AGTCCATCCTCGCGGCCTCACACCA,-0.1678433893986053
54,AGTCCCGTCCTACACGCTCGGTCCG,0.3135193265556327
55,CCCCATATCCGATTATCTGCTGGAC,0.5673113165112577
56,CGTAGTGGCGCAGGACCGTCAATTA,0.3736517875688682
57,CTCTGCTATGCCCCACCACTCAACA,0.5126163959293235
58,ATCCACCAATCCCTACATTCATCTTC,0.5112259267226038
59,GAGAGTGTCGCCGAAGCACAAGCCGA,0.4693891586433297
60,CCTGTCGATCTAGGTCCTATTGTCCG,0.6496399244427643
61,ATTTCTAACTTCTTCTGGCAACGACA,0.5061690522661538
62,ATATACGGCAACACGCCCGAACCAGA,0.2119265981391584
63,CCTCGTTAATCCTTCCCTTGTCTCCC,0.1640642263497583
64,TCCCCGCCACGCCCGGTATCCGACTA,-0.0315210929562356
65,GAAACTCGTGTTTATTCTCGTCGAT,0.7040646602119047
66,AAGAAAACATACAAGTCTGTTCACT,0.6293633976161706
1 change: 1 addition & 0 deletions tests/data/datasets/rna/modification.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"sequence":{"0":"TTGCCACACTGCTGGACGCCTGCAAGGCCAAGGGTACGGAGGTCATCATCATCACCACCGATACCTCGCCCTCAGGCACCAAGAAGACCCGGCAGTATCTC","1":"TTTGAAAAAATATTAGCAATGTGAGGACACTTAAGCAGTTTTGTCAATTCAGCTGAATCCAGCCTCATAGCAAAATCTGGTCTTAAATTCCCTCATCGTGC","2":"AGAAACATTCAACCTCCCTTCTTTTTATTCCAGTTGTCCTTTTCTCTGACACTTGCATCAATTTTCTGATTGCCTAGGCTCTTAATATTGCTTTCTGTTCA","3":"TTAGTTTTACTATGGAATCATAATAACCCACATAGAAGACTGATATTAAGAGCACAGAAGAAATAGTCCCAATGTTTATGTCATTTAATTTGAAAAATTTC","4":"CAACAGAAGTTTCTCATCTATAATCAGTAGCACTAAACTCTTGGTTTGAAAAATATTTAGTATGGGTAATACTTGGAGTATCAGTTTTCATTAAAATGTAC","5":"AATGTGTTTGTGTGTGTCTCTCACACACACACATAACATGTACATACCTGAAACTCATACTGCAATTGCAACACATCTTAAGTTTTTCCTTTTAAACATAC","6":"AAGTAGAAGACAACAGTACTCTTTTTTTTTTGAAATGGAGTCTCACTCTCACCCAGGCTGGAGTGCAATGGTGTGATCTCGGCCCACTGCACTCCAGCCTG","7":"GCCACCACACGTGGCCACAGTTTGGGCTTTTGAAAAAAGTTAGGTGGAGGAAGAGAGGTATGAGTACTCTAGTTTTCACTGCAGTATCCCATTTGTGTGTG","8":"AGGCTGTTTTAGCTTAAGTAAAATTTAAAAATTAGTTCCTTAGTCACATTAGCCACATTTAATGTGTTCTATAGCCACGTGTGACTGGTGGCTAACATATT","9":"GCAAGTGGTGTTTGGTTACATGAATAAGTTCTTTAGTGGGGATTTCTGAAATTTTGGTGCACCCATCACCTGAGCAGTGTACACTGTATCCAATGTGTAGT","10":"TTCTCAGGATATGTTATAGGATTCTTCTGACCACTAGAGTAGAGTGAACGATATGTTTTAATGTTCAGAAGTCACTATGGAGTAAACCAAATATATATAGG","11":"TTTTCCAGGATTTCATGAAACAAAGAGTTAAGAACTACAGTAGTGGAGCAATATTCATGGTGCTTTTTCTTTTTCTTTTGAAATAATTAAAAACTTACAGA","12":"GTATTGTCGTCTCACTCTATTATCAGCCTACCTCCGGTGGCCCTTGGGGCATGTGGCTGGGCCCAGGGTGATTCATCTAGAGCCAGCTCAGGTGGCAGTGA","13":"GGTTTTTTTTTTTTTTTTTTAGTCCATCCATTCTTTGATTTAATTTGGCAAACCCACATTAGATAATTTAGCAGAAGAGGAATTATATCTTCATCCTATTA","14":"AAGAAACCTGAACCAAGGCCTTGGGTATCAGATTGGCTGGATAAGGAGGGATGAGCACAGAAGGAAGGACAAAGATAATACCTTTTTCAAGATGAGCCTGT","15":"CTCATTTTGTAAGGAGACACTTAGATGCATTTCTGAAAAAAACAAAACAAAACAAAACAAAACAAAAAACACTTTGGGCTTTCTCTGTATTCTTCAAGCAT","16":"GTAAGTGAGATTACTTTATTTATTTCTTTTTCAGATTGTTCACTGTTGGCATATAGAAATGCTACTAATTGTTGTATGTTGATTTTGTATCCTGAAACTTT","17":"CATGCCTGTAATCCCACCTACTCAGGAGGCTGACGCAGGAGAATTGCTTGAATCCGGGAGGTGGAGGTTGCAGTGAGCCAAGATCACGCCACTGCACTCCA","18":"GGGTCCAGCCCAGGCTGTTTGGTCCCAGA
GCCTGTGCTCTTGTCCATTATACTGGTGGTATTGCCCCTGGCATTGACAAAGTGGGAAAAGATGACTAACCT","19":"TGGCTCACACCTGTAATCCCTGCACTTTGGGAGGCCAAGGTGAGCAGATCACTTGAGGTCAGGAGTCTTGAGACCAGCCTGGCCAACATGGTGAAGCCCTA","20":"ATATGAATGATTTGTCATTTATGTCTAATCACTAAGTAAAAATATCAATTATGATTACTTTTTAAGTTTTATTGATGCATAATTATACATATTTATGGGGT","21":"ACATCAAAAAGTTTGAAAGAGCACAAATAGACAACCAAGGGTCACACGTCATGGAACTGGAGAAACAAGAACAATAGAAACCCAAACCTAGCAGAAGAAAA","22":"CTATCAGAAATAATGAAAAAACTCACCTTTGGGATTTTCATTAGTTTGGCAATCACTTCTCCTTTTGAAAGATTGGTGGACTGTACATTTTATTATTATTA","23":"TGATCTTATTTGTTTCTGTGTCTTGAAATAGTTTGCTGTTTTGTCATCTTAGAAATTGATTCATTATTAACTCATTTATTCTCAACTATGCTAAAAAAAAG","24":"AAACCACAAAGATGGGGAGAAACCAGAGCAGAAAAGCTGAAAAGTTCAAAAAACCAGAGCACCTCTTCTCCTCCAAAGGATTGCAGTTCCTCACTGCAAAG","25":"CTACACAAGGTATTTCACAATATCCTTAGGAATTACTGAGTTTTAGAGTGACAGAATAATTACCAATTATTCTGATAGTAAATTTGTAGGTACATTATAAT","26":"GCTACCTCTACTTTTAACATATTTTAGGCATTAGGACTTGCTTAGCCTTTAATACACAGGAATATTAACTAAAATGCACATATAAAACAATTGGTTAGACA","27":"CCTGGATCTAAAAGTGTTTTTATTTTTTGTGCCCACATCTGTAGTCATGGATTTGATGTATATATTTAATAACATTCAGTGATTTATTTTTCGGTTCACCT","28":"TCTGAAGTCATAGTCCCTTGGTTTTCCCTGACCTGCCTGCTACTGCGCCCACTTGCAGCAGCACCTCCGTTGCCCAGTGAAGCATGCTGCCCTGGTCTTAC","29":"CTGGGGCGGGCGGGTCAGTTGAGGCCAGGAGTTCGAGACCAGCCTGGCCAACGTGGCAAAACCCTGTCTCTACTAAAAATACAAAAAAGTTAGCTAGGCGT","30":"AATATTGCATGGGCCATACTTATATTTTTAAAATATTCATTGTTTATCAGAATTCAAATTTAACTGGGCATCCTGTATTTTTATTAGCTAAATCTGGCAAC","31":"GACTAGCTGCAGAAAGTGACATTTACACTGGGACAGGAGTCAAAGAGTATATTGATGCAAAGGAAAGACCATGAATTAGACCTGAGTTCAAATCCTAGCCG","32":"AGAAAAAGACAGAGGTTTATAGAAGTTTTTTCCACAAAATTTATTTGTGCATTAATCGATAGGCAACATAGTGTAAAACATAGCTAGCTGAATATTCAGAA","33":"TGCCACTATTGGGGTAACCCACCCCCAATATTACAACATAGGTTCTTTCTATTTTCCATAAGTGTTGGCTGGCTGAGAAATAAAGAGAAAGAGTACAAAGA","34":"TGGAAGGAAGAATTGCTTTTCTGAGGTCAATGCTCAGCTTGGCTGTTGGCAAGTCAACCTTTAGGAATCTGTGTATTCAGGGTATAGCAGTGGAAGTATAG","35":"AAAATCAGCAGCTAGTATTTGCAAATGGTGTTTGTATTTACTCTTGAAATACATGGTTTTGTGCTGGAGATTTGGAGTAAGGAAACTTAGGCACTATAGTC","36":"TCCACTTGCTGCATTATTTTTTTCTTTCTTTTTTTTTGCTGATTATTTTTATATGAATGTTAAATGA
TAAAGTCTTCTACATCATATCCCATTTAAGCTGC","37":"TGTTTTTACATTGAAAGTAGACAAATAGTTTTGTCATCTGTTTCTCATCCATTTCTAATATTTAAATATAATAAAGTCTAATTGAATACAAAAACAAACAA","38":"AAAGGATGACGAAGTGTAGAGAAGAGGCCAGCCATAGGAAAAGGGGAGTCACTTATGGGAAGGTGACTAGGAAATGTGTGATATACAGGGGTTGTTAGTAA","39":"GGGCCGTCCTGAACACTGCCACCTCTGAGCGTTGGCATCCATCTGCTAGGATTAGCATTGGAGCTTTTTTTGAAGGTATTTTGAAGTCTAATGGGAGAGGA","40":"TCCCCAGGCTGGAGTGCAATGGCACAATCACAGCATACCTCCCAGGCTCAAGCAATCCTCCCACCTCAGCCTTTTGAGTAGCTGGGACCAGAAGCACGTGC","41":"ATTATGGCCCAGCCTATACCCAGAAGAGAGGACTTAACTTGTGCTCCATGAACCACTGTGTCTGGGACACTGAGTAACCTAAGAATTTTCTTTGATATGAC","42":"TCAGCCTCCCGAGTAGGTAGGATTACAGGCATGCGCCACCATGACCGATTAATTTTGTATTTTTGGTAGAGACGGGGTTTCACCATGTTGGTCAGGCTGGT","43":"AAATTCATTTTTTCAATCATTTAAGGAACTTAGATATAAAATACACCTTTAATTCACCTTTGGAAATTTTTTACAAAGTGTTTTATTTGCAAATGACAGTG","44":"ATTAGTTATTTCAGTGTTTATTTCATTTGATGAAGAAACGTTTGCATATGAATGTTGGGAATTCTAGCAGGTCCTGCCTCAATGTGAAGAGGCATTTTTTT","45":"CAGGTGCCTGCCACCATGCCTGGCTTATTTTTGTATTTTTAGTAGAGACAAGGTTTCACCAGGTTGGCCACTCCTGGTCTTGAACTCCTGACCTCAGGTGA","46":"TTTTTTTTTTTTTTTTTTTTACTGTGTCCCAGGCTTAAGAAAAAAGTGATACATGATGTGGGATTAAAATCAAGAACATCATTGAACTTCACCTTCCCTCC","47":"CGGGAGGCACGGGCCCTTCGGGGATGACGTCACGGGCGGGGGCCCCGGACACGCGAGCCTTGCGCCCCACAGACGGCGGCGCAGCCCGCCGCCCTTTTCGA","48":"TGAGGCTTAAGTGATCCTCCCACCTTAGCCTCCTAAGTAGCTGGGAGTACAAATGCACACCACCACACCTGGCTAATTTTTGTATTTTTTGTTTTGCCATG","49":"ACTCATAGCTCTATGTCTCTTATAGTTCTTAGCACAATATCTTGGCCTAGATGAAGTACATAATAATTATATGTAGGGTTGTGGAAAGCAGTGCTGGCTTT","50":"GGCTCCTTCGGAGGCAGAATATGTCAACTCGTTGGCTTCTCACAAAATCAAGTGAGTCAGAAACCTGAATGGGGTTTCGGCTGGTCTCACCTAATTAACTT","51":"TATCTACCACCTGGATTCTACAACTGACATTTTATTATACCTAGTTTTTTACATGTCTGTCCATCTGTCTCATCCATAGATCCATTTTATTTCTTTATACA","52":"CATGTATGTATACTTAACTAAGTTAATAAAAACTGTCCTATTTCTCCTGGACATTAGAGAGATCTCAGAACTCTTTAACTCCGTGTACCCACCTCCTGACT","53":"GGAGCTGGTTCAGGAGATCACACAACATTTATTCTTCTTACAGGTACATCAGTCAAGGCTACCCCCCAGTTCTGAGAGAACTTGCCCAGGAGTGGTTGCAG","54":"TTCCTGGTTGGTTGAATCACTGGATGCGGTACCCACGGATGCAGAGAGTGACTGTACAGAAAAAAAGCATCTATTGCCTTTCCAGGCCAAGCTTTCTGTCT","5
5":"ACATTTTAGAAAATAAAATGCACCGAACAAACATGGGGTGTTCCTACCGCAGCATGGGAAAGGCGAGGCGCCATCCCACCAAGGCGGGTGTGGTTTTGAGC","56":"GAACGAAAAGAGGAAGTAGTGAGTGAAAAGGAAAGAAGAAAACATTAAGAAGTAGAGGAAAAAGAATTAAGTCGATTAGATGCAATGAGGGAAGAGGAAAA","57":"GAGAAACAGTGACAAATTCTGAGGGGAGCCTACAGTGTATAGTGTTGTGTATAGTGTGTATAGTATATAGTGGTTGTGTATAGTGGCCTCTGCCTTTTACC","58":"CCTTGCCAATCCCCATGAAAATGTTCAGTTATGTCAAAAGCAAGGCAAAAACAGTCTCTTGGCTATACAAGGGTAGCTGTTTTATTTGACTAAAATTTAGC","59":"ATTGTAGTGCAAAGCAGCCACAGACAAAATTTAAATGAATGAACCTGGCCATATTCCAATAAAATGAATTTGAATTTCAAATAATTTTTATGTGTCATAAA","60":"TGAGAAGAAAGAAAGAAAGAAAAAGAGGGGGGGGAGGGAGAGAGAGAGAGAGAAAGGAAGGAAGGAGAAAGAAGAAAGGGAGAGGGAGAGAGAGAGAGAGG","61":"AGTACTTTCAACACTGCATGGCACATAGTAAGGGCACAATAAATGTTAATAATTATGATGGTGGTCATGATGATGATGATCATATGCTTATCTTCCATCCC","62":"GACTCTGTCACCCCCCGCCCCCTGGAAAAAATGCGTTTTTTGACTTAATGATATTTTCAATTGTGATGGGTTAATTGAGATATCACCCCACTGTAAGTTTA","63":"CATATCTCATATTTACAGATTCCTTCAGGGTAAGAAAACTTATGTCTTCTAGGGAAACCACTCCTTTTAAATCTATGTGATTTATCCTATAAGCCACTTAA","64":"AATTTAAAAAGTGTTAAGCACCATAGATGTGCATTTTTAGGAATAAGATGAGTTATTCACTGAAGAAGAGCTCTGCAGGAAGGTGAAAGCTCTCCTTTAAA","65":"ATGGGTTTTGGATTTAATGGGGCATTGGGGGAGTGAGAGGGCATCTGCAGAAAAGAGCCATCCAGGCTGCAGAACTCTTGTTTCCAGCAAATAGTCCATTG","66":"AGATACCAGGAATGACCTGATTCAGGCTAGTAAGTGACGTTTGCCTAGAGATCAGTCTAACTGGGGCTCAAGATATGGCCTAGCTGTGAAACAACAGATGA"},"label":{"0":[1,0,0,0,0,0,0,0,0,0,0,0],"1":[1,0,0,0,0,0,0,0,0,0,0,0],"2":[1,0,0,0,0,0,0,0,0,0,0,0],"3":[1,0,0,0,0,0,0,0,0,0,0,0],"4":[1,0,0,0,0,0,0,0,0,0,0,0],"5":[1,0,0,0,0,0,0,0,0,0,0,0],"6":[1,0,0,0,0,0,0,0,0,0,0,0],"7":[1,0,0,0,0,0,0,0,0,0,0,0],"8":[1,0,0,0,0,0,0,0,0,0,0,0],"9":[1,0,0,0,0,0,0,0,0,0,0,0],"10":[1,0,0,0,0,0,0,0,0,0,0,0],"11":[1,0,0,0,0,0,0,0,0,0,0,0],"12":[1,0,0,0,0,0,0,0,0,0,0,0],"13":[1,0,0,0,0,0,0,0,0,0,0,0],"14":[1,0,0,0,0,0,0,0,0,0,0,0],"15":[1,0,0,0,0,0,0,0,0,0,0,0],"16":[1,0,0,0,0,0,0,0,0,0,0,0],"17":[1,0,0,0,0,0,0,0,0,0,0,0],"18":[1,0,0,0,0,0,0,0,0,0,0,0],"19":[1,0,0,0,0,0,0,0,0,0,0,0],"20":[1,0,0,0,0,0,0,0,0,0,0,0],"21":[1,0,0,0,0,0,0,0,0,0,0,0],"22":[1,0,0,
0,0,0,0,0,0,0,0,0],"23":[1,0,0,0,0,0,0,0,0,0,0,0],"24":[1,0,0,0,0,0,0,0,0,0,0,0],"25":[1,0,0,0,0,0,0,0,0,0,0,0],"26":[1,0,0,0,0,0,0,0,0,0,0,0],"27":[1,0,0,0,0,0,0,0,0,0,0,0],"28":[1,0,0,0,0,0,0,0,0,0,0,0],"29":[1,0,0,0,0,0,0,0,0,0,0,0],"30":[1,0,0,0,0,0,0,0,0,0,0,0],"31":[1,0,0,0,0,0,0,0,0,0,0,0],"32":[1,0,0,0,0,0,0,0,0,0,0,0],"33":[1,0,0,0,0,0,0,0,0,0,0,0],"34":[1,0,0,0,0,0,0,0,0,0,0,0],"35":[1,0,0,0,0,0,0,0,0,0,0,0],"36":[1,0,0,0,0,0,0,0,0,0,0,0],"37":[1,0,0,0,0,0,0,0,0,0,0,0],"38":[1,0,0,0,0,0,0,0,0,0,0,0],"39":[1,0,0,0,0,0,0,0,0,0,0,0],"40":[1,0,0,0,0,0,0,0,0,0,0,0],"41":[1,0,0,0,0,0,0,0,0,0,0,0],"42":[1,0,0,0,0,0,0,0,0,0,0,0],"43":[1,0,0,0,0,0,0,0,0,0,0,0],"44":[1,0,0,0,0,0,0,0,0,0,0,0],"45":[1,0,0,0,0,0,0,0,0,0,0,0],"46":[1,0,0,0,0,0,0,0,0,0,0,0],"47":[1,0,0,0,0,0,0,0,0,0,0,0],"48":[1,0,0,0,0,0,0,0,0,0,0,0],"49":[1,0,0,0,0,0,0,0,0,0,0,0],"50":[1,0,0,0,0,0,0,0,0,0,0,0],"51":[1,0,0,0,0,0,0,0,0,0,0,0],"52":[1,0,0,0,0,0,0,0,0,0,0,0],"53":[1,0,0,0,0,0,0,0,0,0,0,0],"54":[1,0,0,0,0,0,0,0,0,0,0,0],"55":[1,0,0,0,0,0,0,0,0,0,0,0],"56":[1,0,0,0,0,0,0,0,0,0,0,0],"57":[1,0,0,0,0,0,0,0,0,0,0,0],"58":[1,0,0,0,0,0,0,0,0,0,0,0],"59":[1,0,0,0,0,0,0,0,0,0,0,0],"60":[1,0,0,0,0,0,0,0,0,0,0,0],"61":[1,0,0,0,0,0,0,0,0,0,0,0],"62":[1,0,0,0,0,0,0,0,0,0,0,0],"63":[1,0,0,0,0,0,0,0,0,0,0,0],"64":[1,0,0,0,0,0,0,0,0,0,0,0],"65":[1,0,0,0,0,0,0,0,0,0,0,0],"66":[1,0,0,0,0,0,0,0,0,0,0,0]}}
Loading

0 comments on commit ec7661e

Please sign in to comment.