-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
11 changed files
with
264 additions
and
227 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# beignet.datasets | ||
|
||
::: beignet.datasets.SequenceDataset | ||
::: beignet.datasets.SizedSequenceDataset | ||
::: beignet.datasets.FASTADataset | ||
::: beignet.datasets.UniRef50Dataset | ||
::: beignet.datasets.UniRef90Dataset | ||
::: beignet.datasets.UniRef100Dataset |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,16 @@ | ||
from .__uni_ref_dataset import _UniRefDataset | ||
from ._fasta_dataset import FASTADataset | ||
from ._sequence_dataset import SequenceDataset | ||
from ._sized_sequence_dataset import SizedSequenceDataset | ||
from ._uni_ref_50_dataset import UniRef50Dataset | ||
from ._uni_ref_90_dataset import UniRef90Dataset | ||
from ._uni_ref_100_dataset import UniRef100Dataset | ||
from ._uni_ref_dataset import UniRefDataset | ||
from ._uniref50_dataset import UniRef50Dataset | ||
from ._uniref90_dataset import UniRef90Dataset | ||
from ._uniref100_dataset import UniRef100Dataset | ||
|
||
__all__ = [ | ||
"FASTADataset", | ||
"SequenceDataset", | ||
"SizedSequenceDataset", | ||
"UniRef100Dataset", | ||
"UniRef50Dataset", | ||
"UniRef90Dataset", | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
import os.path | ||
import re | ||
from pathlib import Path | ||
from typing import Callable | ||
|
||
from beignet.io import download_and_extract_archive | ||
|
||
from ..transforms import Transform | ||
from ._fasta_dataset import FASTADataset | ||
|
||
|
||
class _UniRefDataset(FASTADataset):
    """Shared implementation for the UniRef FASTA datasets.

    Indexing yields a ``(sequence, target)`` pair, where the target is the
    identifier parsed from the record's header line.
    """

    def __init__(
        self,
        root: str | Path,
        name: str,
        md5: tuple[str, str],
        *,
        index: bool = True,
        download: bool = False,
        transform: Callable | Transform | None = None,
        target_transform: Callable | Transform | None = None,
    ) -> None:
        """
        Parameters
        ----------
        root : str | Path
            Root directory where the dataset subdirectory exists or, if
            `download` is `True`, the directory where the dataset subdirectory
            will be created and the dataset downloaded.
        name : str
            Name of the UniRef release. It is used as the dataset
            subdirectory name, the FASTA file stem (`{name}.fasta`), and the
            path component of the download URL.
        md5 : tuple[str, str]
            Pair of checksums. Only the second element is used here: it is
            forwarded to `download_and_extract_archive` for the
            `{name}.fasta.gz` archive (presumably as its expected MD5;
            verify against `beignet.io`).
        index : bool, optional
            If `True`, caches the sequence indices to disk for faster
            re-initialization (default: `True`).
        download : bool, optional
            If `True`, download the dataset into the `name` subdirectory of
            `root` (default: `False`). If the FASTA file already exists, it
            is not redownloaded.
        transform : Callable | Transform, optional
            A `Callable` or `Transform` that maps a sequence to a
            transformed sequence (default: `None`).
        target_transform : Callable | Transform, optional
            A `Callable` or `Transform` that maps a target (a cluster
            identifier) to a transformed target (default: `None`).
        """
        root = Path(root)

        directory = root / name

        path = directory / f"{name}.fasta"

        if download and not path.exists():
            download_and_extract_archive(
                f"http://ftp.uniprot.org/pub/databases/uniprot/uniref/{name}/{name}.fasta.gz",
                str(directory),
                str(directory),
                f"{name}.fasta.gz",
                md5[1],
            )

        # The prefix is matched lazily so the capture group binds to the
        # identifier right after the *first* underscore. A greedy prefix
        # would instead latch onto a later `_TOKEN ` in the description.
        self._pattern = re.compile(r"^UniRef.+?_([A-Z0-9]+)\s.+$")

        super().__init__(path, index=index)

        self._transform = transform

        self._target_transform = target_transform

    def __getitem__(self, index: int) -> tuple[str, str]:
        """Return the `(sequence, target)` pair stored at `index`.

        Raises
        ------
        ValueError
            If the record's header does not match the expected UniRef
            header pattern.
        """
        header, sequence = self.get(index)

        match = self._pattern.search(header)

        if match is None:
            raise ValueError(
                f"unable to parse a UniRef identifier from header: {header!r}"
            )

        (target,) = match.groups()

        if self._transform:
            sequence = self._transform(sequence)

        if self._target_transform:
            target = self._target_transform(target)

        return sequence, target
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.