Skip to content

Commit

Permalink
docs
Browse files Browse the repository at this point in the history
  • Loading branch information
0x00b1 committed May 14, 2024
1 parent c31ddc4 commit 7075ef0
Show file tree
Hide file tree
Showing 11 changed files with 264 additions and 227 deletions.
8 changes: 8 additions & 0 deletions docs/beignet.datasets.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# beignet.datasets

::: beignet.datasets.SequenceDataset
::: beignet.datasets.SizedSequenceDataset
::: beignet.datasets.FASTADataset
::: beignet.datasets.UniRef50Dataset
::: beignet.datasets.UniRef90Dataset
::: beignet.datasets.UniRef100Dataset
11 changes: 7 additions & 4 deletions src/beignet/datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
from .__uni_ref_dataset import _UniRefDataset
from ._fasta_dataset import FASTADataset
from ._sequence_dataset import SequenceDataset
from ._sized_sequence_dataset import SizedSequenceDataset
from ._uni_ref_50_dataset import UniRef50Dataset
from ._uni_ref_90_dataset import UniRef90Dataset
from ._uni_ref_100_dataset import UniRef100Dataset
from ._uni_ref_dataset import UniRefDataset
from ._uniref50_dataset import UniRef50Dataset
from ._uniref90_dataset import UniRef90Dataset
from ._uniref100_dataset import UniRef100Dataset

__all__ = [
"FASTADataset",
"SequenceDataset",
"SizedSequenceDataset",
"UniRef100Dataset",
"UniRef50Dataset",
"UniRef90Dataset",
]
83 changes: 83 additions & 0 deletions src/beignet/datasets/__uni_ref_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import os.path
import re
from pathlib import Path
from typing import Callable

from beignet.io import download_and_extract_archive

from ..transforms import Transform
from ._fasta_dataset import FASTADataset


class _UniRefDataset(FASTADataset):
    """Shared implementation for the UniRef FASTA datasets.

    Locates (and optionally downloads) ``<root>/<name>/<name>.fasta`` and
    yields ``(sequence, target)`` pairs, where the target is the identifier
    captured from each record's description line.
    """

    def __init__(
        self,
        root: str | Path,
        name: str,
        md5: tuple[str, str],
        *,
        index: bool = True,
        download: bool = False,
        transform: Callable | Transform | None = None,
        target_transform: Callable | Transform | None = None,
    ) -> None:
        """
        Parameters
        ----------
        root : str | Path
            Root directory where the dataset subdirectory exists or, if
            `download` is `True`, the directory where the dataset subdirectory
            will be created and the dataset downloaded.
        name : str
            Name of the UniRef release. It is used as the dataset
            subdirectory name, the FASTA file stem (`{name}.fasta`), and the
            path component of the download URL.
        md5 : tuple[str, str]
            Pair of MD5 checksums. Only the second element is used here: it
            is passed to `download_and_extract_archive` together with the
            `{name}.fasta.gz` archive.
        index : bool, optional
            If `True`, caches the sequence indices to disk for faster
            re-initialization (default: `True`).
        download : bool, optional
            If `True`, download the dataset to the `root` directory
            (default: `False`). If the dataset is already downloaded, it is
            not redownloaded.
        transform : Callable | Transform, optional
            A `Callable` or `Transform` that maps a sequence to a
            transformed sequence (default: `None`).
        target_transform : Callable | Transform, optional
            A `Callable` or `Transform` that maps a target (a cluster
            identifier) to a transformed target (default: `None`).
        """
        root = Path(root)

        directory = root / name

        path = directory / f"{name}.fasta"

        # Skip the download when the extracted FASTA file is already present.
        # NOTE(review): the archive is fetched over plain HTTP and guarded
        # only by an MD5 checksum; consider an HTTPS endpoint.
        if download and not path.exists():
            download_and_extract_archive(
                f"http://ftp.uniprot.org/pub/databases/uniprot/uniref/{name}/{name}.fasta.gz",
                str(directory),
                str(directory),
                f"{name}.fasta.gz",
                md5[1],
            )

        # Captures the identifier that follows the last underscore of the
        # leading "UniRef..." token in a record's description line.
        self._pattern = re.compile(r"^UniRef.+_([A-Z0-9]+)\s.+$")

        super().__init__(path, index=index)

        # Assigned after the parent initializer so that the values given to
        # this class are the ones stored on the instance.
        self._transform = transform

        self._target_transform = target_transform

    def __getitem__(self, index: int) -> tuple[str, str]:
        """Return the ``(sequence, target)`` pair stored at `index`.

        The target is the identifier extracted from the record's description
        line. `transform` and `target_transform`, when set, are applied to the
        sequence and the target respectively before they are returned.

        Raises
        ------
        ValueError
            If the record's description line does not match the expected
            UniRef header format.
        """
        description, sequence = self.get(index)

        match = self._pattern.search(description)

        if match is None:
            raise ValueError(
                f"record {index} has an unrecognized UniRef description: "
                f"{description!r}"
            )

        (target,) = match.groups()

        if self._transform:
            sequence = self._transform(sequence)

        if self._target_transform:
            target = self._target_transform(target)

        return sequence, target
6 changes: 3 additions & 3 deletions src/beignet/datasets/_fasta_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,15 +38,15 @@ def __init__(
else:
self.offsets, sizes = self._build_index()

self._transform_fn = transform
self._transform = transform

super().__init__(self.root, sizes)

def __getitem__(self, index: int) -> Tuple[str, str]:
# Fetches the record at `index` through `self.get` and, when a transform is
# set, passes the record through it before returning it.
# NOTE(review): this hunk appears to list the removed `_transform_fn` lines
# next to their `_transform` replacements (the page drops the +/- markers);
# presumably only the `_transform` pair exists after the commit. Confirm
# against the checked-in file.
x = self.get(index)

if self._transform_fn:
x = self._transform_fn(x)
if self._transform:
x = self._transform(x)

return x

Expand Down
46 changes: 0 additions & 46 deletions src/beignet/datasets/_uni_ref_100_dataset.py

This file was deleted.

46 changes: 0 additions & 46 deletions src/beignet/datasets/_uni_ref_50_dataset.py

This file was deleted.

46 changes: 0 additions & 46 deletions src/beignet/datasets/_uni_ref_90_dataset.py

This file was deleted.

82 changes: 0 additions & 82 deletions src/beignet/datasets/_uni_ref_dataset.py

This file was deleted.

Loading

0 comments on commit 7075ef0

Please sign in to comment.