Skip to content

Commit

Permalink
cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
0x00b1 committed May 14, 2024
1 parent 3f5b1bf commit 4521618
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 52 deletions.
48 changes: 26 additions & 22 deletions src/beignet/datasets/__uni_ref_dataset.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import re
from os import PathLike
from pathlib import Path
from typing import Callable

import pooch
from pooch import Decompress

from beignet.transforms import Transform

Expand All @@ -12,32 +14,30 @@
class _UniRefDataset(FASTADataset):
def __init__(
self,
root: str | Path,
name: str,
md5: (str, str),
url: str,
root: str | PathLike | None = None,
known_hash: str | None = None,
*,
index: bool = True,
download: bool = False,
transform: Callable | Transform | None = None,
target_transform: Callable | Transform | None = None,
) -> None:
"""
Parameters
----------
root : str | Path
url : str
URL to the file that needs to be downloaded. Ideally, the URL
should end with a file name (e.g., `uniref50.fasta.gz`).
root : str | PathLike, optional
Root directory where the dataset subdirectory exists or, if
`download` is `True`, the directory where the dataset subdirectory
will be created and the dataset downloaded.
index : bool, optional
If `True`, caches the sequence indicies to disk for faster
If `True`, caches the sequence indexes to disk for faster
re-initialization (default: `True`).
download : bool, optional
If `True`, download the dataset and to the `root` directory
(default: `False`). If the dataset is already downloaded, it is
not redownloaded.
transform : Callable | Transform, optional
A `Callable` or `Transform` that maps a sequence to a
transformed sequence (default: `None`).
Expand All @@ -46,20 +46,24 @@ def __init__(
A `Callable` or `Transform` that maps a target (a cluster
identifier) to a transformed target (default: `None`).
"""
root = Path(root)
if root is None:
root = pooch.os_cache("beignet")

if isinstance(root, str):
root = Path(root)

directory = root / name
root = root.resolve()

path = directory / f"{name}.fasta"
name = self.__class__.__name__.replace("Dataset", "")

if download:
pooch.retrieve(
f"http://ftp.uniprot.org/pub/databases/uniprot/uniref/{name}/{name}.fasta.gz",
md5[1],
f"{name}.fasta.gz",
root / name,
progressbar=True,
)
path = pooch.retrieve(
url,
known_hash,
f"{name}.fasta.gz",
root / name,
processor=Decompress(),
progressbar=True,
)

self._pattern = re.compile(r"^UniRef.+_([A-Z0-9]+)\s.+$")

Expand Down
11 changes: 2 additions & 9 deletions src/beignet/datasets/_uniref100_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ def __init__(
root: str | Path,
*,
index: bool = True,
download: bool = False,
transform: Callable | Transform | None = None,
target_transform: Callable | Transform | None = None,
) -> None:
Expand All @@ -27,11 +26,6 @@ def __init__(
If `True`, caches the sequence indexes to disk for faster
re-initialization (default: `True`).
download : bool, optional
If `True`, download the dataset and to the `root` directory
(default: `False`). If the dataset is already downloaded, it is
not redownloaded.
transform : Callable, optional
A `Callable` or `Transform` that maps a sequence to a
transformed sequence (default: `None`).
Expand All @@ -41,11 +35,10 @@ def __init__(
identifier) to a transformed target (default: `None`).
"""
super().__init__(
"http://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref100/uniref100.fasta.gz",
root,
"uniref100",
"0354240a56f4ca91ff426f8241cfeb7d",
"md5:0354240a56f4ca91ff426f8241cfeb7d",
index=index,
download=download,
transform=transform,
target_transform=target_transform,
)
19 changes: 6 additions & 13 deletions src/beignet/datasets/_uniref50_dataset.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from pathlib import Path
from os import PathLike
from typing import Callable

from beignet.transforms import Transform
Expand All @@ -9,30 +9,24 @@
class UniRef50Dataset(_UniRefDataset):
def __init__(
self,
root: str | Path,
root: str | PathLike | None = None,
*,
index: bool = True,
download: bool = False,
transform: Callable | Transform | None = None,
target_transform: Callable | Transform | None = None,
) -> None:
"""
Parameters
----------
root : str | Path
root : str | PathLike, optional
Root directory where the dataset subdirectory exists or, if
`download` is `True`, the directory where the dataset subdirectory
will be created and the dataset downloaded.
index : bool, optional
If `True`, caches the sequence indicies to disk for faster
If `True`, caches the sequence indexes to disk for faster
re-initialization (default: `True`).
download : bool, optional
If `True`, download the dataset and to the `root` directory
(default: `False`). If the dataset is already downloaded, it is
not redownloaded.
transform : Callable, optional
A `Callable` or `Transform` that maps a sequence to a
transformed sequence (default: `None`).
Expand All @@ -42,11 +36,10 @@ def __init__(
identifier) to a transformed target (default: `None`).
"""
super().__init__(
"http://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref50/uniref50.fasta.gz",
root,
"uniref50",
"e638c63230d13ad5e2098115b9cb5d8f",
"md5:e638c63230d13ad5e2098115b9cb5d8f",
index=index,
download=download,
transform=transform,
target_transform=target_transform,
)
10 changes: 2 additions & 8 deletions src/beignet/datasets/_uniref90_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,6 @@ def __init__(
If `True`, caches the sequence indexes to disk for faster
re-initialization (default: `True`).
download : bool, optional
If `True`, download the dataset and to the `root` directory
(default: `False`). If the dataset is already downloaded, it is
not redownloaded.
transform : Callable, optional
A `Callable` or `Transform` that maps a sequence to a
transformed sequence (default: `None`).
Expand All @@ -41,11 +36,10 @@ def __init__(
identifier) to a transformed target (default: `None`).
"""
super().__init__(
"http://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.fasta.gz",
root,
"uniref90",
"6161bad4d7506365aee882fd5ff9c833",
"md5:6161bad4d7506365aee882fd5ff9c833",
index=index,
download=download,
transform=transform,
target_transform=target_transform,
)

0 comments on commit 4521618

Please sign in to comment.