diff --git a/src/beignet/datasets/__uni_ref_dataset.py b/src/beignet/datasets/__uni_ref_dataset.py index 2f345ac1aa..f59c1f8756 100644 --- a/src/beignet/datasets/__uni_ref_dataset.py +++ b/src/beignet/datasets/__uni_ref_dataset.py @@ -1,8 +1,10 @@ import re +from os import PathLike from pathlib import Path from typing import Callable import pooch +from pooch import Decompress from beignet.transforms import Transform @@ -12,32 +14,30 @@ class _UniRefDataset(FASTADataset): def __init__( self, - root: str | Path, - name: str, - md5: (str, str), + url: str, + root: str | PathLike | None = None, + known_hash: str | None = None, *, index: bool = True, - download: bool = False, transform: Callable | Transform | None = None, target_transform: Callable | Transform | None = None, ) -> None: """ Parameters ---------- - root : str | Path + url : str + URL to the file that needs to be downloaded. Ideally, the URL + should end with a file name (e.g., `uniref50.fasta.gz`). + + root : str | PathLike, optional Root directory where the dataset subdirectory exists or, if `download` is `True`, the directory where the dataset subdirectory will be created and the dataset downloaded. index : bool, optional - If `True`, caches the sequence indicies to disk for faster + If `True`, caches the sequence indexes to disk for faster re-initialization (default: `True`). - download : bool, optional - If `True`, download the dataset and to the `root` directory - (default: `False`). If the dataset is already downloaded, it is - not redownloaded. - transform : Callable | Transform, optional A `Callable` or `Transform` that that maps a sequence to a transformed sequence (default: `None`). @@ -46,20 +46,24 @@ def __init__( A `Callable` or `Transform` that maps a target (a cluster identifier) to a transformed target (default: `None`). """ - root = Path(root) + if root is None: + root = pooch.os_cache("beignet") + + if isinstance(root, str): + root = Path(root) - directory = root / name + root = root.resolve() - path = directory / f"{name}.fasta" + name = self.__class__.__name__.replace("Dataset", "") - if download: - pooch.retrieve( - f"http://ftp.uniprot.org/pub/databases/uniprot/uniref/{name}/{name}.fasta.gz", - md5[1], - f"{name}.fasta.gz", - root / name, - progressbar=True, - ) + path = pooch.retrieve( + url, + known_hash, + f"{name}.fasta.gz", + root / name, + processor=Decompress(), + progressbar=True, + ) self._pattern = re.compile(r"^UniRef.+_([A-Z0-9]+)\s.+$") diff --git a/src/beignet/datasets/_uniref100_dataset.py b/src/beignet/datasets/_uniref100_dataset.py index 1c6c758540..0e7e3b7c0e 100644 --- a/src/beignet/datasets/_uniref100_dataset.py +++ b/src/beignet/datasets/_uniref100_dataset.py @@ -11,7 +11,6 @@ def __init__( root: str | Path, *, index: bool = True, - download: bool = False, transform: Callable | Transform | None = None, target_transform: Callable | Transform | None = None, ) -> None: @@ -27,11 +26,6 @@ def __init__( If `True`, caches the sequence indicies to disk for faster re-initialization (default: `True`). - download : bool, optional - If `True`, download the dataset and to the `root` directory - (default: `False`). If the dataset is already downloaded, it is - not redownloaded. - transform : Callable, optional A `Callable` or `Transform` that that maps a sequence to a transformed sequence (default: `None`). @@ -41,11 +35,10 @@ def __init__( identifier) to a transformed target (default: `None`). """ super().__init__( + "http://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref100/uniref100.fasta.gz", root, - "uniref100", - "0354240a56f4ca91ff426f8241cfeb7d", + "md5:0354240a56f4ca91ff426f8241cfeb7d", index=index, - download=download, transform=transform, target_transform=target_transform, ) diff --git a/src/beignet/datasets/_uniref50_dataset.py b/src/beignet/datasets/_uniref50_dataset.py index 72656f0eac..1ff14948f4 100644 --- a/src/beignet/datasets/_uniref50_dataset.py +++ b/src/beignet/datasets/_uniref50_dataset.py @@ -1,4 +1,4 @@ -from pathlib import Path +from os import PathLike from typing import Callable from beignet.transforms import Transform @@ -9,30 +9,24 @@ class UniRef50Dataset(_UniRefDataset): def __init__( self, - root: str | Path, + root: str | PathLike | None = None, *, index: bool = True, - download: bool = False, transform: Callable | Transform | None = None, target_transform: Callable | Transform | None = None, ) -> None: """ Parameters ---------- - root : str | Path + root : str | PathLike, optional Root directory where the dataset subdirectory exists or, if `download` is `True`, the directory where the dataset subdirectory will be created and the dataset downloaded. index : bool, optional - If `True`, caches the sequence indicies to disk for faster + If `True`, caches the sequence indexes to disk for faster re-initialization (default: `True`). - download : bool, optional - If `True`, download the dataset and to the `root` directory - (default: `False`). If the dataset is already downloaded, it is - not redownloaded. - transform : Callable, optional A `Callable` or `Transform` that that maps a sequence to a transformed sequence (default: `None`). @@ -42,11 +36,10 @@ def __init__( identifier) to a transformed target (default: `None`). """ super().__init__( + "http://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref50/uniref50.fasta.gz", root, - "uniref50", - "e638c63230d13ad5e2098115b9cb5d8f", + "md5:e638c63230d13ad5e2098115b9cb5d8f", index=index, - download=download, transform=transform, target_transform=target_transform, ) diff --git a/src/beignet/datasets/_uniref90_dataset.py b/src/beignet/datasets/_uniref90_dataset.py index 5b2b0c1ab0..7b400b7ee6 100644 --- a/src/beignet/datasets/_uniref90_dataset.py +++ b/src/beignet/datasets/_uniref90_dataset.py @@ -27,11 +27,6 @@ def __init__( If `True`, caches the sequence indicies to disk for faster re-initialization (default: `True`). - download : bool, optional - If `True`, download the dataset and to the `root` directory - (default: `False`). If the dataset is already downloaded, it is - not redownloaded. - transform : Callable, optional A `Callable` or `Transform` that that maps a sequence to a transformed sequence (default: `None`). @@ -41,11 +36,10 @@ def __init__( identifier) to a transformed target (default: `None`). """ super().__init__( + "http://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.fasta.gz", root, - "uniref90", - "6161bad4d7506365aee882fd5ff9c833", + "md5:6161bad4d7506365aee882fd5ff9c833", index=index, - download=download, transform=transform, target_transform=target_transform, )