From 31e5bc361a0c2839adac97cb41deb89998907611 Mon Sep 17 00:00:00 2001 From: Aaron Mussig Date: Wed, 16 Feb 2022 10:54:06 +1000 Subject: [PATCH] feat(GTDB): Add R202 and Genome. --- magna/config.py | 5 +++ magna/{dataset => }/gtdb/__init__.py | 0 magna/gtdb/enums.py | 10 +++++ magna/gtdb/genome.py | 60 ++++++++++++++++++++++++++++ magna/{dataset => }/gtdb/metadata.py | 36 +++++++++++++++-- magna/io.py | 14 +++++++ 6 files changed, 122 insertions(+), 3 deletions(-) rename magna/{dataset => }/gtdb/__init__.py (100%) create mode 100644 magna/gtdb/enums.py create mode 100644 magna/gtdb/genome.py rename magna/{dataset => }/gtdb/metadata.py (60%) diff --git a/magna/config.py b/magna/config.py index 0ee2e5a..daa0336 100644 --- a/magna/config.py +++ b/magna/config.py @@ -1,4 +1,9 @@ import os +import tempfile from pathlib import Path +# Persistent cache MAGNA_DIR = os.path.join(Path.home(), '.magna') + +# Temporary cache +CACHE_DIR = os.path.join(tempfile.gettempdir(), 'magna') diff --git a/magna/dataset/gtdb/__init__.py b/magna/gtdb/__init__.py similarity index 100% rename from magna/dataset/gtdb/__init__.py rename to magna/gtdb/__init__.py diff --git a/magna/gtdb/enums.py b/magna/gtdb/enums.py new file mode 100644 index 0000000..fb8d217 --- /dev/null +++ b/magna/gtdb/enums.py @@ -0,0 +1,10 @@ +from enum import Enum + + +class GtdbRelease(Enum): + R80 = '80' + R83 = '83' + R86 = '86' + R89 = '89' + R95 = '95' + R202 = '202' diff --git a/magna/gtdb/genome.py b/magna/gtdb/genome.py new file mode 100644 index 0000000..c21d6eb --- /dev/null +++ b/magna/gtdb/genome.py @@ -0,0 +1,60 @@ +import os +from typing import Dict, Tuple + +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord + +from magna.gtdb.enums import GtdbRelease +from magna.io import cache_file + + +class Genome: + + def __init__(self, accession: str, root: str): + self.accession: str = accession + self.root: str = root + + # Generate paths + base = os.path.basename(self.root) + self.cds_path = os.path.join(self.root, f'{base}_cds_from_genomic.fna') + self.fna_path = os.path.join(self.root, f'{base}_genomic.fna') + + def __repr__(self): + return str(self.accession) + + def cds_seqio(self) -> Tuple[SeqRecord, ...]: + # Returns the CDS generated from the FNA + with open(self.cds_path, 'r') as f: + out = tuple(SeqIO.parse(f, 'fasta')) + return out + + def fna_seqio(self) -> Tuple[SeqRecord, ...]: + # Returns the FNA + with open(self.fna_path, 'r') as f: + out = tuple(SeqIO.parse(f, 'fasta')) + return out + + +class GenomeDirs: + + def __init__(self, release: GtdbRelease): + self.release = release + + # Create the paths + srv_path = f'/srv/db/gtdb/genomes/ncbi/release{release.value}/genome_dirs.tsv' + cache_path = cache_file(srv_path, f'genome_dirs_{self.release.value}.tsv') + + # Read the data + self._data = self.read(cache_path) + + @staticmethod + def read(path: str) -> Dict[str, str]: + out = dict() + with open(path, 'r') as f: + for line in f: + short, root, canonical = line.strip().split('\t') + out[short] = root + return out + + def get(self, accession: str) -> Genome: + return Genome(accession=accession, root=self._data[accession]) diff --git a/magna/dataset/gtdb/metadata.py b/magna/gtdb/metadata.py similarity index 60% rename from magna/dataset/gtdb/metadata.py rename to magna/gtdb/metadata.py index 05e8e1c..364a281 100644 --- a/magna/dataset/gtdb/metadata.py +++ b/magna/gtdb/metadata.py @@ -7,7 +7,7 @@ from magna.io import download_file, md5sum, untar -class _GtdbMetadataR95: +class _GtdbMetadata: def __init__(self, source: str, path: str, md5: str): self.source = source @@ -43,7 +43,9 @@ def _download(self): df.to_feather(path=self.path, compression='lz4') -class GtdbMetadataR95Arc(_GtdbMetadataR95): +# ---------------------------------------------------------------------------------------------------------------------- + +class GtdbMetadataR95Arc(_GtdbMetadata): source = 'https://data.gtdb.ecogenomic.org/releases/release95/95.0/ar122_metadata_r95.tar.gz' path = os.path.join(MAGNA_DIR, 'dataset', 'gtdb', 'metadata', 'ar122_metadata_r95.feather') md5 = '110ad5daa2dbed2ee904b10c295da5dc' @@ -52,7 +54,7 @@ def __init__(self): super().__init__(self.source, self.path, self.md5) -class GtdbMetadataR95Bac(_GtdbMetadataR95): +class GtdbMetadataR95Bac(_GtdbMetadata): source = 'https://data.gtdb.ecogenomic.org/releases/release95/95.0/bac120_metadata_r95.tar.gz' path = os.path.join(MAGNA_DIR, 'dataset', 'gtdb', 'metadata', 'bac120_metadata_r95.feather') md5 = '223ada02ffca4d1a2dda6edb9a164dcd' @@ -65,3 +67,31 @@ class GtdbMetadataR95: def __init__(self): self.df = pd.concat([GtdbMetadataR95Arc().df, GtdbMetadataR95Bac().df]) + + +# ---------------------------------------------------------------------------------------------------------------------- + +class GtdbMetadataR202Arc(_GtdbMetadata): + source = 'https://data.gtdb.ecogenomic.org/releases/release202/202.0/ar122_metadata_r202.tar.gz' + path = os.path.join(MAGNA_DIR, 'dataset', 'gtdb', 'metadata', 'ar122_metadata_r202.feather') + md5 = '0607728ae1f56bdb1a7cc24d238185c3' + + def __init__(self): + super().__init__(self.source, self.path, self.md5) + + +class GtdbMetadataR202Bac(_GtdbMetadata): + source = 'https://data.gtdb.ecogenomic.org/releases/release202/202.0/bac120_metadata_r202.tar.gz' + path = os.path.join(MAGNA_DIR, 'dataset', 'gtdb', 'metadata', 'bac120_metadata_r202.feather') + md5 = '68fed11eb688982edb6f4669476c2a10' + + def __init__(self): + super().__init__(self.source, self.path, self.md5) + + +class GtdbMetadataR202: + + def __init__(self): + self.df = pd.concat([GtdbMetadataR202Arc().df, GtdbMetadataR202Bac().df]) + +# ---------------------------------------------------------------------------------------------------------------------- diff --git a/magna/io.py b/magna/io.py index b485d2e..93195ac 100644 --- a/magna/io.py +++ b/magna/io.py @@ -1,4 +1,6 @@ import hashlib +import os +import shutil import tarfile import urllib import urllib.request @@ -6,6 +8,8 @@ from tqdm import tqdm +from magna.config import CACHE_DIR + def untar(file_path, dir_name): """ @@ -52,3 +56,13 @@ def download_file(url: str, path: str, md5: Optional[str] = None): if md5 and md5 != md5sum(path): raise ValueError('Hash mismatch') + + +def cache_file(srv_path: str, local_name: str) -> str: + """Copies a remote file to the local machine.""" + if not os.path.isdir(CACHE_DIR): + os.makedirs(CACHE_DIR) + local_path = os.path.join(CACHE_DIR, local_name) + if not os.path.isfile(local_path): + shutil.copyfile(srv_path, local_path) + return local_path