From e891e84045e30a00037dcfed9a1e818a5aab13d1 Mon Sep 17 00:00:00 2001 From: Aaron Mussig Date: Thu, 10 Mar 2022 13:05:02 +1000 Subject: [PATCH] feat: Refactored code, added CI for docs. BREAKING CHANGE: Re-organised folder structure. --- .../{push-main.yml => pypi-publish.yml} | 23 +--- .github/workflows/semantic-release.yml | 63 +++++++++ .releaserc | 3 +- README.md | 17 ++- docs/Makefile | 20 +++ docs/requirements.txt | 4 + docs/source/_static/css/custom.css | 13 ++ docs/source/changelog.md | 1 + docs/source/conf.py | 109 +++++++++++++++ docs/source/gtdb/enums.rst | 8 ++ docs/source/gtdb/genome.rst | 11 ++ docs/source/gtdb/metadata.rst | 33 +++++ docs/source/gtdb/tree.rst | 26 ++++ docs/source/gunc/index.rst | 24 ++++ docs/source/index.rst | 36 +++++ docs/source/util/accession.rst | 5 + docs/source/util/io.rst | 16 +++ magna/dataset/gunc/__init__.py | 120 ----------------- magna/gtdb/enums.py | 2 + magna/gtdb/genome.py | 26 +++- magna/gtdb/metadata.py | 39 ++++-- magna/gtdb/tree.py | 31 +++-- magna/gunc/__init__.py | 124 ++++++++++++++++++ magna/{dataset => util}/__init__.py | 0 magna/{ => util}/accession.py | 10 +- magna/{ => util}/io.py | 68 ++++++++-- package.json | 18 --- setup.py | 3 +- 28 files changed, 649 insertions(+), 204 deletions(-) rename .github/workflows/{push-main.yml => pypi-publish.yml} (60%) create mode 100644 .github/workflows/semantic-release.yml create mode 100644 docs/Makefile create mode 100644 docs/requirements.txt create mode 100644 docs/source/_static/css/custom.css create mode 120000 docs/source/changelog.md create mode 100644 docs/source/conf.py create mode 100644 docs/source/gtdb/enums.rst create mode 100644 docs/source/gtdb/genome.rst create mode 100644 docs/source/gtdb/metadata.rst create mode 100644 docs/source/gtdb/tree.rst create mode 100644 docs/source/gunc/index.rst create mode 100644 docs/source/index.rst create mode 100644 docs/source/util/accession.rst create mode 100644 docs/source/util/io.rst delete mode 100644 
magna/dataset/gunc/__init__.py rename magna/{dataset => util}/__init__.py (100%) rename magna/{ => util}/accession.py (61%) rename magna/{ => util}/io.py (53%) diff --git a/.github/workflows/push-main.yml b/.github/workflows/pypi-publish.yml similarity index 60% rename from .github/workflows/push-main.yml rename to .github/workflows/pypi-publish.yml index d6ccafd..694df95 100644 --- a/.github/workflows/push-main.yml +++ b/.github/workflows/pypi-publish.yml @@ -1,25 +1,14 @@ -name: Release +name: PyPI publish on: - push: - branches: - - main + release: + types: [ published ] jobs: - release: - name: release + pypi-publish: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - uses: actions/setup-node@v2 - with: - cache: npm - node-version: 16 - - run: npm install - - run: npx semantic-release - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Set up Python + - name: Setup Python uses: actions/setup-python@v2 with: python-version: '3.6' @@ -33,4 +22,4 @@ jobs: TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} run: | python setup.py sdist bdist_wheel - twine upload dist/* \ No newline at end of file + twine upload dist/* diff --git a/.github/workflows/semantic-release.yml b/.github/workflows/semantic-release.yml new file mode 100644 index 0000000..de67e9f --- /dev/null +++ b/.github/workflows/semantic-release.yml @@ -0,0 +1,63 @@ +name: Semantic release +on: + push: + branches: + - main + +jobs: + semantic-release: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-node@v2 + with: + cache: npm + node-version: 16 + - run: npm install + - run: npx semantic-release + env: + GITHUB_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }} + + documentation: + runs-on: ubuntu-latest + needs: ['semantic-release'] + steps: + - uses: actions/setup-python@v2 + with: + python-version: '3.8' + + # Checkout the main branch to /main + - uses: actions/checkout@v2 + with: + path: main + + # Checkout the GitHub pages branch to /gh-pages + - uses: 
actions/checkout@v2 + with: + ref: gh-pages + path: gh-pages + + - name: Install Sphinx requirements + working-directory: ${{ github.workspace }}/main/docs + run: | + sudo apt-get update -y + python -m pip install --upgrade pip + python -m pip install -r requirements.txt + + - name: Build documentation + working-directory: ${{ github.workspace }}/main/docs + run: make html + + - name: Move documentation + run: | + touch main/docs/build/html/.nojekyll + cp -a main/docs/build/html/. gh-pages/ + + - name: Upload documentation + working-directory: ${{ github.workspace }}/gh-pages + run: | + git config --local user.email "action@github.com" + git config --local user.name "GitHub Action" + git add --all + git diff-index --quiet HEAD || git commit -m "docs update" -a + git push diff --git a/.releaserc b/.releaserc index bbea15c..b65e40c 100644 --- a/.releaserc +++ b/.releaserc @@ -20,7 +20,8 @@ [ "@semantic-release/changelog", { - "changelogFile": "CHANGELOG.md" + "changelogFile": "CHANGELOG.md", + "changelogTitle": "# Changelog" } ], [ diff --git a/README.md b/README.md index 07d03b3..780798e 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,17 @@ # magna -The magna CLI package is a collection of commands that I frequently use in bioinformatics and day-to-day life. + +[![PyPI](https://img.shields.io/pypi/v/magna.svg)](https://pypi.python.org/pypi/magna) + +Magna is a collection of bioinformatic datasets and utilities I use in my everyday life. + +This has been written with the intention of personal use, but feel free to use/contribute. + +**Documentation:** [https://aaronmussig.github.io/magna/](https://aaronmussig.github.io/magna/) + + +## Contributing + +This project uses [Semantic Versioning](http://semver.org/) and [Conventional Commits](https://conventionalcommits.org/) +to automatically generate release notes using [Semantic Release](https://semantic-release.gitbook.io/semantic-release/). + +Please ensure that your commits are properly formatted. 
diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d0c3cbf --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..88d11cc --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,4 @@ +sphinx-rtd-theme ~= 1.0.0 +sphinx ~= 4.4.0 +sphinx-autodoc-typehints ~= 1.12.0 +myst-parser ~= 0.16.1 diff --git a/docs/source/_static/css/custom.css b/docs/source/_static/css/custom.css new file mode 100644 index 0000000..f0c8365 --- /dev/null +++ b/docs/source/_static/css/custom.css @@ -0,0 +1,13 @@ +@import 'theme.css'; + +.wy-nav-content { + max-width: 100% !important; +} + +.wy-side-nav-search { + background-color: #2a518f !important; +} + +.rst-content img { + margin-right: 5px; +} \ No newline at end of file diff --git a/docs/source/changelog.md b/docs/source/changelog.md new file mode 120000 index 0000000..699cc9e --- /dev/null +++ b/docs/source/changelog.md @@ -0,0 +1 @@ +../../CHANGELOG.md \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..88773de --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,109 @@ +import os +import sys +from datetime import datetime +import sphinx_rtd_theme + +sys.path.insert(0, os.path.abspath('../..')) +from magna import __author__, 
__version__, __title__, __url__ + +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + + +# The full version, including alpha/beta/rc tags +release = __version__ +project = __title__ +copyright = f'{datetime.now().year}, {__author__}' +author = __author__ + +# The full version, including alpha/beta/rc tags +version = __version__ + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +github_url = __url__ + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. 
+extensions = ['sphinx.ext.autodoc', 'sphinx.ext.napoleon', + 'sphinx_rtd_theme', 'sphinx_autodoc_typehints', + 'myst_parser'] + +# Napoleon settings +napoleon_google_docstring = True +napoleon_numpy_docstring = False +napoleon_include_init_with_doc = False +napoleon_include_private_with_doc = False +napoleon_include_special_with_doc = True +napoleon_use_admonition_for_examples = False +napoleon_use_admonition_for_notes = False +napoleon_use_admonition_for_references = False +napoleon_use_ivar = False +napoleon_use_param = True +napoleon_use_rtype = True +napoleon_preprocess_types = False +napoleon_type_aliases = None +napoleon_attr_annotations = True + +# autodoc_typehints = 'None' +autodoc_typehints_format = 'short' + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. + +html_theme_options = {# 'analytics_id': 'G-XXXXXXXXXX', # Provided by Google in your dashboard + 'analytics_anonymize_ip': False, 'logo_only': False, 'display_version': True, + 'prev_next_buttons_location': 'bottom', 'style_external_links': False, 'vcs_pageview_mode': '', + 'style_nav_header_background': 'white', # Toc options + 'collapse_navigation': True, 'sticky_navigation': True, 'navigation_depth': 4, 'includehidden': True, + 'titles_only': False} + +# +html_theme = 'sphinx_rtd_theme' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = ['_static'] + +html_style = 'css/custom.css' + +html_context = {'display_github': True, 'github_user': 'aaronmussig', 'github_repo': 'magna', + 'github_version': 'master/docs/source/'} + + +source_suffix = { + '.rst': 'restructuredtext', + '.txt': 'markdown', + '.md': 'markdown', +} diff --git a/docs/source/gtdb/enums.rst b/docs/source/gtdb/enums.rst new file mode 100644 index 0000000..d9049cc --- /dev/null +++ b/docs/source/gtdb/enums.rst @@ -0,0 +1,8 @@ +***** +Enums +***** + +.. autoclass:: magna.gtdb.enums.GtdbRelease + :members: + :undoc-members: + :member-order: bysource diff --git a/docs/source/gtdb/genome.rst b/docs/source/gtdb/genome.rst new file mode 100644 index 0000000..a03e1bc --- /dev/null +++ b/docs/source/gtdb/genome.rst @@ -0,0 +1,11 @@ +****** +Genome +****** + +.. autoclass:: magna.gtdb.genome.Genome + :members: + + +.. autoclass:: magna.gtdb.genome.GenomeDirs + :members: + :special-members: __init__ diff --git a/docs/source/gtdb/metadata.rst b/docs/source/gtdb/metadata.rst new file mode 100644 index 0000000..44150de --- /dev/null +++ b/docs/source/gtdb/metadata.rst @@ -0,0 +1,33 @@ +******** +Metadata +******** + +.. autoclass:: magna.gtdb.metadata.GtdbMetadata + :members: + + +Release 95 +---------- + +.. autoclass:: magna.gtdb.metadata.GtdbMetadataR95 + :members: + +.. autoclass:: magna.gtdb.metadata.GtdbMetadataR95Arc + :show-inheritance: + +.. autoclass:: magna.gtdb.metadata.GtdbMetadataR95Bac + :show-inheritance: + + + +Release 202 +----------- + +.. autoclass:: magna.gtdb.metadata.GtdbMetadataR202 + :members: + +.. autoclass:: magna.gtdb.metadata.GtdbMetadataR202Arc + :show-inheritance: + +.. autoclass:: magna.gtdb.metadata.GtdbMetadataR202Bac + :show-inheritance: diff --git a/docs/source/gtdb/tree.rst b/docs/source/gtdb/tree.rst new file mode 100644 index 0000000..1537803 --- /dev/null +++ b/docs/source/gtdb/tree.rst @@ -0,0 +1,26 @@ +**** +Tree +**** + +.. 
autoclass:: magna.gtdb.tree.GtdbTree + :members: + + +Release 95 +---------- + +.. autoclass:: magna.gtdb.tree.GtdbTreeR95Arc + :show-inheritance: + +.. autoclass:: magna.gtdb.tree.GtdbTreeR95Bac + :show-inheritance: + + +Release 202 +----------- + +.. autoclass:: magna.gtdb.tree.GtdbTreeR202Arc + :show-inheritance: + +.. autoclass:: magna.gtdb.tree.GtdbTreeR202Bac + :show-inheritance: diff --git a/docs/source/gunc/index.rst b/docs/source/gunc/index.rst new file mode 100644 index 0000000..054d283 --- /dev/null +++ b/docs/source/gunc/index.rst @@ -0,0 +1,24 @@ +**** +GUNC +**** + +GTDB R95 reference DB +--------------------- + +Note: The following methods have source data that is not publicly available: + +.. autofunction:: magna.gunc.gunc_max_css_scores_gtdb_r95 + +.. autofunction:: magna.gunc.gunc_contig_assignment_gtdb_r95 + +.. autofunction:: magna.gunc.gunc_all_levels_gtdb_r95 + + +Progenes reference DB +--------------------- + +.. autoclass:: magna.gunc.GuncMaxCssScores + :members: + +.. autoclass:: magna.gunc.GuncAllScores + :members: diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..25273b0 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,36 @@ +***** +magna +***** + +.. image:: https://img.shields.io/pypi/v/magna.svg + :alt: PyPI version + +cd ~/git/magna/docs && make clean && make html && cd ~/git/magna/docs/build/html && http-server + + +.. toctree:: + :caption: GTDB + + gtdb/tree + gtdb/metadata + gtdb/genome + gtdb/enums + +.. toctree:: + :caption: GUNC + + gunc/index + + +.. toctree:: + :caption: Utility + + util/io + util/accession + + +.. toctree:: + :caption: Changelog + :maxdepth: 1 + + changelog diff --git a/docs/source/util/accession.rst b/docs/source/util/accession.rst new file mode 100644 index 0000000..c01b4e5 --- /dev/null +++ b/docs/source/util/accession.rst @@ -0,0 +1,5 @@ +********* +Accession +********* + +.. 
autofunction:: magna.util.accession.canonical_gid diff --git a/docs/source/util/io.rst b/docs/source/util/io.rst new file mode 100644 index 0000000..be17946 --- /dev/null +++ b/docs/source/util/io.rst @@ -0,0 +1,16 @@ +************ +Input/Output +************ + +.. autofunction:: magna.util.io.untar + + +.. autofunction:: magna.util.io.md5sum + + +.. autofunction:: magna.util.io.download_file + + +.. autofunction:: magna.util.io.cache_file + + diff --git a/magna/dataset/gunc/__init__.py b/magna/dataset/gunc/__init__.py deleted file mode 100644 index 1e853e0..0000000 --- a/magna/dataset/gunc/__init__.py +++ /dev/null @@ -1,120 +0,0 @@ -import os -import tempfile - -import numpy as np -import pandas as pd - -from magna.config import MAGNA_DIR -from magna.io import download_file - - -class GuncMaxCssScores: - source = 'https://swifter.embl.de/~fullam/gunc/paper_supplementary_files/All_Datasets.GUNC.scores.maxCSS_level.tsv' - path = os.path.join(MAGNA_DIR, 'dataset', 'gunc', 'All_Datasets.GUNC.scores.maxCSS_level.feather') - md5 = 'dd91aa177b9112c361b9503e132f1c06' - - def __init__(self): - if not os.path.isfile(self.path): - self._download() - self.df = self._read() - - def _read(self): - print('Note: Only the RefSeq and GenBank results are used.') - return pd.read_feather(self.path) - - @staticmethod - def _read_tsv(path): - dtype = { - 'genome': np.object, - 'n_genes_called': np.uintc, - 'n_genes_mapped': np.uintc, - 'n_contigs': np.uintc, - 'taxonomic_level': np.object, - 'proportion_genes_retained_in_major_clades': np.float16, - 'genes_retained_index': np.float16, - 'clade_separation_score': np.float16, - 'contamination_portion': np.float16, - 'n_effective_surplus_clades': np.float16, - 'mean_hit_identity': np.float16, - 'reference_representation_score': np.float16, - 'pass.GUNC': np.object, - 'study': np.object, - 'CheckM_completeness': np.object, - 'CheckM_contamination': np.object, - } - rows = list() - allowed_studies = frozenset({'GenBank', 'RefSeq'}) - with 
open(path, 'r') as f: - header = {k: i for i, k in enumerate( - f.readline().strip().split('\t'))} - study_idx = header['study'] - for line in f.readlines(): - cols = line.strip().split('\t') - if cols[study_idx] in allowed_studies: - rows.append(cols) - return pd.DataFrame(rows, columns=dtype) - - def _download(self): - with tempfile.TemporaryDirectory() as tmpdir: - # Download the file - tmp_path = os.path.join(tmpdir, 'download.tsv') - download_file(self.source, tmp_path, self.md5) - - df = self._read_tsv(tmp_path) - os.makedirs(os.path.dirname(self.path), exist_ok=True) - df.to_feather(path=self.path, compression='lz4') - - -class GuncAllScores: - source = 'https://swifter.embl.de/~fullam/gunc/paper_supplementary_files/All_Datasets.GUNC.scores.all_levels.specI2species.tsv' - path = os.path.join(MAGNA_DIR, 'dataset', 'gunc', 'All_Datasets.GUNC.scores.all_levels.specI2species.feather') - md5 = 'a54e3719221a42a5f96b267412827d27' - - def __init__(self): - if not os.path.isfile(self.path): - self._download() - self.df = self._read() - - def _read(self): - print('Note: Only the RefSeq and GenBank results are used.') - return pd.read_feather(self.path) - - @staticmethod - def _read_tsv(path): - dtype = { - 'genome': np.object, - 'n_genes_called': np.uintc, - 'n_genes_mapped': np.uintc, - 'n_contigs': np.uintc, - 'taxonomic_level': np.object, - 'proportion_genes_retained_in_major_clades': np.float16, - 'genes_retained_index': np.float16, - 'clade_separation_score': np.float16, - 'contamination_portion': np.float16, - 'n_effective_surplus_clades': np.float16, - 'mean_hit_identity': np.float16, - 'reference_representation_score': np.float16, - 'pass.GUNC': np.object, - 'study': np.object, - } - rows = list() - allowed_studies = frozenset({'GenBank', 'RefSeq'}) - with open(path, 'r') as f: - header = {k: i for i, k in enumerate( - f.readline().strip().split('\t'))} - study_idx = header['study'] - for line in f.readlines(): - cols = line.strip().split('\t') - if 
cols[study_idx] in allowed_studies: - rows.append(cols) - return pd.DataFrame(rows, columns=dtype) - - def _download(self): - with tempfile.TemporaryDirectory() as tmpdir: - # Download the file - tmp_path = os.path.join(tmpdir, 'download.tsv') - download_file(self.source, tmp_path, self.md5) - - df = self._read_tsv(tmp_path) - os.makedirs(os.path.dirname(self.path), exist_ok=True) - df.to_feather(path=self.path, compression='lz4') diff --git a/magna/gtdb/enums.py b/magna/gtdb/enums.py index fb8d217..c96cba3 100644 --- a/magna/gtdb/enums.py +++ b/magna/gtdb/enums.py @@ -2,6 +2,8 @@ class GtdbRelease(Enum): + """All available GTDB releases.""" + R80 = '80' R83 = '83' R86 = '86' diff --git a/magna/gtdb/genome.py b/magna/gtdb/genome.py index c21d6eb..2bfe2f1 100644 --- a/magna/gtdb/genome.py +++ b/magna/gtdb/genome.py @@ -5,39 +5,50 @@ from Bio.SeqRecord import SeqRecord from magna.gtdb.enums import GtdbRelease -from magna.io import cache_file +from magna.util.io import cache_file class Genome: + """A wrapper to a GTDB genome.""" def __init__(self, accession: str, root: str): + #: The short accession of the genome. self.accession: str = accession + #: The root directory where this genome is stored. self.root: str = root # Generate paths base = os.path.basename(self.root) - self.cds_path = os.path.join(self.root, f'{base}_cds_from_genomic.fna') - self.fna_path = os.path.join(self.root, f'{base}_genomic.fna') + #: The path to the CDS file. + self.cds_path: str = os.path.join(self.root, f'{base}_cds_from_genomic.fna') + #: The path to the FNA file. 
+ self.fna_path: str = os.path.join(self.root, f'{base}_genomic.fna') def __repr__(self): return str(self.accession) def cds_seqio(self) -> Tuple[SeqRecord, ...]: - # Returns the CDS generated from the FNA + """Read and return the CDS file as a SeqIO object.""" with open(self.cds_path, 'r') as f: out = tuple(SeqIO.parse(f, 'fasta')) return out def fna_seqio(self) -> Tuple[SeqRecord, ...]: - # Returns the FNA + """Read and return the FNA file as a SeqIO object.""" with open(self.fna_path, 'r') as f: out = tuple(SeqIO.parse(f, 'fasta')) return out class GenomeDirs: + """An interface to the :obj:`GtdbRelease` accession to :obj:`Genome` mapping.""" def __init__(self, release: GtdbRelease): + """Initialise the GenomeDirs class for a given release. + + Args: + release: The release of GTDB to use. + """ self.release = release # Create the paths @@ -57,4 +68,9 @@ def read(path: str) -> Dict[str, str]: return out def get(self, accession: str) -> Genome: + """Return the :obj:`Genome` for the given accession. + + Args: + accession: The short accession of the genome (e.g. ``GCA_123456789.1``). + """ return Genome(accession=accession, root=self._data[accession]) diff --git a/magna/gtdb/metadata.py b/magna/gtdb/metadata.py index 364a281..871d9fd 100644 --- a/magna/gtdb/metadata.py +++ b/magna/gtdb/metadata.py @@ -4,18 +4,22 @@ import pandas as pd from magna.config import MAGNA_DIR -from magna.io import download_file, md5sum, untar +from magna.util.io import download_file, md5sum, untar -class _GtdbMetadata: +class GtdbMetadata: def __init__(self, source: str, path: str, md5: str): - self.source = source - self.path = path - self.md5 = md5 + #: The source URL + self.source: str = source + #: The path to the metadata file. + self.path: str = path + #: The MD5 checksum of the downloaded file. + self.md5: str = md5 if not os.path.isfile(self.path): self._download() - self.df = self._read() + #: The metadata as a pandas DataFrame. 
+ self.df: pd.DataFrame = self._read() def _read(self): df = pd.read_feather(self.path) @@ -45,7 +49,9 @@ def _download(self): # ---------------------------------------------------------------------------------------------------------------------- -class GtdbMetadataR95Arc(_GtdbMetadata): +class GtdbMetadataR95Arc(GtdbMetadata): + """The archaeal metadata (release 95).""" + source = 'https://data.gtdb.ecogenomic.org/releases/release95/95.0/ar122_metadata_r95.tar.gz' path = os.path.join(MAGNA_DIR, 'dataset', 'gtdb', 'metadata', 'ar122_metadata_r95.feather') md5 = '110ad5daa2dbed2ee904b10c295da5dc' @@ -54,7 +60,9 @@ def __init__(self): super().__init__(self.source, self.path, self.md5) -class GtdbMetadataR95Bac(_GtdbMetadata): +class GtdbMetadataR95Bac(GtdbMetadata): + """The bacterial metadata (release 95).""" + source = 'https://data.gtdb.ecogenomic.org/releases/release95/95.0/bac120_metadata_r95.tar.gz' path = os.path.join(MAGNA_DIR, 'dataset', 'gtdb', 'metadata', 'bac120_metadata_r95.feather') md5 = '223ada02ffca4d1a2dda6edb9a164dcd' @@ -64,14 +72,18 @@ def __init__(self): class GtdbMetadataR95: + """The combined archaeal and bacterial metadata (release 95).""" def __init__(self): - self.df = pd.concat([GtdbMetadataR95Arc().df, GtdbMetadataR95Bac().df]) + #: The combined dataframe. 
+ self.df: pd.DataFrame = pd.concat([GtdbMetadataR95Arc().df, GtdbMetadataR95Bac().df]) # ---------------------------------------------------------------------------------------------------------------------- -class GtdbMetadataR202Arc(_GtdbMetadata): +class GtdbMetadataR202Arc(GtdbMetadata): + """The archaeal metadata (release 202).""" + source = 'https://data.gtdb.ecogenomic.org/releases/release202/202.0/ar122_metadata_r202.tar.gz' path = os.path.join(MAGNA_DIR, 'dataset', 'gtdb', 'metadata', 'ar122_metadata_r202.feather') md5 = '0607728ae1f56bdb1a7cc24d238185c3' @@ -80,7 +92,8 @@ def __init__(self): super().__init__(self.source, self.path, self.md5) -class GtdbMetadataR202Bac(_GtdbMetadata): +class GtdbMetadataR202Bac(GtdbMetadata): + """The bacterial metadata (release 202).""" source = 'https://data.gtdb.ecogenomic.org/releases/release202/202.0/bac120_metadata_r202.tar.gz' path = os.path.join(MAGNA_DIR, 'dataset', 'gtdb', 'metadata', 'bac120_metadata_r202.feather') md5 = '68fed11eb688982edb6f4669476c2a10' @@ -90,8 +103,10 @@ def __init__(self): class GtdbMetadataR202: + """The combined archaeal and bacterial metadata (release 202).""" def __init__(self): - self.df = pd.concat([GtdbMetadataR202Arc().df, GtdbMetadataR202Bac().df]) + #: The combined dataframe. 
+ self.df: pd.DataFrame = pd.concat([GtdbMetadataR202Arc().df, GtdbMetadataR202Bac().df]) # ---------------------------------------------------------------------------------------------------------------------- diff --git a/magna/gtdb/tree.py b/magna/gtdb/tree.py index 40b6f04..c1c87e2 100644 --- a/magna/gtdb/tree.py +++ b/magna/gtdb/tree.py @@ -5,18 +5,25 @@ import dendropy from magna.config import MAGNA_DIR -from magna.io import download_file, md5sum +from magna.util.io import download_file, md5sum -class _GtdbTree: +class GtdbTree: + """The base class that all GTDB tree objects inherit.""" + + __slots__ = ('source', 'path', 'md5', 'tree') def __init__(self, source: str, path: str, md5: str): - self.source = source - self.path = path - self.md5 = md5 + #: The source URL. + self.source: str = source + #: The path to the downloaded file. + self.path: str = path + #: The expected MD5 checksum of the file. + self.md5: str = md5 if not os.path.isfile(self.path): self._download() - self.tree = self._read() + #: The dendropy tree. 
+ self.tree: dendropy.Tree = self._read() def _read(self): return dendropy.Tree.get(path=self.path, schema='newick', preserve_underscores=True) @@ -35,7 +42,8 @@ def _download(self): shutil.copyfile(tmp_path, self.path) -class GtdbTreeR95Arc(_GtdbTree): +class GtdbTreeR95Arc(GtdbTree): + """The GTDB archaeal tree (release 95).""" source = 'https://data.gtdb.ecogenomic.org/releases/release95/95.0/ar122_r95.tree' path = os.path.join(MAGNA_DIR, 'dataset', 'gtdb', 'tree', 'ar122_r95.tree') md5 = '2f5e072b9095617e7b5cff09653f8bec' @@ -44,7 +52,8 @@ def __init__(self): super().__init__(self.source, self.path, self.md5) -class GtdbTreeR95Bac(_GtdbTree): +class GtdbTreeR95Bac(GtdbTree): + """The GTDB bacterial tree (release 95).""" source = 'https://data.gtdb.ecogenomic.org/releases/release95/95.0/bac120_r95.tree' path = os.path.join(MAGNA_DIR, 'dataset', 'gtdb', 'tree', 'bac120_r95.tree') md5 = 'c896d0eece01b281e09bd38534cd072e' @@ -53,7 +62,8 @@ def __init__(self): super().__init__(self.source, self.path, self.md5) -class GtdbTreeR202Arc(_GtdbTree): +class GtdbTreeR202Arc(GtdbTree): + """The GTDB archaeal tree (release 202).""" source = 'https://data.gtdb.ecogenomic.org/releases/release202/202.0/ar122_r202.tree' path = os.path.join(MAGNA_DIR, 'dataset', 'gtdb', 'tree', 'ar122_r202.tree') md5 = '5b2dd87b0836fd63a223a556eae2906d' @@ -62,7 +72,8 @@ def __init__(self): super().__init__(self.source, self.path, self.md5) -class GtdbTreeR202Bac(_GtdbTree): +class GtdbTreeR202Bac(GtdbTree): + """The GTDB bacterial tree (release 202).""" source = 'https://data.gtdb.ecogenomic.org/releases/release202/202.0/bac120_r202.tree' path = os.path.join(MAGNA_DIR, 'dataset', 'gtdb', 'tree', 'bac120_r202.tree') md5 = 'aebfc092ff6f2d81ef1226da6f1477c9' diff --git a/magna/gunc/__init__.py b/magna/gunc/__init__.py index b9e014d..0146392 100644 --- a/magna/gunc/__init__.py +++ b/magna/gunc/__init__.py @@ -1,11 +1,15 @@ import os +import tempfile +import numpy as np import pandas as pd from 
magna.config import MAGNA_DIR +from magna.util.io import download_file def gunc_max_css_scores_gtdb_r95() -> pd.DataFrame: + """Return the max clade separation score (CSS) for the R95 GTDB.""" path = os.path.join(MAGNA_DIR, 'dataset', 'gunc', 'GUNC.gtdb_95.maxCSS_level.feather') if not os.path.isfile(path): raise IOError(f'{path} does not exist.') @@ -13,6 +17,7 @@ def gunc_max_css_scores_gtdb_r95() -> pd.DataFrame: def gunc_contig_assignment_gtdb_r95() -> pd.DataFrame: + """Return the contig assignment for the R95 GTDB.""" path = os.path.join(MAGNA_DIR, 'dataset', 'gunc', 'GUNC.gtdb_95.contig_assignments.feather') if not os.path.isfile(path): raise IOError(f'{path} does not exist.') @@ -20,7 +25,126 @@ def gunc_contig_assignment_gtdb_r95() -> pd.DataFrame: def gunc_all_levels_gtdb_r95() -> pd.DataFrame: + """Return GUNC output at all levels for the R95 GTDB.""" path = os.path.join(MAGNA_DIR, 'dataset', 'gunc', 'gtdb_95.all_levels.tsv') if not os.path.isfile(path): raise IOError(f'{path} does not exist.') return pd.read_feather(path) + + +class GuncMaxCssScores: + """Return the max clade separation score (CSS) for the R95 GTDB (progenes db).""" + + source = 'https://swifter.embl.de/~fullam/gunc/paper_supplementary_files/All_Datasets.GUNC.scores.maxCSS_level.tsv' + path = os.path.join(MAGNA_DIR, 'dataset', 'gunc', 'All_Datasets.GUNC.scores.maxCSS_level.feather') + md5 = 'dd91aa177b9112c361b9503e132f1c06' + + def __init__(self): + if not os.path.isfile(self.path): + self._download() + #: The dataframe. 
+ self.df: pd.DataFrame = self._read() + + def _read(self): + print('Note: Only the RefSeq and GenBank results are used.') + return pd.read_feather(self.path) + + @staticmethod + def _read_tsv(path): + dtype = { + 'genome': np.object, + 'n_genes_called': np.uintc, + 'n_genes_mapped': np.uintc, + 'n_contigs': np.uintc, + 'taxonomic_level': np.object, + 'proportion_genes_retained_in_major_clades': np.float16, + 'genes_retained_index': np.float16, + 'clade_separation_score': np.float16, + 'contamination_portion': np.float16, + 'n_effective_surplus_clades': np.float16, + 'mean_hit_identity': np.float16, + 'reference_representation_score': np.float16, + 'pass.GUNC': np.object, + 'study': np.object, + 'CheckM_completeness': np.object, + 'CheckM_contamination': np.object, + } + rows = list() + allowed_studies = frozenset({'GenBank', 'RefSeq'}) + with open(path, 'r') as f: + header = {k: i for i, k in enumerate( + f.readline().strip().split('\t'))} + study_idx = header['study'] + for line in f.readlines(): + cols = line.strip().split('\t') + if cols[study_idx] in allowed_studies: + rows.append(cols) + return pd.DataFrame(rows, columns=dtype) + + def _download(self): + with tempfile.TemporaryDirectory() as tmpdir: + # Download the file + tmp_path = os.path.join(tmpdir, 'download.tsv') + download_file(self.source, tmp_path, self.md5) + + df = self._read_tsv(tmp_path) + os.makedirs(os.path.dirname(self.path), exist_ok=True) + df.to_feather(path=self.path, compression='lz4') + + +class GuncAllScores: + """Return GUNC output at all levels for the R95 GTDB (progenes db).""" + + source = 'https://swifter.embl.de/~fullam/gunc/paper_supplementary_files/All_Datasets.GUNC.scores.all_levels.specI2species.tsv' + path = os.path.join(MAGNA_DIR, 'dataset', 'gunc', 'All_Datasets.GUNC.scores.all_levels.specI2species.feather') + md5 = 'a54e3719221a42a5f96b267412827d27' + + def __init__(self): + if not os.path.isfile(self.path): + self._download() + #: The dataframe. 
+ self.df: pd.DataFrame = self._read() + + def _read(self): + print('Note: Only the RefSeq and GenBank results are used.') + return pd.read_feather(self.path) + + @staticmethod + def _read_tsv(path): + dtype = { + 'genome': np.object, + 'n_genes_called': np.uintc, + 'n_genes_mapped': np.uintc, + 'n_contigs': np.uintc, + 'taxonomic_level': np.object, + 'proportion_genes_retained_in_major_clades': np.float16, + 'genes_retained_index': np.float16, + 'clade_separation_score': np.float16, + 'contamination_portion': np.float16, + 'n_effective_surplus_clades': np.float16, + 'mean_hit_identity': np.float16, + 'reference_representation_score': np.float16, + 'pass.GUNC': np.object, + 'study': np.object, + } + rows = list() + allowed_studies = frozenset({'GenBank', 'RefSeq'}) + with open(path, 'r') as f: + header = {k: i for i, k in enumerate( + f.readline().strip().split('\t'))} + study_idx = header['study'] + for line in f.readlines(): + cols = line.strip().split('\t') + if cols[study_idx] in allowed_studies: + rows.append(cols) + return pd.DataFrame(rows, columns=dtype) + + def _download(self): + with tempfile.TemporaryDirectory() as tmpdir: + # Download the file + tmp_path = os.path.join(tmpdir, 'download.tsv') + download_file(self.source, tmp_path, self.md5) + + df = self._read_tsv(tmp_path) + os.makedirs(os.path.dirname(self.path), exist_ok=True) + df.to_feather(path=self.path, compression='lz4') diff --git a/magna/dataset/__init__.py b/magna/util/__init__.py similarity index 100% rename from magna/dataset/__init__.py rename to magna/util/__init__.py diff --git a/magna/accession.py b/magna/util/accession.py similarity index 61% rename from magna/accession.py rename to magna/util/accession.py index 3492cbd..5186c0f 100644 --- a/magna/accession.py +++ b/magna/util/accession.py @@ -1,12 +1,12 @@ def canonical_gid(gid: str) -> str: """Get canonical form of NCBI genome accession. + Args: + gid: The NCBI genome accession. 
+ Example: - G005435135 -> G005435135 - GCF_005435135.1 -> G005435135 - GCF_005435135.1_ASM543513v1_genomic -> G005435135 - RS_GCF_005435135.1 -> G005435135 - GB_GCA_005435135.1 -> G005435135 + >>> canonical_gid('GCF_005435135.1_ASM543513v1_genomic') + 'G005435135' """ if gid.startswith('U'): diff --git a/magna/io.py b/magna/util/io.py similarity index 53% rename from magna/io.py rename to magna/util/io.py index 93195ac..af3cc10 100644 --- a/magna/io.py +++ b/magna/util/io.py @@ -11,15 +11,6 @@ from magna.config import CACHE_DIR -def untar(file_path, dir_name): - """ - Extracts the contents of the tar file at file_path into the directory - dest_path. - """ - with tarfile.open(file_path) as tar: - tar.extractall(dir_name) - - class TqdmUpTo(tqdm): """Provides `update_to(n)` which uses `tqdm.update(delta_n)`.""" @@ -37,7 +28,30 @@ def update_to(self, b=1, bsize=1, tsize=None): return self.update(b * bsize - self.n) # also sets self.n = b * bsize +def untar(file_path: str, dir_name: str): + """Extracts the contents of the tar file into the target directory. + + Args: + file_path: The path to the tar file. + dir_name: The directory to extract the tar file into. + + Examples: + Extract the contents of the tar file at ``/tmp/data.tar.gz`` into the + directory ``/tmp/data``. + + >>> untar('/tmp/data.tar.gz', '/tmp/data') + """ + os.makedirs(dir_name, exist_ok=True) + with tarfile.open(file_path) as tar: + tar.extractall(dir_name) + + def md5sum(path: str) -> str: + """Returns the md5 hash of a file. + + Args: + path: The path to the file. + """ block_size = 65536 hasher = hashlib.md5() with open(path, 'rb') as f: @@ -49,19 +63,45 @@ def md5sum(path: str) -> str: def download_file(url: str, path: str, md5: Optional[str] = None): - """Downloads a file to disk with tqdm progress bar.""" + """Downloads a file to disk, optionally validating the md5 hash. + + Args: + url: The url to download from. + path: The path to save the file to. + md5: The expected md5 hash of the file. 
+ + Raises: + IOError: If the md5 hash doesn't match. + + Examples: + Download the file at ``https://www.example.com/data.csv`` to ``/tmp/data.csv``. + + >>> download_file('https://www.example.com/data.csv', '/tmp/data.csv') + """ with TqdmUpTo(unit='B', unit_scale=True, unit_divisor=1024, miniters=1, desc=url.split('/')[-1]) as t: urllib.request.urlretrieve(url, filename=path, reporthook=t.update_to, data=None) t.total = t.n if md5 and md5 != md5sum(path): - raise ValueError('Hash mismatch') + raise IOError('Hash mismatch') def cache_file(srv_path: str, local_name: str) -> str: - """Copies a remote file to the local machine.""" - if not os.path.isdir(CACHE_DIR): - os.makedirs(CACHE_DIR) + """Copies a file to the magna cache (doesn't auto-remove). + + Args: + srv_path: The remote path of the file. + local_name: The key to cache this file with. + + Returns: + The path to the cached file. + + Examples: + Cache the file at ``/srv/data.csv`` as ``data.csv``. + + >>> cache_file('/srv/data.csv', 'data.csv') + """ + os.makedirs(CACHE_DIR, exist_ok=True) local_path = os.path.join(CACHE_DIR, local_name) if not os.path.isfile(local_path): shutil.copyfile(srv_path, local_path) diff --git a/package.json b/package.json index 45395d0..bb02ecf 100644 --- a/package.json +++ b/package.json @@ -1,23 +1,5 @@ { "name": "magna", - "version": "0.0.1", - "description": "Magna is a collection of commands that I frequently use in bioinformatics and day-to-day life.", - "main": "index.js", - "scripts": { - "test": "echo \"Error: no test specified\" && exit 1" - }, - "repository": { - "type": "git", - "url": "git+https://github.com/aaronmussig/magna.git" - }, - "author": "", - "license": "ISC", - "bugs": { - "url": "https://github.com/aaronmussig/magna/issues" - }, - "homepage": "https://github.com/aaronmussig/magna#readme", - "dependencies": { - }, "devDependencies": { "@semantic-release/github": "^8.0.2", "@semantic-release/changelog": "^6.0.1", diff --git a/setup.py b/setup.py index 
9e231dd..92ffb93 100644 --- a/setup.py +++ b/setup.py @@ -43,6 +43,7 @@ def readme(): ], packages=find_packages(), include_package_data=True, - install_requires=['tqdm', 'pandas>=1.1.0', 'pyarrow', 'numpy', 'dendropy', 'biopython'], + install_requires=['tqdm', 'pandas>=1.1.0', 'pyarrow', 'numpy', + 'dendropy', 'biopython'], python_requires='>=3.6', )