diff --git a/.github/workflows/build-publish.yml b/.github/workflows/build-publish.yml index fbaa500..9dd1297 100644 --- a/.github/workflows/build-publish.yml +++ b/.github/workflows/build-publish.yml @@ -15,7 +15,7 @@ jobs: - uses: actions/setup-node@v4 with: cache: npm - node-version: lts + node-version: lts/Hydrogen - run: npm install - run: npx semantic-release --dry-run id: semantic_release @@ -25,7 +25,7 @@ jobs: with: name: semantic-release path: | - betula/__init__.py + gtdb_precurate/__init__.py CHANGELOG.md retention-days: 1 outputs: @@ -72,7 +72,7 @@ jobs: - uses: actions/setup-node@v4 with: cache: npm - node-version: lts + node-version: lts/Hydrogen - run: npm install - run: npx semantic-release env: diff --git a/.releaserc b/.releaserc index ecf4147..aa376a4 100644 --- a/.releaserc +++ b/.releaserc @@ -9,7 +9,7 @@ [ "@semantic-release/exec", { - "verifyReleaseCmd": "sed -i \"s/.*__version__.*/__version__ = '${nextRelease.version}'/g\" betula/__init__.py && echo version=${nextRelease.version} >> $GITHUB_OUTPUT" + "verifyReleaseCmd": "sed -i \"s/.*__version__.*/__version__ = '${nextRelease.version}'/g\" gtdb_precurate/__init__.py && echo version=${nextRelease.version} >> $GITHUB_OUTPUT" } ], [ @@ -23,7 +23,7 @@ { "assets": [ "CHANGELOG.md", - "betula/__init__.py" + "gtdb_precurate/__init__.py" ], "message": "chore(release): ${nextRelease.version} [skip ci]\n\n${nextRelease.notes}" } diff --git a/README.md b/README.md index b12e820..9e7a5ac 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,30 @@ -# betula -Automatic pre-curation of GTDB trees. +# GTDB Precurate + +[![PyPI](https://img.shields.io/pypi/v/gtdb_precurate.svg)](https://pypi.python.org/pypi/gtdb_precurate) + +`gtdb_precurate` is an internal tool that provides automatic pre-curation of GTDB trees. 
+ +## Installation + +gtdb_precurate is available on PyPI and can be installed with pip: + +```bash +pip install gtdb_precurate +``` + +## Usage + +After a successful install, the `gtdb_precurate` command should be available. + +The following positional arguments are required: + +* `metadata` - This is the path to the metadata file; it should contain a header as the first line. + The only requirement is that it has the following columns: `formatted_accession` and `ncbi_wgs_formatted`. +* `red_dict` - This is the path to the RED dictionary output by PhyloRank. +* `red_decorated_tree` - This is the path to the scaled RED decorated tree output by PhyloRank. +* `out_directory` - This is the path to the directory where the output files will be written. + +The following optional arguments are available: + +* `--min-bootstrap` - This is the minimum bootstrap value for a node to be considered supported. Default: 95.0. +* `--debug` - This enables debug logging. Default: False. diff --git a/betula/__init__.py b/betula/__init__.py deleted file mode 100644 index 1f356cc..0000000 --- a/betula/__init__.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = '1.0.0' diff --git a/gtdb_precurate/__init__.py b/gtdb_precurate/__init__.py new file mode 100644 index 0000000..a0bee05 --- /dev/null +++ b/gtdb_precurate/__init__.py @@ -0,0 +1,15 @@ +__author__ = 'Aaron Mussig' +__author_email__ = 'aaronmussig@gmail.com' +__copyright__ = 'Copyright 2023' +__credits__ = ['Aaron Mussig'] +__description__ = 'Automatic pre-curation of GTDB trees.' 
+__email__ = 'aaronmussig@gmail.com' +__license__ = 'GPL3' +__maintainer__ = 'Aaron Mussig' +__maintainer_email__ = 'aaronmussig@gmail.com' +__name__ = 'gtdb_precurate' +__python_requires__ = '>=3.8' +__status__ = 'Production' +__title__ = 'GTDB Precurate' +__url__ = 'https://github.com/Ecogenomics/gtdb_precurate' +__version__ = '1.0.0' diff --git a/betula/__main__.py b/gtdb_precurate/__main__.py similarity index 76% rename from betula/__main__.py rename to gtdb_precurate/__main__.py index 10ac985..09f7451 100644 --- a/betula/__main__.py +++ b/gtdb_precurate/__main__.py @@ -3,13 +3,13 @@ import typer -from betula import __version__ -from betula.method.create_denovo import create_denovo_clusters -from betula.model.metadata import MetadataFile -from betula.model.ranks import RANKS -from betula.model.red_dict import RedDict -from betula.model.tree import Tree -from betula.util.logger import init_logger +from gtdb_precurate import __version__ +from gtdb_precurate.method.create_denovo import create_denovo_clusters +from gtdb_precurate.model.metadata import MetadataFile +from gtdb_precurate.model.ranks import RANKS +from gtdb_precurate.model.red_dict import RedDict +from gtdb_precurate.model.tree import Tree +from gtdb_precurate.util.logger import init_logger def main( @@ -25,14 +25,14 @@ def main( # Initialise the logger log = init_logger(out_directory, debug) - log.info(f'betula v{__version__}') - log.info(f'betula {" ".join(sys.argv[1:])}') + log.info(f'gtdb_precurate v{__version__}') + log.info(f'gtdb_precurate {" ".join(sys.argv[1:])}') # Create the output paths if 'red_decorated' in red_decorated_tree.name: - tree_path_out = out_directory / red_decorated_tree.name.replace('red_decorated', 'betula') + tree_path_out = out_directory / red_decorated_tree.name.replace('red_decorated', 'gtdb_precurate') else: - tree_path_out = out_directory / f'{red_decorated_tree.stem}.betula{red_decorated_tree.suffix}' + tree_path_out = out_directory / 
f'{red_decorated_tree.stem}.gtdb_precurate{red_decorated_tree.suffix}' report_path_out = out_directory / f'{tree_path_out.stem}_report.tsv' # Read the RED dictionary diff --git a/betula/method/__init__.py b/gtdb_precurate/method/__init__.py similarity index 100% rename from betula/method/__init__.py rename to gtdb_precurate/method/__init__.py diff --git a/betula/method/create_denovo.py b/gtdb_precurate/method/create_denovo.py similarity index 94% rename from betula/method/create_denovo.py rename to gtdb_precurate/method/create_denovo.py index 4d8af2d..eff126e 100644 --- a/betula/method/create_denovo.py +++ b/gtdb_precurate/method/create_denovo.py @@ -1,11 +1,11 @@ import logging from collections import defaultdict -from betula.model.metadata import MetadataFile -from betula.model.node_label import NodeLabel -from betula.model.ranks import RANKS -from betula.model.red_dict import RedDict -from betula.model.tree import Tree +from gtdb_precurate.model.metadata import MetadataFile +from gtdb_precurate.model.node_label import NodeLabel +from gtdb_precurate.model.ranks import RANKS +from gtdb_precurate.model.red_dict import RedDict +from gtdb_precurate.model.tree import Tree def get_gids_missing_ranks(tree): @@ -106,7 +106,7 @@ def create_taxon(rank, meta, new_taxa, tree, gids): def create_ranks(d_rank_to_leaf_candidates, tree, meta): # We will now iterate from the highest to lower rank to find shared nodes created = set() - log = logging.getLogger('betula') + log = logging.getLogger('gtdb_precurate') for cur_rank, d_leaf_to_candidate_node in sorted(d_rank_to_leaf_candidates.items(), key=lambda x: RANKS.index(x[0])): @@ -142,7 +142,7 @@ def create_denovo_clusters( ): """Creates de novo clusters from the specified tree and metadata.""" - log = logging.getLogger('betula') + log = logging.getLogger('gtdb_precurate') # First, we need to find all genomes that are missing ranks d_leaf_to_missing_ranks = get_gids_missing_ranks(tree) diff --git a/betula/model/__init__.py 
b/gtdb_precurate/model/__init__.py similarity index 100% rename from betula/model/__init__.py rename to gtdb_precurate/model/__init__.py diff --git a/betula/model/metadata.py b/gtdb_precurate/model/metadata.py similarity index 100% rename from betula/model/metadata.py rename to gtdb_precurate/model/metadata.py diff --git a/betula/model/node_label.py b/gtdb_precurate/model/node_label.py similarity index 100% rename from betula/model/node_label.py rename to gtdb_precurate/model/node_label.py diff --git a/betula/model/ranks.py b/gtdb_precurate/model/ranks.py similarity index 100% rename from betula/model/ranks.py rename to gtdb_precurate/model/ranks.py diff --git a/betula/model/red_dict.py b/gtdb_precurate/model/red_dict.py similarity index 96% rename from betula/model/red_dict.py rename to gtdb_precurate/model/red_dict.py index fd97b50..8a69fa8 100644 --- a/betula/model/red_dict.py +++ b/gtdb_precurate/model/red_dict.py @@ -1,7 +1,7 @@ import json from pathlib import Path -from betula.model.ranks import RANKS +from gtdb_precurate.model.ranks import RANKS class RedDict: diff --git a/betula/model/tree.py b/gtdb_precurate/model/tree.py similarity index 97% rename from betula/model/tree.py rename to gtdb_precurate/model/tree.py index eed563a..c7e8877 100644 --- a/betula/model/tree.py +++ b/gtdb_precurate/model/tree.py @@ -4,9 +4,9 @@ import dendropy -from betula.model.node_label import NodeLabel -from betula.model.ranks import RANKS, TaxString -from betula.util.tree import parse_node_label +from gtdb_precurate.model.node_label import NodeLabel +from gtdb_precurate.model.ranks import RANKS, TaxString +from gtdb_precurate.util.tree import parse_node_label class Tree: diff --git a/betula/util/__init__.py b/gtdb_precurate/util/__init__.py similarity index 100% rename from betula/util/__init__.py rename to gtdb_precurate/util/__init__.py diff --git a/betula/util/logger.py b/gtdb_precurate/util/logger.py similarity index 83% rename from betula/util/logger.py rename to 
gtdb_precurate/util/logger.py index d20f3db..9a950f4 100644 --- a/betula/util/logger.py +++ b/gtdb_precurate/util/logger.py @@ -5,7 +5,7 @@ def init_logger(out_dir: Path, debug=False): # Logger setup - logger = logging.getLogger('betula') + logger = logging.getLogger('gtdb_precurate') if debug: logger.setLevel(logging.DEBUG) else: @@ -20,7 +20,7 @@ def init_logger(out_dir: Path, debug=False): logger.addHandler(console_handler) # File handler - file_handler = logging.FileHandler(out_dir / 'betula.log') + file_handler = logging.FileHandler(out_dir / 'gtdb_precurate.log') file_handler.setFormatter(formatter) logger.addHandler(file_handler) diff --git a/betula/util/tree.py b/gtdb_precurate/util/tree.py similarity index 100% rename from betula/util/tree.py rename to gtdb_precurate/util/tree.py diff --git a/package-lock.json b/package-lock.json index 3dff59e..092c618 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,10 +1,10 @@ { - "name": "betula", + "name": "gtdb_precurate", "lockfileVersion": 3, "requires": true, "packages": { "": { - "name": "betula", + "name": "gtdb_precurate", "devDependencies": { "@semantic-release/changelog": "^6.0.3", "@semantic-release/commit-analyzer": "^11.1.0", diff --git a/package.json b/package.json index 4e60d68..e2592f4 100644 --- a/package.json +++ b/package.json @@ -1,5 +1,5 @@ { - "name": "betula", + "name": "gtdb_precurate", "devDependencies": { "@semantic-release/github": "^9.2.5", "@semantic-release/changelog": "^6.0.3", diff --git a/setup.py b/setup.py index e69de29..25661f4 100644 --- a/setup.py +++ b/setup.py @@ -0,0 +1,61 @@ +import os +import re + +from setuptools import setup, find_packages + + +def read_meta(): + """Read each of the keys stored in __init__.py + + Returns + ------- + dict[str, str] + A dictionary containing each of the string key value pairs. 
+ """ + path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'gtdb_precurate/__init__.py') + with open(path, encoding='utf-8') as fh: + hits = re.findall(r'__(\w+)__ ?= ?["\'](.+)["\']\n', fh.read()) + return {k: v for k, v in hits} + + +def readme(): + """Return the contents of the README.md next to this file, used as the long description.""" + with open(os.path.join(os.path.abspath(os.path.dirname(__file__)), 'README.md'), encoding='utf-8') as f: + return f.read() + + +meta = read_meta() +setup( + author=meta['author'], + author_email=meta['author_email'], + classifiers=[ + 'Development Status :: 5 - Production/Stable', + 'Intended Audience :: Science/Research', + 'License :: OSI Approved :: GNU General Public License v3 (GPLv3)', + 'Natural Language :: English', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Topic :: Scientific/Engineering :: Bio-Informatics', + ], + data_files=[("", ["LICENSE"])], + description=meta['description'], + entry_points={ + 'console_scripts': [ + 'gtdb_precurate = gtdb_precurate.__main__:main' + ] + }, + install_requires=["dendropy>=4.1.0", "typer"], + license=meta['license'], + long_description=readme(), + long_description_content_type='text/markdown', + maintainer=meta['maintainer'], + maintainer_email=meta['maintainer_email'], + name=meta['name'], + packages=find_packages(), + python_requires=meta['python_requires'], + url=meta['url'], + version=meta['version'] +)