diff --git a/.github/workflows/ci-dev.yaml b/.github/workflows/ci-dev.yaml index bfd90f5b..484b8123 100644 --- a/.github/workflows/ci-dev.yaml +++ b/.github/workflows/ci-dev.yaml @@ -10,24 +10,3 @@ jobs: uses: qiime2/distributions/.github/workflows/lib-ci-dev.yaml@dev with: distro: metagenome - additional-reports-path: ./coverage.xml - additional-reports-name: coverage - - coverage: - needs: [ci] - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - - uses: actions/download-artifact@v3 - name: 'Fetch coverage from builds' - with: - name: ${{ needs.ci.outputs.additional-reports-name }} - path: ${{ needs.ci.outputs.additional-reports-path }} - - - uses: codecov/codecov-action@v4 - name: 'Upload coverage' - with: - fail_ci_if_error: true - env: - CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} diff --git a/.github/workflows/q2-ci.yaml b/.github/workflows/q2-ci.yaml index b604d221..fcbc000e 100644 --- a/.github/workflows/q2-ci.yaml +++ b/.github/workflows/q2-ci.yaml @@ -18,25 +18,3 @@ jobs: uses: qiime2/distributions/.github/workflows/lib-ci-dev.yaml@dev with: distro: metagenome - additional-reports-path: ./coverage.xml - additional-reports-name: coverage - - coverage: - needs: [ci] - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - - uses: actions/download-artifact@v3 - name: 'Fetch coverage from builds' - with: - name: ${{ needs.ci.outputs.additional-reports-name }} - path: ${{ needs.ci.outputs.additional-reports-path }} - - - uses: codecov/codecov-action@v4 - name: 'Upload coverage' - with: - fail_ci_if_error: true - verbose: true - env: - CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} diff --git a/README.md b/README.md index 890d9ffe..5cbae3f3 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,46 @@ # q2-moshpit ![CI](https://github.com/bokulich-lab/q2-moshpit/actions/workflows/ci.yaml/badge.svg) -![QIIME CI](https://github.com/bokulich-lab/q2-moshpit/actions/workflows/q2-ci.yaml/badge.svg) 
[![codecov](https://codecov.io/gh/bokulich-lab/q2-moshpit/graph/badge.svg?token=PSCAYJUP01)](https://codecov.io/gh/bokulich-lab/q2-moshpit) **MO**dular **SH**otgun metagenome **P**ipelines with **I**ntegrated provenance **T**racking + +QIIME 2 plugin for functional annotation and taxonomic classification of shotgun metagenomes. + +## Installation +_q2-moshpit_ is available as part of the QIIME 2 metagenome distribution. For installation and usage instructions please consult the official [QIIME 2 documentation](https://docs.qiime2.org). + +## Functionality +This QIIME 2 plugin contains actions used to annotate and classify (meta)genomes: + +| Action | Description | Underlying tool | +|----------------------|------------------------------------------------------------|--------------------------------------------------------------------| +| bin-contigs-metabat | Bin contigs into MAGs using MetaBat 2. | [MetaBat 2](https://bitbucket.org/berkeleylab/metabat/src/master/) | +| build-custom-diamond-db | Create a DIAMOND reference database from a FASTA input file. | [Diamond](https://github.com/bbuchfink/diamond) | +| build-eggnog-diamond-db | Create a DIAMOND reference database for the specified taxon. | [Diamond](https://github.com/bbuchfink/diamond) | +| build-kraken-db | Fetch an existing or build a custom Kraken 2 database. | [Kraken 2](https://ccb.jhu.edu/software/kraken2/) | +| classify-kaiju | Classify reads using Kaiju. | [Kaiju](https://bioinformatics-centre.github.io/kaiju/) | +| classify-kraken2 | Classify reads/MAGs using Kraken 2. | [Kraken 2](https://ccb.jhu.edu/software/kraken2/) | +| dereplicate-mags | Dereplicate MAGs from multiple samples. | - | +| eggnog-annotate | Annotate orthologs against eggNOG database. | [EggNOG mapper](https://github.com/eggnogdb/eggnog-mapper) | +| eggnog-diamond-search | Run eggNOG search using diamond aligner. 
| [EggNOG mapper](https://github.com/eggnogdb/eggnog-mapper) | +| eggnog-hmmer-search | Run eggNOG search using HMMER aligner. | [EggNOG mapper](https://github.com/eggnogdb/eggnog-mapper) | +| estimate-bracken | Perform read abundance re-estimation using Bracken. | [Bracken](https://ccb.jhu.edu/software/bracken/) | +| estimate-mag-abundance | Estimate MAG abundance. | - | +| evaluate-busco | Evaluate quality of the generated MAGs using BUSCO. | [BUSCO](https://busco.ezlab.org) | +| extract-annotations | Extract annotation frequencies from all annotations. | - | +| fetch-busco-db | Download BUSCO database. | [BUSCO](https://busco.ezlab.org) | +| fetch-diamond-db | Fetch the complete Diamond database necessary to run the eggnog-diamond-search action. | [EggNOG mapper](https://github.com/eggnogdb/eggnog-mapper) | +| fetch-eggnog-db | Fetch the databases necessary to run the eggnog-annotate action. | [EggNOG mapper](https://github.com/eggnogdb/eggnog-mapper) | +| fetch-eggnog-hmmer-db | Fetch the taxon-specific database necessary to run the eggnog-hmmer-search action. | [EggNOG mapper](https://github.com/eggnogdb/eggnog-mapper) | +| fetch-eggnog-proteins | Fetch the databases necessary to run the build-eggnog-diamond-db action. | [EggNOG mapper](https://github.com/eggnogdb/eggnog-mapper) | +| fetch-kaiju-db | Fetch Kaiju database. | [Kaiju](https://bioinformatics-centre.github.io/kaiju/) | +| fetch-ncbi-taxonomy | Fetch NCBI reference taxonomy. | [EggNOG mapper](https://github.com/eggnogdb/eggnog-mapper) | +| filter-derep-mags | Filter dereplicated MAGs. | - | +| filter-mags | Filter MAGs. | - | +| filter-reads-pangenome | Remove contaminating human reads. | [Bowtie 2](https://bowtie-bio.sourceforge.net/bowtie2/index.shtml) | +| get-feature-lengths | Get feature lengths. | - | +| inspect-kraken2-db | Inspect a Kraken 2 database. | [Kraken 2](https://ccb.jhu.edu/software/kraken2/) | +| kraken2-to-features | Select downstream features from Kraken 2. 
| - | +| kraken2-to-mag-features | Select downstream MAG features from Kraken 2. | - | +| multiply-tables | Multiply two feature tables. | - | +| predict-genes-prodigal | Predict gene sequences from MAGs using Prodigal. | [Prodigal](https://github.com/hyattpd/Prodigal) | diff --git a/ci/recipe/meta.yaml b/ci/recipe/meta.yaml index 4ea76e8b..d8f36865 100644 --- a/ci/recipe/meta.yaml +++ b/ci/recipe/meta.yaml @@ -42,13 +42,12 @@ requirements: test: requires: - - coverage - parameterized imports: - q2_moshpit - qiime2.plugins.moshpit commands: - - coverage run --rcfile ./repo/.coveragerc -m pytest && coverage xml -o coverage.xml + - pytest --pyargs q2_moshpit about: home: https://github.com/bokulich-lab/q2-moshpit diff --git a/environment-files/q2-moshpit-qiime2-metagenome-2024.10.yml b/environment-files/q2-moshpit-qiime2-metagenome-2024.10.yml new file mode 100644 index 00000000..156815b3 --- /dev/null +++ b/environment-files/q2-moshpit-qiime2-metagenome-2024.10.yml @@ -0,0 +1,10 @@ +channels: +- https://packages.qiime2.org/qiime2/2024.10/metagenome/passed +- conda-forge +- bioconda +dependencies: + - qiime2-metagenome + - pip + - pip: + - q2-moshpit@git+https://github.com/bokulich-lab/q2-moshpit.git@main + \ No newline at end of file diff --git a/q2_moshpit/kraken2/select.py b/q2_moshpit/kraken2/select.py index fc4a1c82..88d00f61 100644 --- a/q2_moshpit/kraken2/select.py +++ b/q2_moshpit/kraken2/select.py @@ -223,21 +223,35 @@ def _kraken_to_ncbi_tree(df): def _combine_ncbi_trees(trees): full_tree = trees[0] for tree in trees[1:]: + tip_cache = {t.name: t for t in full_tree.tips()} for tip in list(tree.tips()): - try: - # check if taxid is already in this tree - full_tree.find(tip.name) + # check if taxid is already in this tree + if tip.name in tip_cache: continue # for clarity - except skbio.tree.MissingNodeError: + else: parents = list(tip.ancestors())[:-1] # ignore unnamed root matching = full_tree - while parents: + subtree_inserted = False + while parents and 
not subtree_inserted: node = parents.pop() - try: - matching = matching.find(node.name) - except skbio.tree.MissingNodeError: + ancestor_found = False + + for child in matching.children: + if child.name == node.name: + matching = child + ancestor_found = True + break + if not ancestor_found: matching.append(node) - break + for t in node.tips(): + tip_cache[t.name] = t + assert tip.name in tip_cache + subtree_inserted = True + + if not subtree_inserted: + # should be impossible. Implies ancestor_found was always + # True but not in tip_cache + raise AssertionError(f"{tip.name} could not be inserted") return full_tree diff --git a/q2_moshpit/kraken2/tests/data/duplicated-genus/SRR17000961-1.report.txt b/q2_moshpit/kraken2/tests/data/duplicated-genus/SRR17000961-1.report.txt new file mode 100644 index 00000000..ce84fb0f --- /dev/null +++ b/q2_moshpit/kraken2/tests/data/duplicated-genus/SRR17000961-1.report.txt @@ -0,0 +1,44 @@ +100 9332144 0 R 1 root +96.08 8966446 0 R1 131567 cellular organisms +5.3 494861 0 D 2759 Eukaryota +4.81 448908 0 D1 33154 Opisthokonta +4.54 423718 0 K 33208 Metazoa +4.53 422551 0 K1 6072 Eumetazoa +4.5 419698 0 K2 33213 Bilateria +3.42 318923 0 K3 33317 Protostomia +3.1 289656 0 K4 1206794 Ecdysozoa +3.07 286854 0 K5 88770 Panarthropoda +3.07 286847 0 P 6656 Arthropoda +3.01 281163 0 P1 197563 Mandibulata +3.01 281054 0 P2 197562 Pancrustacea +2.94 274151 0 P3 6960 Hexapoda +2.93 273688 0 C 50557 Insecta +2.93 273688 0 C1 85512 Dicondylia +2.93 273688 0 C2 7496 Pterygota +2.92 272146 0 C3 33340 Neoptera +0.06 5786 0 C4 33342 Paraneoptera +0.06 5578 0 O 7524 Hemiptera +0.02 2099 0 O1 33343 Prosorrhyncha +0.02 2099 0 O2 33345 Heteroptera +0.02 2099 0 O3 33347 Euheteroptera +0.02 2099 0 O4 33349 Neoheteroptera +0.02 1673 0 O5 33351 Panheteroptera +0.01 962 0 O6 33354 Cimicomorpha +0.01 935 0 O7 33355 Cimicoidea +0.01 927 0 F 30083 Miridae +0 228 0 F1 236635 Phylinae +0 228 0 F2 236648 Pilophorini +0 228 0 G 237084 Pilophorus +0.27 25092 0 K 
4751 Fungi +0.24 22558 0 K1 451864 Dikarya +0.22 20530 0 P 4890 Ascomycota +0.22 20419 0 P1 716545 saccharomyceta +0.2 18909 0 P2 147538 Pezizomycotina +0.2 18778 0 P3 716546 leotiomyceta +0 58 0 C 147547 Lecanoromycetes +0 58 0 C1 1520881 OSLEUM clade +0 58 0 C2 388435 Lecanoromycetidae +0 58 0 O 5197 Lecanorales +0 58 0 O1 157822 Lecanorineae +0 53 0 F 5198 Cladoniaceae +0 53 0 G 5199 Cladonia \ No newline at end of file diff --git a/q2_moshpit/kraken2/tests/data/duplicated-genus/SRR17000967-1.report.txt b/q2_moshpit/kraken2/tests/data/duplicated-genus/SRR17000967-1.report.txt new file mode 100644 index 00000000..53362c7f --- /dev/null +++ b/q2_moshpit/kraken2/tests/data/duplicated-genus/SRR17000967-1.report.txt @@ -0,0 +1,44 @@ +100 7316706 0 R 1 root +95.72 7003211 0 R1 131567 cellular organisms +6.5 475365 0 D 2759 Eukaryota +5.81 425370 0 D1 33154 Opisthokonta +5.54 405469 0 K 33208 Metazoa +5.52 404192 0 K1 6072 Eumetazoa +5.47 400582 0 K2 33213 Bilateria +3.72 271980 0 K3 33317 Protostomia +3.32 242600 0 K4 1206794 Ecdysozoa +3.28 239851 0 K5 88770 Panarthropoda +3.28 239726 0 P 6656 Arthropoda +3.24 236861 0 P1 197563 Mandibulata +3.24 236717 0 P2 197562 Pancrustacea +3.18 232964 0 P3 6960 Hexapoda +3.18 232673 0 C 50557 Insecta +3.18 232673 0 C1 85512 Dicondylia +3.18 232673 0 C2 7496 Pterygota +3.16 230960 0 C3 33340 Neoptera +0.06 4692 0 C4 33342 Paraneoptera +0.06 4576 0 O 7524 Hemiptera +0.03 2165 0 O1 33343 Prosorrhyncha +0.03 2165 0 O2 33345 Heteroptera +0.03 2165 0 O3 33347 Euheteroptera +0.03 2165 0 O4 33349 Neoheteroptera +0.03 1877 0 O5 33351 Panheteroptera +0.01 874 0 O6 33354 Cimicomorpha +0.01 874 0 O7 33355 Cimicoidea +0 126 0 F1 236635 Phylinae +0 126 0 F2 236648 Pilophorini +0 126 0 G 237084 Pilophorus +0.27 19812 0 K 4751 Fungi +0.24 17222 0 K1 451864 Dikarya +0.2 14668 0 P 4890 Ascomycota +0.2 14565 0 P1 716545 saccharomyceta +0.14 10258 0 P2 147538 Pezizomycotina +0.14 10225 0 P3 716546 leotiomyceta +0 24 0 C 147547 Lecanoromycetes +0 
24 0 C1 1520881 OSLEUM clade +0 24 0 C2 388435 Lecanoromycetidae +0 24 0 O 5197 Lecanorales +0 24 0 O1 157822 Lecanorineae +0 5 0 F 5198 Cladoniaceae +0 4 0 G 5199 Cladonia +0 1 0 G 51977 Pilophorus \ No newline at end of file diff --git a/q2_moshpit/kraken2/tests/test_selection.py b/q2_moshpit/kraken2/tests/test_selection.py index 12eff533..dc05a65b 100644 --- a/q2_moshpit/kraken2/tests/test_selection.py +++ b/q2_moshpit/kraken2/tests/test_selection.py @@ -86,6 +86,25 @@ def setUp(self): def tearDown(self): shutil.rmtree(self.temp_dir) + def test_kraken2_to_features_duplicated_genus(self): + reports = Kraken2ReportDirectoryFormat( + self.get_data_path("duplicated-genus/"), "r" + ) + obs_table, obs_taxonomy = kraken2_to_features( + reports, coverage_threshold=0.0) + # 51977 is the taxon code for pilophorus which shares a genus name with + # taxon code 237084 + assert '51977' in obs_taxonomy.index + assert '51977' in obs_table.columns + assert '237084' in obs_taxonomy.index + assert '237084' in obs_table.columns + + # Taxon codes 51977 and 5199 are sister genera and 5199 needs to be in + # the first evaluated tree and 51977 must be absent + # to induce this duplicated genus name issue in the second tree + assert '5199' in obs_taxonomy.index + assert '5199' in obs_table.columns + def test_kraken2_to_features_coverage_threshold(self): reports = Kraken2ReportDirectoryFormat( self.get_data_path("kraken2-reports-select/samples"), "r" diff --git a/setup.py b/setup.py index edc2ef35..b2f20f67 100644 --- a/setup.py +++ b/setup.py @@ -117,6 +117,7 @@ 'data/bracken-report/*/*', 'data/bracken-report-with-unclassified/*', 'data/bracken-report-with-unclassified/*/*', + 'data/duplicated-genus/*', 'data/kraken2-reports-select/*', 'data/kraken2-reports-select/*/*', 'data/kraken2-to-ncbi-tree/*',