diff --git a/.gitignore b/.gitignore index 6aff3029..135314d9 100644 --- a/.gitignore +++ b/.gitignore @@ -25,7 +25,6 @@ share/python-wheels/ *.egg-info/ .installed.cfg *.egg -MANIFEST # PyInstaller # Usually these files are written by a python script from a template @@ -133,3 +132,9 @@ dmypy.json # Mac OS .DS_Store + +# VS code settings +.vscode + +# Ignore notebooks +**/*.ipynb \ No newline at end of file diff --git a/ci/recipe/meta.yaml b/ci/recipe/meta.yaml index 50cc2936..0b6e2ec3 100644 --- a/ci/recipe/meta.yaml +++ b/ci/recipe/meta.yaml @@ -23,10 +23,13 @@ requirements: - samtools - qiime2 {{ qiime2_epoch }}.* - q2-types-genomics {{ qiime2_epoch }}.* + - q2templates {{ qiime2_epoch }}.* - eggnog-mapper >=2.1.10 - diamond - tqdm - xmltodict + - altair + - busco >=5.0.0 test: requires: diff --git a/q2_moshpit/__init__.py b/q2_moshpit/__init__.py index f2ba2c56..ee9d3e0d 100644 --- a/q2_moshpit/__init__.py +++ b/q2_moshpit/__init__.py @@ -10,6 +10,7 @@ from .kraken2 import bracken, classification, database from .metabat2 import metabat2 from . import eggnog +from . import busco from ._version import get_versions @@ -18,5 +19,5 @@ __all__ = [ 'metabat2', 'bracken', 'classification', 'database', - 'dereplicate_mags', 'eggnog' + 'dereplicate_mags', 'eggnog', 'busco', ] diff --git a/q2_moshpit/_utils.py b/q2_moshpit/_utils.py index 17ad646e..b927cff0 100644 --- a/q2_moshpit/_utils.py +++ b/q2_moshpit/_utils.py @@ -52,9 +52,15 @@ def _process_common_input_params(processing_func, params: dict) -> List[str]: """ processed_args = [] for arg_key, arg_val in params.items(): - # bool is a subclass of int so to only reject ints we need to do: - if type(arg_val) != int and not arg_val: # noqa: E721 - continue - else: + # This if condition excludes arguments which are falsy + # (False, None, "", []), except for integers and floats. + if ( # noqa: E721 + type(arg_val) == int or + type(arg_val) == float or + arg_val + ): processed_args.extend(processing_func(arg_key, arg_val)) + else: + continue + return processed_args diff --git a/q2_moshpit/assets/busco/css/styles.css b/q2_moshpit/assets/busco/css/styles.css new file mode 100644 index 00000000..433d6f7d --- /dev/null +++ b/q2_moshpit/assets/busco/css/styles.css @@ -0,0 +1,20 @@ +#plot { + margin-top: 50px; +} + +.vega-bind { + margin-bottom: 15px; +} + +.vega-bind-name { + margin-right: 10px; + white-space: nowrap +} + +.header-inline { + display: inline-block; + float: left; + margin-right: 10px; + margin-top: 8px; + margin-bottom: 8px; +} diff --git a/q2_moshpit/assets/busco/index.html b/q2_moshpit/assets/busco/index.html new file mode 100644 index 00000000..be2d478d --- /dev/null +++ b/q2_moshpit/assets/busco/index.html @@ -0,0 +1,138 @@ +{% extends "base.html" %} {% block head %} +Embedding Vega-Lite + + + + + + + +{% endblock %} {% block content %} + + +
+
+
+
Plot description
+
+

+ The left plot shows the results generated by BUSCO for all bins and + samples. "BUSCO attempts to provide a quantitative assessment + of the completeness in terms of the expected gene content of a genome + assembly, transcriptome, or annotated gene set. The results are + simplified into categories of Complete and single-copy, Complete and + duplicated, Fragmented, or Missing BUSCOs. BUSCO completeness results + make sense only in the context of the biology of your organism". Visit the + + BUSCO User Guide + for more information. +

+

+ Hoover over the graph to obtain information about the lineage dataset + used for each bin, and the number of genes in each BUSCO category. +

+

+ The right barplot shows assembly statistics calculated for each bin using BBTools. + Specifically, it displays the statistics computed by the stats.sh procedure from BBMap. + View the + + source code and documentation + + of stats.sh for more information. +

+

+ Choose the assembly statistic that you wish to display from the drop-down manu below the graphs. + Hoover over the graph to show the numerical values that each bar represents. +

+ +
+ Downloads + +
+
+
+
+
+ +
+ {% if vega_plots_overview is defined %} +
+
+
+
+ {% else %} +

Unable to generate the completeness plot

+ {% endif %} +
+ +{% if vega_plots_overview is defined %} + + + + +{% endif %} {% endblock %} {% block footer %} {% set loading_selector = +'#loading' %} {% include 'js-error-handler.html' %} {% endblock %} diff --git a/q2_moshpit/assets/busco/js/bootstrapMagic.js b/q2_moshpit/assets/busco/js/bootstrapMagic.js new file mode 100644 index 00000000..2323fe92 --- /dev/null +++ b/q2_moshpit/assets/busco/js/bootstrapMagic.js @@ -0,0 +1,31 @@ +function removeBS3refs() { + // remove Bootstrap 3 CSS/JS reference + let head = document.getElementsByTagName("head")[0] + let links = head.getElementsByTagName("link") + for (let i = 0; i < links.length; i++) { + if (links[i].href.includes("q2templateassets/css/bootstrap")) { + links[i].remove() + } + } + let scripts = head.getElementsByTagName("script") + for (let i = 0; i < scripts.length; i++) { + if (scripts[i].src.includes("q2templateassets/js/bootstrap")) { + scripts[i].remove() + } + } +} + +function adjustTagsToBS3() { + // adjust tags to BS3 + let tabs = document.getElementsByClassName("nav nav-tabs")[0].children + for (let i = 0; i < tabs.length; i++) { + let isActive = tabs[i].className.includes("active") + tabs[i].className = "nav-item" + let link = tabs[i].getElementsByTagName("a")[0] + if (isActive) { + link.classList.add("active") + } + link.classList.add("nav-link") + + } +} diff --git a/q2_moshpit/busco/__init__.py b/q2_moshpit/busco/__init__.py new file mode 100644 index 00000000..080a3f3b --- /dev/null +++ b/q2_moshpit/busco/__init__.py @@ -0,0 +1,11 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2022-2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- + +from .busco import evaluate_busco + +__all__ = ["evaluate_busco"] diff --git a/q2_moshpit/busco/busco.py b/q2_moshpit/busco/busco.py new file mode 100644 index 00000000..a5042c1e --- /dev/null +++ b/q2_moshpit/busco/busco.py @@ -0,0 +1,121 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- + + +import os +import tempfile +import q2_moshpit.busco.utils +from q2_moshpit.busco.utils import ( + _parse_busco_params, + _render_html, +) +from q2_moshpit._utils import _process_common_input_params +from typing import List +from q2_types_genomics.per_sample_data._format import MultiMAGSequencesDirFmt + + +def evaluate_busco( + output_dir: str, + bins: MultiMAGSequencesDirFmt, + mode: str = "genome", + lineage_dataset: str = None, + augustus: bool = False, + augustus_parameters: str = None, + augustus_species: str = None, + auto_lineage: bool = False, + auto_lineage_euk: bool = False, + auto_lineage_prok: bool = False, + cpu: int = 1, + config: str = None, + contig_break: int = 10, + datasets_version: str = None, + download: List[str] = None, + download_base_url: str = None, + download_path: str = None, + evalue: float = 1e-03, + force: bool = False, + limit: int = 3, + help: bool = False, + list_datasets: bool = False, + long: bool = False, + metaeuk_parameters: str = None, + metaeuk_rerun_parameters: str = None, + miniprot: bool = False, + offline: bool = False, + quiet: bool = False, + restart: bool = False, + scaffold_composition: bool = False, + tar: bool = False, + update_data: bool = False, + version: bool = False, +) -> None: + """ + qiime2 visualization for the BUSCO assessment tool + . + + Args: + see all possible inputs by running `qiime moshpit plot_busco` + + Output: + plots.zip: zip file containing all of the busco plots + busco_output: all busco output files + qiime_html: html for rendering the output plots + """ + + # Create dictionary with local variables + # (kwargs passed to the function or their defaults) excluding + # "output_dir" and "bins" + kwargs = { + k: v for k, v in locals().items() if k not in ["output_dir", "bins"] + } + + # Filter out all kwargs that are None, False or 0.0 + common_args = _process_common_input_params( + processing_func=_parse_busco_params, params=kwargs + ) + + # Creates output directory with path 'tmp' + with tempfile.TemporaryDirectory() as tmp: + # Run busco for every sample. Returns dictionary to report files. + # Result NOT included in final output + busco_results_dir = os.path.join(tmp, "busco_output") + path_to_run_summaries = q2_moshpit.busco.utils._run_busco( + output_dir=busco_results_dir, + mags=bins, + params=common_args, + ) + + # Collect result for each sample and save to file. + # Result included in final output (file for download) + all_summaries_path = os.path.join( + output_dir, "all_batch_summaries.csv" + ) + all_summaries_df = q2_moshpit.busco.utils._collect_summaries_and_save( + all_summaries_path=all_summaries_path, + path_to_run_summaries=path_to_run_summaries, + ) + + # Draw BUSCO plots for all samples + # Result NOT included in final output + plots_dir = os.path.join(tmp, "plots") + paths_to_plots = q2_moshpit.busco.utils._draw_busco_plots( + path_to_run_summaries=path_to_run_summaries, + plots_dir=plots_dir + ) + + # Zip graphs for user download + # Result included in final output (file for download) + zip_name = os.path.join(output_dir, "busco_plots.zip") + q2_moshpit.busco.utils._zip_busco_plots( + paths_to_plots=paths_to_plots, + zip_path=zip_name + ) + + # Render qiime html report + # Result included in final output + _render_html(output_dir, all_summaries_df) diff --git a/q2_moshpit/busco/tests/__init__.py b/q2_moshpit/busco/tests/__init__.py new file mode 100644 index 00000000..16cef8fc --- /dev/null +++ b/q2_moshpit/busco/tests/__init__.py @@ -0,0 +1,7 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2022-2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- diff --git a/q2_moshpit/busco/tests/data/MANIFEST b/q2_moshpit/busco/tests/data/MANIFEST new file mode 100644 index 00000000..cffa369e --- /dev/null +++ b/q2_moshpit/busco/tests/data/MANIFEST @@ -0,0 +1,6 @@ +sample-id,mag-id,filename +sample1,bin1,sample1/bin1.fasta +sample1,bin2,sample1/bin2.fasta +sample2,bin1,sample2/bin1.fasta +sample2,bin2,sample2/bin2.fasta +sample3,bin1,sample3/bin1.fasta diff --git a/q2_moshpit/busco/tests/data/all_batch_summaries.csv b/q2_moshpit/busco/tests/data/all_batch_summaries.csv new file mode 100644 index 00000000..147c96a5 --- /dev/null +++ b/q2_moshpit/busco/tests/data/all_batch_summaries.csv @@ -0,0 +1,10 @@ +Input_file,Dataset,Complete,Single,Duplicated,Fragmented,Missing,n_markers,Scaffold N50,Contigs N50,Percent gaps,Number of scaffolds,sample_id +67392c6c-9f45-4c84-85f5-ae0bfc668892.fasta,bacteria_odb10,97.6,96.8,0.8,0.8,1.6,124,170295,170295,0.000%,27,sample1 +67123d05-b5ae-4a53-873b-727952881899.fasta,bacteria_odb10,97.6,96.0,1.6,1.6,0.8,124,109922,109922,0.000%,65,sample1 +311112c9-7f8b-460c-9cad-3864af3148c2.fasta,bacteria_odb10,97.6,96.0,1.6,1.6,0.8,124,111744,111744,0.000%,127,sample1 +fa0a025e-3fac-4e6b-8736-69a38c33b3f5.fasta,bacteria_odb10,97.6,96.0,1.6,1.6,0.8,124,114741,114741,0.000%,63,sample2 +12f0542c-8375-4ec2-b25a-515962533a7a.fasta,bacteria_odb10,96.0,95.2,0.8,0.8,3.2,124,171535,171535,0.000%,26,sample2 +971335f0-18b9-422e-81a7-5862c49b1e3d.fasta,bacteria_odb10,97.6,96.0,1.6,1.6,0.8,124,104568,104568,0.000%,127,sample2 +040d5cd4-4230-4533-a0a9-f03e3263c338.fasta,bacteria_odb10,97.6,96.0,1.6,1.6,0.8,124,89358,89358,0.000%,132,sample3 +e08a46f3-6fbe-4415-b10f-f11d0d187d17.fasta,bacteria_odb10,97.6,96.0,1.6,1.6,0.8,124,114888,114888,0.000%,60,sample3 +e24225c7-6455-49b1-ad8d-86f95dfbfa2e.fasta,bacteria_odb10,97.6,96.8,0.8,0.8,1.6,124,171506,171506,0.000%,27,sample3 diff --git a/q2_moshpit/busco/tests/data/all_batch_summaries_formatted.csv b/q2_moshpit/busco/tests/data/all_batch_summaries_formatted.csv new file mode 100644 index 00000000..b07b2e01 --- /dev/null +++ b/q2_moshpit/busco/tests/data/all_batch_summaries_formatted.csv @@ -0,0 +1,10 @@ +input_file,dataset,complete,single,duplicated,fragmented,missing,n_markers,scaffold_n50,contigs_n50,percent_gaps,number_of_scaffolds,sample_id,mag_id,single_,duplicated_,fragmented_,missing_ +67392c6c-9f45-4c84-85f5-ae0bfc668892.fasta,bacteria_odb10,97.6,96.8,0.8,0.8,1.6,124,170295,170295,0.0,27,sample1,67392c6c-9f45-4c84-85f5-ae0bfc668892,96.8,97.6,98.39999999999999,99.99999999999999 +67123d05-b5ae-4a53-873b-727952881899.fasta,bacteria_odb10,97.6,96.0,1.6,1.6,0.8,124,109922,109922,0.0,65,sample1,67123d05-b5ae-4a53-873b-727952881899,96.0,97.6,99.19999999999999,99.99999999999999 +311112c9-7f8b-460c-9cad-3864af3148c2.fasta,bacteria_odb10,97.6,96.0,1.6,1.6,0.8,124,111744,111744,0.0,127,sample1,311112c9-7f8b-460c-9cad-3864af3148c2,96.0,97.6,99.19999999999999,99.99999999999999 +fa0a025e-3fac-4e6b-8736-69a38c33b3f5.fasta,bacteria_odb10,97.6,96.0,1.6,1.6,0.8,124,114741,114741,0.0,63,sample2,fa0a025e-3fac-4e6b-8736-69a38c33b3f5,96.0,97.6,99.19999999999999,99.99999999999999 +12f0542c-8375-4ec2-b25a-515962533a7a.fasta,bacteria_odb10,96.0,95.2,0.8,0.8,3.2,124,171535,171535,0.0,26,sample2,12f0542c-8375-4ec2-b25a-515962533a7a,95.2,96.0,96.8,100.0 +971335f0-18b9-422e-81a7-5862c49b1e3d.fasta,bacteria_odb10,97.6,96.0,1.6,1.6,0.8,124,104568,104568,0.0,127,sample2,971335f0-18b9-422e-81a7-5862c49b1e3d,96.0,97.6,99.19999999999999,99.99999999999999 +040d5cd4-4230-4533-a0a9-f03e3263c338.fasta,bacteria_odb10,97.6,96.0,1.6,1.6,0.8,124,89358,89358,0.0,132,sample3,040d5cd4-4230-4533-a0a9-f03e3263c338,96.0,97.6,99.19999999999999,99.99999999999999 +e08a46f3-6fbe-4415-b10f-f11d0d187d17.fasta,bacteria_odb10,97.6,96.0,1.6,1.6,0.8,124,114888,114888,0.0,60,sample3,e08a46f3-6fbe-4415-b10f-f11d0d187d17,96.0,97.6,99.19999999999999,99.99999999999999 +e24225c7-6455-49b1-ad8d-86f95dfbfa2e.fasta,bacteria_odb10,97.6,96.8,0.8,0.8,1.6,124,171506,171506,0.0,27,sample3,e24225c7-6455-49b1-ad8d-86f95dfbfa2e,96.8,97.6,98.39999999999999,99.99999999999999 diff --git a/q2_moshpit/busco/tests/data/batch_summary_sample1.txt b/q2_moshpit/busco/tests/data/batch_summary_sample1.txt new file mode 100644 index 00000000..55bac3c5 --- /dev/null +++ b/q2_moshpit/busco/tests/data/batch_summary_sample1.txt @@ -0,0 +1,4 @@ +Input_file Dataset Complete Single Duplicated Fragmented Missing n_markers Scaffold N50 Contigs N50 Percent gaps Number of scaffolds sample_id +67392c6c-9f45-4c84-85f5-ae0bfc668892.fasta bacteria_odb10 97.6 96.8 0.8 0.8 1.6 124 170295 170295 0.000% 27 sample1 +67123d05-b5ae-4a53-873b-727952881899.fasta bacteria_odb10 97.6 96.0 1.6 1.6 0.8 124 109922 109922 0.000% 65 sample1 +311112c9-7f8b-460c-9cad-3864af3148c2.fasta bacteria_odb10 97.6 96.0 1.6 1.6 0.8 124 111744 111744 0.000% 127 sample1 diff --git a/q2_moshpit/busco/tests/data/batch_summary_sample2.txt b/q2_moshpit/busco/tests/data/batch_summary_sample2.txt new file mode 100644 index 00000000..b432ee3d --- /dev/null +++ b/q2_moshpit/busco/tests/data/batch_summary_sample2.txt @@ -0,0 +1,4 @@ +Input_file Dataset Complete Single Duplicated Fragmented Missing n_markers Scaffold N50 Contigs N50 Percent gaps Number of scaffolds sample_id +fa0a025e-3fac-4e6b-8736-69a38c33b3f5.fasta bacteria_odb10 97.6 96.0 1.6 1.6 0.8 124 114741 114741 0.000% 63 sample2 +12f0542c-8375-4ec2-b25a-515962533a7a.fasta bacteria_odb10 96.0 95.2 0.8 0.8 3.2 124 171535 171535 0.000% 26 sample2 +971335f0-18b9-422e-81a7-5862c49b1e3d.fasta bacteria_odb10 97.6 96.0 1.6 1.6 0.8 124 104568 104568 0.000% 127 sample2 diff --git a/q2_moshpit/busco/tests/data/batch_summary_sample3.txt b/q2_moshpit/busco/tests/data/batch_summary_sample3.txt new file mode 100644 index 00000000..bd7c2671 --- /dev/null +++ b/q2_moshpit/busco/tests/data/batch_summary_sample3.txt @@ -0,0 +1,4 @@ +Input_file Dataset Complete Single Duplicated Fragmented Missing n_markers Scaffold N50 Contigs N50 Percent gaps Number of scaffolds sample_id +040d5cd4-4230-4533-a0a9-f03e3263c338.fasta bacteria_odb10 97.6 96.0 1.6 1.6 0.8 124 89358 89358 0.000% 132 sample3 +e08a46f3-6fbe-4415-b10f-f11d0d187d17.fasta bacteria_odb10 97.6 96.0 1.6 1.6 0.8 124 114888 114888 0.000% 60 sample3 +e24225c7-6455-49b1-ad8d-86f95dfbfa2e.fasta bacteria_odb10 97.6 96.8 0.8 0.8 1.6 124 171506 171506 0.000% 27 sample3 diff --git a/q2_moshpit/busco/tests/data/busco_output/sample1/batch_summary.txt b/q2_moshpit/busco/tests/data/busco_output/sample1/batch_summary.txt new file mode 100644 index 00000000..e69de29b diff --git a/q2_moshpit/busco/tests/data/busco_output/sample2/batch_summary.txt b/q2_moshpit/busco/tests/data/busco_output/sample2/batch_summary.txt new file mode 100644 index 00000000..e69de29b diff --git a/q2_moshpit/busco/tests/data/busco_output/sample3/batch_summary.txt b/q2_moshpit/busco/tests/data/busco_output/sample3/batch_summary.txt new file mode 100644 index 00000000..e69de29b diff --git a/q2_moshpit/busco/tests/data/plot_as_dict.json b/q2_moshpit/busco/tests/data/plot_as_dict.json new file mode 100644 index 00000000..5752e490 --- /dev/null +++ b/q2_moshpit/busco/tests/data/plot_as_dict.json @@ -0,0 +1,630 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v5.15.1.json", + "config": { + "axis": { + "labelFontSize": 17, + "titleFontSize": 20 + }, + "header": { + "labelFontSize": 17, + "titleFontSize": 20 + }, + "legend": { + "labelFontSize": 17, + "titleFontSize": 20 + }, + "view": { + "continuousHeight": 300, + "continuousWidth": 300 + } + }, + "datasets": { + "data-86d6ccb2de4aae10868d6a86589fa41a": [ + { + "BUSCO_percentage": 96.8, + "category": "single", + "dataset": "bacteria_odb10", + "fracc_markers": "~120/124", + "mag_id": "67392c6c-9f45-4c84-85f5-ae0bfc668892", + "n_markers": 124, + "order": 1, + "sample_id": "sample1" + }, + { + "BUSCO_percentage": 96.0, + "category": "single", + "dataset": "bacteria_odb10", + "fracc_markers": "~119/124", + "mag_id": "67123d05-b5ae-4a53-873b-727952881899", + "n_markers": 124, + "order": 1, + "sample_id": "sample1" + }, + { + "BUSCO_percentage": 96.0, + "category": "single", + "dataset": "bacteria_odb10", + "fracc_markers": "~119/124", + "mag_id": "311112c9-7f8b-460c-9cad-3864af3148c2", + "n_markers": 124, + "order": 1, + "sample_id": "sample1" + }, + { + "BUSCO_percentage": 96.0, + "category": "single", + "dataset": "bacteria_odb10", + "fracc_markers": "~119/124", + "mag_id": "fa0a025e-3fac-4e6b-8736-69a38c33b3f5", + "n_markers": 124, + "order": 1, + "sample_id": "sample2" + }, + { + "BUSCO_percentage": 95.2, + "category": "single", + "dataset": "bacteria_odb10", + "fracc_markers": "~118/124", + "mag_id": "12f0542c-8375-4ec2-b25a-515962533a7a", + "n_markers": 124, + "order": 1, + "sample_id": "sample2" + }, + { + "BUSCO_percentage": 96.0, + "category": "single", + "dataset": "bacteria_odb10", + "fracc_markers": "~119/124", + "mag_id": "971335f0-18b9-422e-81a7-5862c49b1e3d", + "n_markers": 124, + "order": 1, + "sample_id": "sample2" + }, + { + "BUSCO_percentage": 96.0, + "category": "single", + "dataset": "bacteria_odb10", + "fracc_markers": "~119/124", + "mag_id": "040d5cd4-4230-4533-a0a9-f03e3263c338", + "n_markers": 124, + "order": 1, + "sample_id": "sample3" + }, + { + "BUSCO_percentage": 96.0, + "category": "single", + "dataset": "bacteria_odb10", + "fracc_markers": "~119/124", + "mag_id": "e08a46f3-6fbe-4415-b10f-f11d0d187d17", + "n_markers": 124, + "order": 1, + "sample_id": "sample3" + }, + { + "BUSCO_percentage": 96.8, + "category": "single", + "dataset": "bacteria_odb10", + "fracc_markers": "~120/124", + "mag_id": "e24225c7-6455-49b1-ad8d-86f95dfbfa2e", + "n_markers": 124, + "order": 1, + "sample_id": "sample3" + }, + { + "BUSCO_percentage": 0.8, + "category": "duplicated", + "dataset": "bacteria_odb10", + "fracc_markers": "~1/124", + "mag_id": "67392c6c-9f45-4c84-85f5-ae0bfc668892", + "n_markers": 124, + "order": 2, + "sample_id": "sample1" + }, + { + "BUSCO_percentage": 1.6, + "category": "duplicated", + "dataset": "bacteria_odb10", + "fracc_markers": "~2/124", + "mag_id": "67123d05-b5ae-4a53-873b-727952881899", + "n_markers": 124, + "order": 2, + "sample_id": "sample1" + }, + { + "BUSCO_percentage": 1.6, + "category": "duplicated", + "dataset": "bacteria_odb10", + "fracc_markers": "~2/124", + "mag_id": "311112c9-7f8b-460c-9cad-3864af3148c2", + "n_markers": 124, + "order": 2, + "sample_id": "sample1" + }, + { + "BUSCO_percentage": 1.6, + "category": "duplicated", + "dataset": "bacteria_odb10", + "fracc_markers": "~2/124", + "mag_id": "fa0a025e-3fac-4e6b-8736-69a38c33b3f5", + "n_markers": 124, + "order": 2, + "sample_id": "sample2" + }, + { + "BUSCO_percentage": 0.8, + "category": "duplicated", + "dataset": "bacteria_odb10", + "fracc_markers": "~1/124", + "mag_id": "12f0542c-8375-4ec2-b25a-515962533a7a", + "n_markers": 124, + "order": 2, + "sample_id": "sample2" + }, + { + "BUSCO_percentage": 1.6, + "category": "duplicated", + "dataset": "bacteria_odb10", + "fracc_markers": "~2/124", + "mag_id": "971335f0-18b9-422e-81a7-5862c49b1e3d", + "n_markers": 124, + "order": 2, + "sample_id": "sample2" + }, + { + "BUSCO_percentage": 1.6, + "category": "duplicated", + "dataset": "bacteria_odb10", + "fracc_markers": "~2/124", + "mag_id": "040d5cd4-4230-4533-a0a9-f03e3263c338", + "n_markers": 124, + "order": 2, + "sample_id": "sample3" + }, + { + "BUSCO_percentage": 1.6, + "category": "duplicated", + "dataset": "bacteria_odb10", + "fracc_markers": "~2/124", + "mag_id": "e08a46f3-6fbe-4415-b10f-f11d0d187d17", + "n_markers": 124, + "order": 2, + "sample_id": "sample3" + }, + { + "BUSCO_percentage": 0.8, + "category": "duplicated", + "dataset": "bacteria_odb10", + "fracc_markers": "~1/124", + "mag_id": "e24225c7-6455-49b1-ad8d-86f95dfbfa2e", + "n_markers": 124, + "order": 2, + "sample_id": "sample3" + }, + { + "BUSCO_percentage": 0.8, + "category": "fragmented", + "dataset": "bacteria_odb10", + "fracc_markers": "~1/124", + "mag_id": "67392c6c-9f45-4c84-85f5-ae0bfc668892", + "n_markers": 124, + "order": 3, + "sample_id": "sample1" + }, + { + "BUSCO_percentage": 1.6, + "category": "fragmented", + "dataset": "bacteria_odb10", + "fracc_markers": "~2/124", + "mag_id": "67123d05-b5ae-4a53-873b-727952881899", + "n_markers": 124, + "order": 3, + "sample_id": "sample1" + }, + { + "BUSCO_percentage": 1.6, + "category": "fragmented", + "dataset": "bacteria_odb10", + "fracc_markers": "~2/124", + "mag_id": "311112c9-7f8b-460c-9cad-3864af3148c2", + "n_markers": 124, + "order": 3, + "sample_id": "sample1" + }, + { + "BUSCO_percentage": 1.6, + "category": "fragmented", + "dataset": "bacteria_odb10", + "fracc_markers": "~2/124", + "mag_id": "fa0a025e-3fac-4e6b-8736-69a38c33b3f5", + "n_markers": 124, + "order": 3, + "sample_id": "sample2" + }, + { + "BUSCO_percentage": 0.8, + "category": "fragmented", + "dataset": "bacteria_odb10", + "fracc_markers": "~1/124", + "mag_id": "12f0542c-8375-4ec2-b25a-515962533a7a", + "n_markers": 124, + "order": 3, + "sample_id": "sample2" + }, + { + "BUSCO_percentage": 1.6, + "category": "fragmented", + "dataset": "bacteria_odb10", + "fracc_markers": "~2/124", + "mag_id": "971335f0-18b9-422e-81a7-5862c49b1e3d", + "n_markers": 124, + "order": 3, + "sample_id": "sample2" + }, + { + "BUSCO_percentage": 1.6, + "category": "fragmented", + "dataset": "bacteria_odb10", + "fracc_markers": "~2/124", + "mag_id": "040d5cd4-4230-4533-a0a9-f03e3263c338", + "n_markers": 124, + "order": 3, + "sample_id": "sample3" + }, + { + "BUSCO_percentage": 1.6, + "category": "fragmented", + "dataset": "bacteria_odb10", + "fracc_markers": "~2/124", + "mag_id": "e08a46f3-6fbe-4415-b10f-f11d0d187d17", + "n_markers": 124, + "order": 3, + "sample_id": "sample3" + }, + { + "BUSCO_percentage": 0.8, + "category": "fragmented", + "dataset": "bacteria_odb10", + "fracc_markers": "~1/124", + "mag_id": "e24225c7-6455-49b1-ad8d-86f95dfbfa2e", + "n_markers": 124, + "order": 3, + "sample_id": "sample3" + }, + { + "BUSCO_percentage": 1.6, + "category": "missing", + "dataset": "bacteria_odb10", + "fracc_markers": "~2/124", + "mag_id": "67392c6c-9f45-4c84-85f5-ae0bfc668892", + "n_markers": 124, + "order": 4, + "sample_id": "sample1" + }, + { + "BUSCO_percentage": 0.8, + "category": "missing", + "dataset": "bacteria_odb10", + "fracc_markers": "~1/124", + "mag_id": "67123d05-b5ae-4a53-873b-727952881899", + "n_markers": 124, + "order": 4, + "sample_id": "sample1" + }, + { + "BUSCO_percentage": 0.8, + "category": "missing", + "dataset": "bacteria_odb10", + "fracc_markers": "~1/124", + "mag_id": "311112c9-7f8b-460c-9cad-3864af3148c2", + "n_markers": 124, + "order": 4, + "sample_id": "sample1" + }, + { + "BUSCO_percentage": 0.8, + "category": "missing", + "dataset": "bacteria_odb10", + "fracc_markers": "~1/124", + "mag_id": "fa0a025e-3fac-4e6b-8736-69a38c33b3f5", + "n_markers": 124, + "order": 4, + "sample_id": "sample2" + }, + { + "BUSCO_percentage": 3.2, + "category": "missing", + "dataset": "bacteria_odb10", + "fracc_markers": "~4/124", + "mag_id": "12f0542c-8375-4ec2-b25a-515962533a7a", + "n_markers": 124, + "order": 4, + "sample_id": "sample2" + }, + { + "BUSCO_percentage": 0.8, + "category": "missing", + "dataset": "bacteria_odb10", + "fracc_markers": "~1/124", + "mag_id": "971335f0-18b9-422e-81a7-5862c49b1e3d", + "n_markers": 124, + "order": 4, + "sample_id": "sample2" + }, + { + "BUSCO_percentage": 0.8, + "category": "missing", + "dataset": "bacteria_odb10", + "fracc_markers": "~1/124", + "mag_id": "040d5cd4-4230-4533-a0a9-f03e3263c338", + "n_markers": 124, + "order": 4, + "sample_id": "sample3" + }, + { + "BUSCO_percentage": 0.8, + "category": "missing", + "dataset": "bacteria_odb10", + "fracc_markers": "~1/124", + "mag_id": "e08a46f3-6fbe-4415-b10f-f11d0d187d17", + "n_markers": 124, + "order": 4, + "sample_id": "sample3" + }, + { + "BUSCO_percentage": 1.6, + "category": "missing", + "dataset": "bacteria_odb10", + "fracc_markers": "~2/124", + "mag_id": "e24225c7-6455-49b1-ad8d-86f95dfbfa2e", + "n_markers": 124, + "order": 4, + "sample_id": "sample3" + } + ], + "data-a93d7a0a94d75d2ace5979a9e91e0d4a": [ + { + "contigs_n50": 170295, + "mag_id": "67392c6c-9f45-4c84-85f5-ae0bfc668892", + "number_of_scaffolds": 27, + "percent_gaps": 0.0, + "sample_id": "sample1", + "scaffold_n50": 170295 + }, + { + "contigs_n50": 109922, + "mag_id": "67123d05-b5ae-4a53-873b-727952881899", + "number_of_scaffolds": 65, + "percent_gaps": 0.0, + "sample_id": "sample1", + "scaffold_n50": 109922 + }, + { + "contigs_n50": 111744, + "mag_id": "311112c9-7f8b-460c-9cad-3864af3148c2", + "number_of_scaffolds": 127, + "percent_gaps": 0.0, + "sample_id": "sample1", + "scaffold_n50": 111744 + }, + { + "contigs_n50": 114741, + "mag_id": "fa0a025e-3fac-4e6b-8736-69a38c33b3f5", + "number_of_scaffolds": 63, + "percent_gaps": 0.0, + "sample_id": "sample2", + "scaffold_n50": 114741 + }, + { + "contigs_n50": 171535, + "mag_id": "12f0542c-8375-4ec2-b25a-515962533a7a", + "number_of_scaffolds": 26, + "percent_gaps": 0.0, + "sample_id": "sample2", + "scaffold_n50": 171535 + }, + { + "contigs_n50": 104568, + "mag_id": "971335f0-18b9-422e-81a7-5862c49b1e3d", + "number_of_scaffolds": 127, + "percent_gaps": 0.0, + "sample_id": "sample2", + "scaffold_n50": 104568 + }, + { + "contigs_n50": 89358, + "mag_id": "040d5cd4-4230-4533-a0a9-f03e3263c338", + "number_of_scaffolds": 132, + "percent_gaps": 0.0, + "sample_id": "sample3", + "scaffold_n50": 89358 + }, + { + "contigs_n50": 114888, + "mag_id": "e08a46f3-6fbe-4415-b10f-f11d0d187d17", + "number_of_scaffolds": 60, + "percent_gaps": 0.0, + "sample_id": "sample3", + "scaffold_n50": 114888 + }, + { + "contigs_n50": 171506, + "mag_id": "e24225c7-6455-49b1-ad8d-86f95dfbfa2e", + "number_of_scaffolds": 27, + "percent_gaps": 0.0, + "sample_id": "sample3", + "scaffold_n50": 171506 + } + ] + }, + "hconcat": [ + { + "data": { + "name": "data-86d6ccb2de4aae10868d6a86589fa41a" + }, + "facet": { + "row": { + "field": "sample_id", + "title": "Sample ID", + "type": "nominal" + } + }, + "resolve": { + "scale": { + "y": "independent" + } + }, + "spec": { + "encoding": { + "color": { + "field": "category", + "legend": { + "orient": "top", + "title": "BUSCO Category" + }, + "scale": { + "domain": [ + "single", + "duplicated", + "fragmented", + "missing" + ], + "range": [ + "#1E90FF", + "#87CEFA", + "#FFA500", + "#FF7F50" + ] + }, + "type": "nominal" + }, + "opacity": { + "value": 0.85 + }, + "order": { + "field": "order", + "sort": "ascending", + "type": "quantitative" + }, + "tooltip": [ + { + "field": "sample_id", + "title": "Sample ID", + "type": "nominal" + }, + { + "field": "mag_id", + "title": "MAG ID", + "type": "nominal" + }, + { + "field": "dataset", + "title": "Lineage dataset", + "type": "nominal" + }, + { + "field": "fracc_markers", + "title": "Aprox. number of markers in this category", + "type": "nominal" + }, + { + "field": "BUSCO_percentage", + "title": "% BUSCOs", + "type": "quantitative" + } + ], + "x": { + "aggregate": "sum", + "field": "BUSCO_percentage", + "stack": "normalize", + "title": "BUSCO fraction", + "type": "quantitative" + }, + "y": { + "axis": { + "title": "MAG ID" + }, + "field": "mag_id", + "type": "nominal" + } + }, + "height": 162, + "mark": { + "type": "bar" + }, + "width": 600 + } + }, + { + "data": { + "name": "data-a93d7a0a94d75d2ace5979a9e91e0d4a" + }, + "facet": { + "row": { + "field": "sample_id", + "header": { + "labelFontSize": 0 + }, + "title": null, + "type": "nominal" + } + }, + "resolve": { + "scale": { + "y": "independent" + } + }, + "spec": { + "encoding": { + "opacity": { + "value": 0.85 + }, + "tooltip": [ + { + "field": "x", + "title": "value", + "type": "quantitative" + } + ], + "x": { + "field": "x", + "title": "Assembly Statistic", + "type": "quantitative" + }, + "y": { + "axis": null, + "field": "mag_id", + "type": "nominal" + } + }, + "height": 162, + "mark": { + "type": "bar" + }, + "transform": [ + { + "as": "x", + "calculate": "datum[param_2]" + } + ], + "width": 600 + } + } + ], + "params": [ + { + "bind": { + "input": "select", + "name": "Assambly Statistics: ", + "options": [ + "scaffold_n50", + "contigs_n50", + "percent_gaps", + "number_of_scaffolds" + ] + }, + "name": "param_2", + "value": "scaffold_n50" + } + ], + "spacing": 3 +} \ No newline at end of file diff --git a/q2_moshpit/busco/tests/test_busco.py b/q2_moshpit/busco/tests/test_busco.py new file mode 100644 index 00000000..bcaf28de --- /dev/null +++ b/q2_moshpit/busco/tests/test_busco.py @@ -0,0 +1,104 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2022-2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- + +import os +import tempfile +import pandas as pd +from q2_moshpit.busco.busco import evaluate_busco +from unittest.mock import patch, ANY +from qiime2.plugin.testing import TestPluginBase +from q2_types_genomics.per_sample_data._format import MultiMAGSequencesDirFmt + + +class TestBUSCO(TestPluginBase): + package = "q2_moshpit.busco.tests" + + @classmethod + def setUpClass(self): + + # Set base path + p = os.path.join(os.path.dirname(__file__), "data") + + # Get MAGs fixture + self.mags = MultiMAGSequencesDirFmt( + path=p, + mode="r", + ) + + # Integration test busco. + @patch('q2_moshpit.busco.utils._run_busco') + @patch('q2_moshpit.busco.utils._zip_busco_plots') + @patch('q2_moshpit.busco.utils._draw_busco_plots') + @patch('q2_moshpit.busco.utils._collect_summaries_and_save') + def test_integration_busco( + self, + collect_summaries, + draw_busco_plots, + zip_busco_plots, + run_busco + ): + """ + Tests entire busco run and patches the previously tested functions. + Checks for existence of HTML output. + + Args: + collect_summaries (unittest.mock): mock object for function + `_collect_summaries_and_save` + zip_busco_plots (unittest.mock): mock object for function + `_draw_busco_plots`. + zip_busco_plots (unittest.mock): mock object for function + `_zip_busco_plots`. + run_busco (unittest.mock): mock object for function + `_run_busco`. + """ + # import shutil + # path_to_look_at_html = "/Users/santiago/Downloads/busco_debug_bench" + + with tempfile.TemporaryDirectory() as tmp_path: + # This side effect will return the all_summaries_dfs + p = self.get_data_path("all_batch_summaries.csv") + collect_summaries.return_value = pd.read_csv(p) + + # Run busco + evaluate_busco(output_dir=str(tmp_path), bins=self.mags) + + # For render debugging + # shutil.copytree(str(tmp_path), path_to_look_at_html) + + # Check for the existence of the html file + self.assertTrue(os.path.exists(f"{tmp_path}/index.html")) + + # Assert that the calls where done properly + run_busco.assert_called_once_with( + output_dir=run_busco.call_args.kwargs['output_dir'], + mags=self.mags, + params=[ + '--mode', 'genome', + '--cpu', '1', + '--contig_break', '10', + '--evalue', '0.001', + '--limit', '3' + ] + ) + + collect_summaries.assert_called_once_with( + all_summaries_path=os.path.join( + tmp_path, "all_batch_summaries.csv" + ), + path_to_run_summaries=ANY, + ) + + draw_busco_plots.assert_called_once_with( + path_to_run_summaries=ANY, + plots_dir=draw_busco_plots.call_args.kwargs["plots_dir"] + ) + + zip_busco_plots.assert_called_once_with( + paths_to_plots=ANY, + zip_path=os.path.join(tmp_path, "busco_plots.zip") + ) diff --git a/q2_moshpit/busco/tests/test_utils.py b/q2_moshpit/busco/tests/test_utils.py new file mode 100644 index 00000000..a15325d8 --- /dev/null +++ b/q2_moshpit/busco/tests/test_utils.py @@ -0,0 +1,329 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2022-2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- + +import os +import tempfile +import zipfile +import pandas as pd +from q2_moshpit.busco.utils import ( + _parse_busco_params, + _draw_busco_plots, + _zip_busco_plots, + _run_busco, + _draw_busco_plots_for_render, + _collect_summaries_and_save, + _parse_df_columns, +) +from unittest.mock import patch, call +from qiime2.plugin.testing import TestPluginBase +from q2_types_genomics.per_sample_data._format import MultiMAGSequencesDirFmt + + +class TestBUSCO(TestPluginBase): + package = "q2_moshpit.busco.tests" + + @classmethod + def setUpClass(self): + + # Set base path + p = os.path.join(os.path.dirname(__file__), "data") + + # Get MAGs fixture + self.mags = MultiMAGSequencesDirFmt( + path=p, + mode="r", + ) + + # Test `_parse_busco_params` + def test_parse_busco_params_1(self): + observed = _parse_busco_params("auto_lineage", True) + expected = ["--auto-lineage"] + self.assertSetEqual(set(observed), set(expected)) + + def test_parse_busco_params_2(self): + observed = _parse_busco_params("evalue", 0.66) + expected = ["--evalue", str(0.66)] + self.assertSetEqual(set(observed), set(expected)) + + def test_parse_busco_params_3(self): + observed = _parse_busco_params("augustus", True) + expected = ["--augustus"] + self.assertSetEqual(set(observed), set(expected)) + + def test_parse_busco_params_4(self): + observed = _parse_busco_params("lineage_dataset", "bacteria-XYZ") + expected = ["--lineage_dataset", "bacteria-XYZ"] + self.assertSetEqual(set(observed), set(expected)) + + def test_collect_summaries_and_save(self): + """ + Test for `_collect_summaries_and_save` function. + Uses data stored in ./data. Checks for data frame equality. + """ + with tempfile.TemporaryDirectory() as tmp_path: + path_to_summaries = {} + + for i in range(1, 4): + path_to_summaries[f"sample{i}"] = self.get_data_path( + filename=f"batch_summary_sample{i}.txt" + ) + + observed = _collect_summaries_and_save( + path_to_run_summaries=path_to_summaries, + all_summaries_path=os.path.join(tmp_path, "aggregated.csv"), + ) + + expected = pd.read_csv( + self.get_data_path(filename="all_batch_summaries.csv") + ) + pd.set_option('display.max_columns', None) + + try: + pd.testing.assert_frame_equal(observed, expected) + except AssertionError as e: + print(e) + self.assertTrue(False) + else: + self.assertTrue(True) + + # Test `_draw_busco_plots` + def draw_n_busco_plots(self, filename, delim): + """ + Creates plot from a table containing information about + one or more samples. Checks for the existence of the output + plots. + + Args: + filename (str): name of file in ./data to construct the images + delim (str): delimiter of `filename` + """ + # Create an empty dictionary to store the DataFrames + path_to_run_summaries = {} + + # Group the DataFrame by the 'sample_id' column + p = self.get_data_path(f"{filename}") + df = pd.read_csv(p, delimiter=delim) + grouped = df.groupby("sample_id") + + # Iterate through the groups and store each group as a + # DataFrame in the dictionary + # Creates output directory with path 'tmp' + with tempfile.TemporaryDirectory() as tmp_path: + for sample_id, group_df in grouped: + path_to_df = f"{tmp_path}/{sample_id}.csv" + group_df.to_csv(path_to_df, sep="\t", index=False) + path_to_run_summaries[sample_id] = path_to_df + + # Draw plots + paths_to_plots = _draw_busco_plots( + path_to_run_summaries=path_to_run_summaries, + plots_dir=os.path.join(tmp_path, "plots"), + ) + + # Check if busco plots are in fact generated + for _, value in paths_to_plots.items(): + self.assertTrue(os.path.exists(value)) + + def test_draw_busco_plots_multiple(self): + self.draw_n_busco_plots( + filename="all_batch_summaries.csv", delim="," + ) + + def test_draw_busco_plots_one(self): + self.draw_n_busco_plots( + filename="batch_summary_sample1.txt", delim="\t" + ) + + # Test `_draw_busco_plots_for_render` + def test_draw_busco_plots_for_render(self): + """ + Tests function `_draw_busco_plots_for_render`. + Checks for dictionary equality. + """ + # Load data + p = self.get_data_path("all_batch_summaries.csv") + all_summaries_df = pd.read_csv(p) + + # Draw plot + observed = _draw_busco_plots_for_render( + all_summaries_df, + width=600, + height=18, + titleFontSize=20, + labelFontSize=17, + ) + + # Load expected data + p = self.get_data_path("plot_as_dict.json") + with open(p, "r") as json_file: + expected = json_file.read() + + self.maxDiff = None + self.assertEqual(expected, observed) + + # Test `_draw_busco_plots` + def mock_draw_busco_plots(self, tmp_path: str, num_files: int) -> dict: + """ + Mocks the generation of sample wise plots by generating + empty files. + + Args: + tmp_path (str): Path where to write the empty files. + num_files (int): number of empty files to create, one per sample. + + Returns: + paths_to_plots (dict): dictionary with keys sample_id and value + path to empty file. + """ + # Generate random images + paths_to_plots = {} + + # Path to output + out_dir = os.path.join(tmp_path, "zip_this_dir") + os.makedirs(out_dir) + + # Loop to create the empty files + for i in range(num_files): + # Specify the name of each empty file + file_name = f"empty_file_{i}.svg" + + # Combine the directory path and file name to create the full + # file path + file_path = os.path.join(out_dir, file_name) + paths_to_plots[f"empty_file_{i}"] = file_path + + # Create an empty file + with open(file_path, 'w'): + pass + + return paths_to_plots + + def test_zip_busco_plots_multiple(self): + """ + Checks for existence of zip file. + """ + with tempfile.TemporaryDirectory() as tmp_path: + paths_to_plots = self.mock_draw_busco_plots( + num_files=6, tmp_path=tmp_path + ) + + # Zip graphs for user download + zip_path = os.path.join(tmp_path, "busco_plots.zip") + _zip_busco_plots(paths_to_plots=paths_to_plots, zip_path=zip_path) + + # Check for existence of file + self.assertTrue(zipfile.is_zipfile(zip_path)) + + def test_zip_busco_plots_one(self): + """ + Checks for existence of zip file. + """ + with tempfile.TemporaryDirectory() as tmp_path: + paths_to_plots = self.mock_draw_busco_plots( + num_files=1, tmp_path=tmp_path + ) + + # Zip graphs for user download + zip_path = os.path.join(tmp_path, "busco_plots.zip") + _zip_busco_plots(paths_to_plots=paths_to_plots, zip_path=zip_path) + + # Check for existence of file + self.assertTrue(zipfile.is_zipfile(zip_path)) + + @patch('subprocess.run') + def test_run_busco(self, subp_run): + """ + Test function `_run_busco`. Checks for dictionary equality. + """ + output_dir = self.get_data_path("busco_output") + sample_ids = os.listdir(output_dir) + + # Initialize assertion objects + expected = {} + calls = [] + + # Define command arguments + fake_props = ["--a", "--b", "0.6"] + + # Fabricate list of calls and the expected output + for sample_id in sample_ids: + # Make a dictionary to compare output + p = os.path.join(output_dir, sample_id, "batch_summary.txt") + expected[sample_id] = p + + # Append call to list of calls to assert the patching + calls.append(call( + [ + "busco", + "--a", + "--b", "0.6", + "--in", self.get_data_path(f"{sample_id}"), + "--out_path", output_dir, + "-o", sample_id + ], + check=True + )) + + # Run busco and save paths to run summaries + observed = _run_busco( + output_dir=output_dir, + mags=self.mags, + params=fake_props, + ) + + # Assert output + self.assertDictEqual(expected, observed) + + # Check for appropiate calls + subp_run.assert_has_calls(calls, any_order=True) + + @patch("subprocess.run") + def test_run_busco_exception(self, subp_run): + """ + Test function `_run_busco`. Checks for a raised exception. + """ + with tempfile.TemporaryDirectory() as tmp_path: + # Define command arguments + fake_props = ["--a", "--b", "0.6"] + output_dir = os.path.join(tmp_path, "busco_output") + + with self.assertRaises(FileNotFoundError): + # Run busco and save paths to run summaries + _ = _run_busco( + output_dir=output_dir, + mags=self.mags, + params=fake_props, + ) + + # Assert that the patch was called once. + cmd = [ + "busco", + "--a", + "--b", "0.6", + "--in", self.get_data_path("sample1"), + "--out_path", output_dir, + "-o", "sample1" + ] + subp_run.assert_called_once_with(cmd, check=True) + + def test_parse_df_columns(self): + # This side effect will return the all_summaries_dfs + p1 = self.get_data_path("all_batch_summaries.csv") + observed = pd.read_csv(p1) + observed = _parse_df_columns(observed) + + p2 = self.get_data_path("all_batch_summaries_formatted.csv") + expected = pd.read_csv(p2) + + try: + pd.testing.assert_frame_equal(observed, expected) + except AssertionError as e: + print(e) + self.assertTrue(False) + else: + self.assertTrue(True) diff --git a/q2_moshpit/busco/utils.py b/q2_moshpit/busco/utils.py new file mode 100644 index 00000000..c3e1a394 --- /dev/null +++ b/q2_moshpit/busco/utils.py @@ -0,0 +1,495 @@ +import os +import q2templates +from shutil import copytree +import pandas as pd +import altair as alt +import matplotlib.pyplot as plt +from zipfile import ZipFile +from .._utils import run_command +from copy import deepcopy +from typing import List, Dict +from q2_types_genomics.per_sample_data._format import MultiMAGSequencesDirFmt + + +arguments_with_hyphens = { + "auto_lineage": "auto-lineage", + "auto_lineage_euk": "auto-lineage-euk", + "auto_lineage_prok": "auto-lineage-prok", + "list_datasets": "list-datasets", + "update_data": "update-data", +} + + +def _parse_busco_params(arg_key, arg_val) -> List[str]: + """Creates a list with argument and its value to be consumed by BUSCO. + Argument names will be converted to command line parameters by + appending a '--' prefix and in some cases replacing "_" for "-" + (only for e.g. `arguments_with_hyphens`) + + Args: + arg_key (str): Argument name. + arg_val: Argument value. + Returns: + [converted_arg, arg_value]: List containing a prepared command line + parameter and, optionally, its value. + """ + + # If the key is in arguments_with_hyphens, modify key + if arg_key in arguments_with_hyphens.keys(): + arg_key = arguments_with_hyphens[arg_key] + + if isinstance(arg_val, bool): + return [f"--{arg_key}"] + else: + return [f"--{arg_key}", str(arg_val)] + + +def _draw_busco_plots_for_render( + df: pd.DataFrame, + width: int = None, + height: int = None, + labelFontSize: int = None, + titleFontSize: int = None, +) -> str: + """ + Draws a horizontal normalized bar plot for every sample for which BUSCO was + run. Each barplot shows the BUSCO results for each of the MAGs in the + sample. The plots for all samples are drawn in one composite plot which + is then returned as a dictionary for rendering (but casted to a string). + + Args: + df (pd.DataFrame): tabular batch summary for all samples + width (int): width of the plot + height (int): height of each bar in the plot + labelFontSize (int): size of the labels in plot + titleFontSize (int): size of titles in plot + + Output: + Output plot in dictionary from casted to a string. + """ + + # Format data frame + df = _parse_df_columns(df) + + # Get number of samples + n_samples = len(df["mag_id"].unique()) + + # Format data for plotting + busco_plot_data = pd.melt( + df, + id_vars=["sample_id", "mag_id", "dataset", "n_markers"], + value_vars=["single", "duplicated", "fragmented", "missing"], + value_name="BUSCO_percentage", + var_name="category", + ) + + secondary_plot_data = df[[ + "sample_id", + "mag_id", + 'scaffold_n50', + 'contigs_n50', + 'percent_gaps', + 'number_of_scaffolds', + ]] + + # Specify order + mapping = {"single": 1, "duplicated": 2, "fragmented": 3, "missing": 4} + busco_plot_data["order"] = busco_plot_data["category"].map(mapping) + + # Estimate fraction of sequences in each BUSCO category + busco_plot_data["fracc_markers"] = ( + "~" + + round( + busco_plot_data["BUSCO_percentage"] * + busco_plot_data["n_markers"] / 100 + ).map(int).map(str) + + "/" + busco_plot_data["n_markers"].map(str) + ) + + # Plot + domain = ["single", "duplicated", "fragmented", "missing"] + range_ = ["#1E90FF", "#87CEFA", "#FFA500", "#FF7F50"] + + busco_plot = ( + alt.Chart(busco_plot_data) + .mark_bar() + .encode( + x=alt.X( + "sum(BUSCO_percentage)", + stack="normalize", + title="BUSCO fraction" + ), + y=alt.Y("mag_id", axis=alt.Axis(title="MAG ID")), + color=alt.Color( + "category", + scale=alt.Scale(domain=domain, range=range_), + legend=alt.Legend(title="BUSCO Category", orient="top"), + ), + order=alt.Order("order", sort="ascending"), + tooltip=[ + alt.Tooltip("sample_id", title="Sample ID"), + alt.Tooltip("mag_id", title="MAG ID"), + alt.Tooltip("dataset", title="Lineage dataset"), + alt.Tooltip( + "fracc_markers", + title="Aprox. number of markers in this category" + ), + alt.Tooltip("BUSCO_percentage", title="% BUSCOs"), + ], + opacity=alt.value(0.85), + ) + .properties(width=width, height=height * n_samples) + .facet(row=alt.Row("sample_id", title="Sample ID")) + .resolve_scale(y="independent") + ) + + # Secondary plot + # Drop down menu + dropdown = alt.binding_select( + options=[ + 'scaffold_n50', + 'contigs_n50', + 'percent_gaps', + 'number_of_scaffolds', + ], + name="Assambly Statistics: " + ) + + xcol_param = alt.param( + value='scaffold_n50', + bind=dropdown + ) + + secondary_plot = alt.Chart(secondary_plot_data).mark_bar().encode( + x=alt.X('x:Q').title('Assembly Statistic'), + y=alt.Y('mag_id', axis=None), + tooltip=[alt.Tooltip('x:Q', title="value")], + opacity=alt.value(0.85) + ).transform_calculate( + x=f'datum[{xcol_param.name}]' + ).add_params( + xcol_param + ).properties( + width=width, + height=height * n_samples + ).facet( + row=alt.Row( + "sample_id", + title=None, + header=alt.Header(labelFontSize=0) + ) + ).resolve_scale( + y="independent" + ) + + # concatenate plots horizontally + output_plot = alt.hconcat( + busco_plot, secondary_plot, spacing=3 + ).configure_axis( + labelFontSize=labelFontSize, titleFontSize=titleFontSize + ).configure_legend( + labelFontSize=labelFontSize, titleFontSize=titleFontSize + ).configure_header( + labelFontSize=labelFontSize, titleFontSize=titleFontSize + ) + + # Return + return output_plot.to_json() + + +def _run_busco( + output_dir: str, mags: MultiMAGSequencesDirFmt, params: List[str] +) -> Dict[str, str]: + """Evaluates bins for all samples using BUSCO. + + Args: + output_dir (str): Location where the final results should be stored. + mags (MultiMAGSequencesDirFmt): The mags to be analyzed. + params (List[str]): List of parsed arguments to pass to BUSCO. + + Returns: + dict: Dictionary where keys are sample IDs and values are the paths + to the `batch_summary.txt` generated by BUSCO, e.g. + `tmp/busco_output//batch_summary.txt`. + """ + + # Define base command + base_cmd = ["busco", *params] + + # Creates pandas df "manifest" from bins + manifest: pd.DataFrame = mags.manifest.view(pd.DataFrame) + + # Make a new column in manifest with the directories of files + # listed in column "filename" + manifest["sample_dir"] = manifest.filename.apply( + lambda x: os.path.dirname(x) + ) + + # numpy.ndarray with unique dirs + sample_dirs = manifest["sample_dir"].unique() + + # Initialize dictionary with paths to run summaries + path_to_run_summaries = {} + + # For every unique sample dir run busco + for sample_dir in sample_dirs: + # Get sample id from tip dirname + sample = os.path.split(sample_dir)[-1] + + # Deep copy base command extend it with the sample specific + # info and run it + cmd = deepcopy(base_cmd) + cmd.extend([ + "--in", + sample_dir, + "--out_path", + output_dir, + "-o", + sample + ]) + run_command(cmd) + + # Check for output + path_to_run_summary = os.path.join( + output_dir, sample, "batch_summary.txt" + ) + if os.path.isfile(path_to_run_summary): + path_to_run_summaries[sample] = path_to_run_summary + else: + raise FileNotFoundError( + f"BUSCO batch summary file {path_to_run_summary} not found." + ) + + # Return a dict where key is sample id and value is path + # "tmp/sample_id/batch_summary.txt" + return path_to_run_summaries + + +def _draw_busco_plots( + path_to_run_summaries: dict, plots_dir: str + ) -> Dict[str, str]: + """ + Generates plots for all `batch_summary.txt` (one for every sample) + and saves them to `plots_dir`. + + Args: + plots_dir (str): Path where the results should be stored. + dict: Dictionary where keys are sample IDs and values are the paths + to the `batch_summary.txt` generated by BUSCO, e.g. + `tmp/busco_output//batch_summary.txt`. + + Returns: + dict: Dictionary where keys are sample IDs and values are the paths + to the generated plots, e.g. + `tmp/plots//plot_batch_summary.svg`. + """ + + # Initialize output dictionary + paths_to_plots = {} + + # For every sample make a plot + for sample_id, path_to_summary in path_to_run_summaries.items(): + # Read in text file as data frame + df = pd.read_csv(filepath_or_buffer=path_to_summary, sep="\t") + + # Format data frame + df = _parse_df_columns(df) + + # Create horizontal stacked bar plot + height = 0.9 # Height of the bars + a = 0.7 + + for i, mag_id in enumerate(df.mag_id.unique()): + row = df[df["mag_id"] == mag_id] + plt.barh( + i, row["missing_"], height, color='r', + label='Missing', alpha=a + ) + plt.barh( + i, row["fragmented_"], height, color='tab:orange', + label='Fragmented', alpha=a + ) + plt.barh( + i, row["duplicated_"], height, color='tab:cyan', + label='Duplicated', alpha=a + ) + plt.barh( + i, row["single_"], height, color='tab:blue', + label='Single', alpha=a + ) + + # Add vertical lines and adjust x-axis limit + plt.gca().xaxis.grid(True, linestyle='--', linewidth=0.5) + plt.xlim(0, 100) + + # Add labels, title, and legend + plt.ylabel("MAG ID's") + plt.xlabel('% BUSCO') + plt.title('') + + plt.yticks(range(len(df.mag_id.unique())), df.mag_id.unique()) + plt.legend( + ['Missing', 'Fragmented', 'Duplicated', 'Single'], loc='lower left' + ) + + # Save figure to file + output_name = os.path.join( + plots_dir, sample_id, "plot_batch_summary.svg" + ) + os.makedirs(os.path.dirname(output_name), exist_ok=True) + plt.savefig(output_name, format="svg", bbox_inches='tight') + + # Save path to dictionary + paths_to_plots[sample_id] = output_name + + # Return paths to all generated plots + return paths_to_plots + + +def _zip_busco_plots(paths_to_plots: dict, zip_path: str) -> None: + """ + Creates a single zip archive containing all plots produced by BUSCO, + one for each sample. + + Args: + paths_to_plots: Dictionary mapping sample to plot path. + zip_path (str): The path to the zip archive. + """ + + # Get shortest common path between files + common_path = os.path.commonpath(paths_to_plots.values()) + + # Write to zipfile + with ZipFile(zip_path, "w") as zf: + for _, path_to_plot in paths_to_plots.items(): + arcname = os.path.relpath(path_to_plot, common_path) + zf.write(path_to_plot, arcname=arcname) + + +def _collect_summaries_and_save( + all_summaries_path: str, + path_to_run_summaries: dict + ) -> pd.DataFrame: + """ + Reads-in the sample wise summaries and concatenates them it one + pd.DataFrame, which is saved to file. + + Args: + all_summaries_path (str): Directory path where to write the + pd.DataFrame + path_to_run_summaries (dict): dict where key is sample id + and value is path "tmp/sample_id/batch_summary.txt" + + Returns: + all_summaries_df (pd.DataFrame): Data frame composed of the individual + run summaries. + """ + + all_summaries_list = [] + for sample_id, path_to_summary in path_to_run_summaries.items(): + df = pd.read_csv(filepath_or_buffer=path_to_summary, sep="\t") + df["sample_id"] = sample_id + all_summaries_list.append(df) + + # Concatenate + all_summaries_df = pd.concat(all_summaries_list, ignore_index=True) + + # Save to file + all_summaries_df.to_csv(all_summaries_path, index=False) + + return all_summaries_df + + +def _render_html( + output_dir: str, + all_summaries_df: pd.DataFrame, + ): + """ + Renders an qiime2 html file with the plots summarizing the BUSCO output. + + Args: + output_dir (str): Directory path where to write the pd.DataFrame + all_summaries_df (pd.DataFrame): Data frame composed of the individual + run summaries. + """ + # Prepare context for jinja2 template + context = { + "vega_plots_overview": _draw_busco_plots_for_render( + all_summaries_df, + width=600, + height=9, + titleFontSize=20, + labelFontSize=17, + ), + } + + # Copy BUSCO results from tmp dir to output_dir + moshpit_path = os.path.dirname( # Path to parent dir, q2_moshpit + os.path.dirname(__file__) + ) + TEMPLATES = os.path.join(moshpit_path, "assets") + index = os.path.join(TEMPLATES, "busco", "index.html") + copytree( + src=os.path.join(TEMPLATES, "busco"), + dst=output_dir, + dirs_exist_ok=True + ) + + # Render + q2templates.render(index, output_dir, context=context) + + # Remove unwanted files + # until Bootstrap 3 is replaced with v5, remove the v3 scripts as + # the HTML files are adjusted to work with v5 + os.remove( + os.path.join( + output_dir, "q2templateassets", "css", "bootstrap.min.css" + ) + ) + os.remove( + os.path.join( + output_dir, "q2templateassets", "js", "bootstrap.min.js" + ) + ) + + +def _parse_df_columns(df: pd.DataFrame) -> pd.DataFrame: + """ + Parses the columns of a batch_summery data frame generated by busco + and formats its columns names such that: + + - all "-" are replaces by "_" + - they contain only alphanumeric characters + - everything is in lowercase + + Additionally, it creates a new column "mag_id" which contains the MAG ID. + It also casts the "percent_gaps" column from string to float. + + Args: + df (pd.DataFrame): Unformatted data frame + + Returns: + df (pd.DataFrame): Formatted data frame + """ + + # Clean column names + df.columns = df.columns.str.replace(" ", "_") + df.columns = df.columns.str.replace("[^a-zA-Z0-9_]", "", regex=True) + df.columns = df.columns.str.lower() + + # Rename column "input_file" + df["mag_id"] = df["input_file"].str.split(".", expand=True)[0] + + # Cast into percent_gaps col to float + df["percent_gaps"] = df["percent_gaps"].str.split( + '%', expand=True + )[0].map(float) + + # Make new columns for downloadable plots + # (only used in _draw_busco_plots) + df["single_"] = df["single"] + df["duplicated_"] = df["single_"] + df["duplicated"] + df["fragmented_"] = df["duplicated_"] + df["fragmented"] + df["missing_"] = df["fragmented_"] + df['missing'] + + return df diff --git a/q2_moshpit/citations.bib b/q2_moshpit/citations.bib index 37009f8b..56d4110e 100644 --- a/q2_moshpit/citations.bib +++ b/q2_moshpit/citations.bib @@ -50,3 +50,23 @@ @article{kang2019 doi = {10.7717/peerj.7359}, keywords = {Clustering,Metagenome binning,Metagenomics} } + +@article{manni_busco_2021, + title = {{BUSCO} {Update}: {Novel} and {Streamlined} {Workflows} along with {Broader} and {Deeper} {Phylogenetic} {Coverage} for {Scoring} of {Eukaryotic}, {Prokaryotic}, and {Viral} {Genomes}}, + volume = {38}, + issn = {1537-1719}, + shorttitle = {{BUSCO} {Update}}, + url = {https://academic.oup.com/mbe/article/38/10/4647/6329644}, + doi = {10.1093/molbev/msab199}, + abstract = {Methods for evaluating the quality of genomic and metagenomic data are essential to aid genome assembly procedures and to correctly interpret the results of subsequent analyses. BUSCO estimates the completeness and redundancy of processed genomic data based on universal single-copy orthologs. Here, we present new functionalities and major improvements of the BUSCO software, as well as the renewal and expansion of the underlying data sets in sync with the OrthoDB v10 release. Among the major novelties, BUSCO now enables phylogenetic placement of the input sequence to automatically select the most appropriate BUSCO data set for the assessment, allowing the analysis of metagenomeassembled genomes of unknown origin. A newly introduced genome workflow increases the efficiency and runtimes especially on large eukaryotic genomes. BUSCO is the only tool capable of assessing both eukaryotic and prokaryotic species, and can be applied to various data types, from genome assemblies and metagenomic bins, to transcriptomes and gene sets.}, + language = {en}, + number = {10}, + urldate = {2023-08-09}, + journal = {Molecular Biology and Evolution}, + author = {Manni, Mosè and Berkeley, Matthew R and Seppey, Mathieu and Simão, Felipe A and Zdobnov, Evgeny M}, + editor = {Kelley, Joanna}, + month = sep, + year = {2021}, + pages = {4647--4654}, + file = {Manni et al. - 2021 - BUSCO Update Novel and Streamlined Workflows alon.pdf:/Users/santiago/Zotero/storage/SQ2VFGPF/Manni et al. - 2021 - BUSCO Update Novel and Streamlined Workflows alon.pdf:application/pdf}, +} diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index fe5d5722..37335e67 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -11,7 +11,8 @@ from q2_types.feature_data import FeatureData, Sequence, Taxonomy from q2_types.feature_table import FeatureTable, Frequency, PresenceAbsence from q2_types.per_sample_sequences import ( - SequencesWithQuality, PairedEndSequencesWithQuality + SequencesWithQuality, + PairedEndSequencesWithQuality, ) from q2_types.sample_data import SampleData from q2_types.feature_map import FeatureMap, MAGtoContigs @@ -30,107 +31,107 @@ from q2_types_genomics.per_sample_data._type import AlignmentMap from q2_types_genomics.reference_db import ReferenceDB, Diamond, Eggnog -citations = Citations.load('citations.bib', package='q2_moshpit') +citations = Citations.load("citations.bib", package="q2_moshpit") kraken2_params = { - 'threads': Int % Range(1, None), - 'confidence': Float % Range(0, 1, inclusive_end=True), - 'minimum_base_quality': Int % Range(0, None), - 'memory_mapping': Bool, - 'minimum_hit_groups': Int % Range(1, None), - 'quick': Bool, - 'report_minimizer_data': Bool + "threads": Int % Range(1, None), + "confidence": Float % Range(0, 1, inclusive_end=True), + "minimum_base_quality": Int % Range(0, None), + "memory_mapping": Bool, + "minimum_hit_groups": Int % Range(1, None), + "quick": Bool, + "report_minimizer_data": Bool, } kraken2_param_descriptions = { - 'threads': 'Number of threads.', - 'confidence': 'Confidence score threshold.', - 'minimum_base_quality': 'Minimum base quality used in classification.' - ' Only applies when reads are used as input.', - 'memory_mapping': 'Avoids loading the database into RAM.', - 'minimum_hit_groups': 'Minimum number of hit groups (overlapping ' - 'k-mers sharing the same minimizer).', - 'quick': 'Quick operation (use first hit or hits).', - 'report_minimizer_data': 'Include number of read-minimizers per-taxon and' - ' unique read-minimizers per-taxon in the repot.' + "threads": "Number of threads.", + "confidence": "Confidence score threshold.", + "minimum_base_quality": "Minimum base quality used in classification. " + "Only applies when reads are used as input.", + "memory_mapping": "Avoids loading the database into RAM.", + "minimum_hit_groups": "Minimum number of hit groups (overlapping " + "k-mers sharing the same minimizer).", + "quick": "Quick operation (use first hit or hits).", + "report_minimizer_data": "Include number of read-minimizers per-taxon and " + "unique read-minimizers per-taxon in the repot.", } plugin = Plugin( - name='moshpit', + name="moshpit", version=q2_moshpit.__version__, website="https://github.com/bokulich-lab/q2-moshpit", - package='q2_moshpit', + package="q2_moshpit", description=( - 'MOdular SHotgun metagenome Pipelines with Integrated ' - 'provenance Tracking: QIIME 2 plugin gor metagenome analysis with' - 'tools for genome binning and functional annotation.'), - short_description='QIIME 2 plugin for metagenome analysis.', + "MOdular SHotgun metagenome Pipelines with Integrated " + "provenance Tracking: QIIME 2 plugin gor metagenome analysis with" + "tools for genome binning and functional annotation." + ), + short_description="QIIME 2 plugin for metagenome analysis.", ) -importlib.import_module('q2_moshpit.eggnog') -importlib.import_module('q2_moshpit.metabat2') +importlib.import_module("q2_moshpit.eggnog") +importlib.import_module("q2_moshpit.metabat2") plugin.methods.register_function( function=q2_moshpit.metabat2.bin_contigs_metabat, inputs={ - 'contigs': SampleData[Contigs], - 'alignment_maps': SampleData[AlignmentMap] + "contigs": SampleData[Contigs], + "alignment_maps": SampleData[AlignmentMap] }, parameters={ - 'min_contig': Int % Range(1500, None), - 'max_p': Int % Range(1, 100), - 'min_s': Int % Range(1, 100), - 'max_edges': Int % Range(1, None), - 'p_tnf': Int % Range(0, 100), - 'no_add': Bool, - 'min_cv': Int % Range(1, None), - 'min_cv_sum': Int % Range(1, None), - 'min_cls_size': Int % Range(1, None), - 'num_threads': Int % Range(0, None), - 'seed': Int % Range(0, None), - 'debug': Bool, - 'verbose': Bool + "min_contig": Int % Range(1500, None), + "max_p": Int % Range(1, 100), + "min_s": Int % Range(1, 100), + "max_edges": Int % Range(1, None), + "p_tnf": Int % Range(0, 100), + "no_add": Bool, + "min_cv": Int % Range(1, None), + "min_cv_sum": Int % Range(1, None), + "min_cls_size": Int % Range(1, None), + "num_threads": Int % Range(0, None), + "seed": Int % Range(0, None), + "debug": Bool, + "verbose": Bool, }, outputs=[ - ('mags', SampleData[MAGs]), - ('contig_map', FeatureMap[MAGtoContigs]), - ('unbinned_contigs', SampleData[Contigs % Properties('unbinned')]) + ("mags", SampleData[MAGs]), + ("contig_map", FeatureMap[MAGtoContigs]), + ("unbinned_contigs", SampleData[Contigs % Properties("unbinned")]), ], input_descriptions={ - 'contigs': 'Placeholder.', - 'alignment_maps': 'Placeholder.' + "contigs": "Placeholder.", "alignment_maps": "Placeholder." }, parameter_descriptions={ - 'min_contig': 'Minimum size of a contig for binning.', - 'max_p': 'Percentage of "good" contigs considered for binning ' - 'decided by connection among contigs. The greater, the ' - 'more sensitive.', - 'min_s': 'Minimum score of a edge for binning. The greater, the ' - 'more specific.', - 'max_edges': 'Maximum number of edges per node. The greater, the ' - 'more sensitive.', - 'p_tnf': 'TNF probability cutoff for building TNF graph. Use it to ' - 'skip the preparation step. (0: auto)', - 'no_add': 'Turning off additional binning for lost or small contigs.', - 'min_cv': 'Minimum mean coverage of a contig in each library ' - 'for binning.', - 'min_cv_sum': 'Minimum total effective mean coverage of a contig ' - '(sum of depth over minCV) for binning.', - 'min_cls_size': 'Minimum size of a bin as the output.', - 'num_threads': 'Number of threads to use (0: use all cores).', - 'seed': 'For exact reproducibility. (0: use random seed)', - 'debug': 'Debug output.', - 'verbose': 'Verbose output.' + "min_contig": "Minimum size of a contig for binning.", + "max_p": 'Percentage of "good" contigs considered for binning ' + "decided by connection among contigs. The greater, the " + "more sensitive.", + "min_s": "Minimum score of a edge for binning. The greater, the " + "more specific.", + "max_edges": "Maximum number of edges per node. The greater, the " + "more sensitive.", + "p_tnf": "TNF probability cutoff for building TNF graph. Use it to " + "skip the preparation step. (0: auto)", + "no_add": "Turning off additional binning for lost or small contigs.", + "min_cv": "Minimum mean coverage of a contig in each library for " + "binning.", + "min_cv_sum": "Minimum total effective mean coverage of a contig " + "(sum of depth over minCV) for binning.", + "min_cls_size": "Minimum size of a bin as the output.", + "num_threads": "Number of threads to use (0: use all cores).", + "seed": "For exact reproducibility. (0: use random seed)", + "debug": "Debug output.", + "verbose": "Verbose output.", }, output_descriptions={ - 'mags': 'The resulting MAGs.', - 'contig_map': 'Mapping of MAG identifiers to the contig identifiers ' - 'contained in each MAG.', - 'unbinned_contigs': 'Contigs that were not binned into any MAG.' - }, - name='Bin contigs into MAGs using MetaBAT 2.', - description='This method uses MetaBAT 2 to bin provided contigs ' - 'into MAGs.', - citations=[citations["kang2019"]] + "mags": "The resulting MAGs.", + "contig_map": "Mapping of MAG identifiers to the contig identifiers " + "contained in each MAG.", + "unbinned_contigs": "Contigs that were not binned into any MAG.", + }, + name="Bin contigs into MAGs using MetaBAT 2.", + description="This method uses MetaBAT 2 to bin provided contigs " + "into MAGs.", + citations=[citations["kang2019"]], ) T_kraken_in, T_kraken_out_rep, T_kraken_out_hits = TypeMap({ @@ -157,8 +158,8 @@ }, parameters=kraken2_params, outputs=[ - ('reports', T_kraken_out_rep), - ('hits', T_kraken_out_hits), + ("reports", T_kraken_out_rep), + ("hits", T_kraken_out_hits), ], input_descriptions={ "seqs": "The sequences to be classified. Single-end or paired-end " @@ -167,113 +168,123 @@ }, parameter_descriptions=kraken2_param_descriptions, output_descriptions={ - 'reports': 'Reports produced by Kraken2.', - 'hits': 'Output files produced by Kraken2.', + "reports": "Reports produced by Kraken2.", + "hits": "Output files produced by Kraken2.", }, - name='Perform taxonomic classification of reads or MAGs using Kraken 2.', - description='This method uses Kraken 2 to classify provided NGS reads ' - 'or MAGs into taxonomic groups.', - citations=[citations["wood2019"]] + name="Perform taxonomic classification of reads or MAGs using Kraken 2.", + description="This method uses Kraken 2 to classify provided NGS reads " + "or MAGs into taxonomic groups.", + citations=[citations["wood2019"]], ) plugin.methods.register_function( function=q2_moshpit.kraken2.bracken.estimate_bracken, inputs={ - "kraken_reports": SampleData[Kraken2Reports % Properties('reads')], - "bracken_db": BrackenDB + "kraken_reports": SampleData[Kraken2Reports % Properties("reads")], + "bracken_db": BrackenDB, }, parameters={ - 'threshold': Int % Range(0, None), - 'read_len': Int % Range(0, None), - 'level': Str % Choices(['D', 'P', 'C', 'O', 'F', 'G', 'S']) + "threshold": Int % Range(0, None), + "read_len": Int % Range(0, None), + "level": Str % Choices(["D", "P", "C", "O", "F", "G", "S"]), }, outputs=[ - ('reports', SampleData[Kraken2Reports % Properties('bracken')]), - ('taxonomy', FeatureData[Taxonomy]), - ('table', FeatureTable[Frequency]) + ("reports", SampleData[Kraken2Reports % Properties("bracken")]), + ("taxonomy", FeatureData[Taxonomy]), + ("table", FeatureTable[Frequency]), ], input_descriptions={ "kraken_reports": "Reports produced by Kraken2.", - "bracken_db": "Bracken database." + "bracken_db": "Bracken database.", }, parameter_descriptions={ - 'threshold': 'Bracken: number of reads required PRIOR to abundance ' - 'estimation to perform re-estimation.', - 'read_len': 'Bracken: read length to get all classifications for.', - 'level': 'Bracken: taxonomic level to estimate abundance at.' + "threshold": "Bracken: number of reads required PRIOR to abundance " + "estimation to perform re-estimation.", + "read_len": "Bracken: read length to get all classifications for.", + "level": "Bracken: taxonomic level to estimate abundance at.", }, output_descriptions={ - 'reports': 'Reports modified by Bracken.', + "reports": "Reports modified by Bracken.", }, - name='Perform read abundance re-estimation using Bracken.', - description='This method uses Bracken to re-estimate read abundances.', - citations=[citations["wood2019"]] + name="Perform read abundance re-estimation using Bracken.", + description="This method uses Bracken to re-estimate read abundances.", + citations=[citations["wood2019"]], ) plugin.methods.register_function( function=q2_moshpit.kraken2.build_kraken_db, - inputs={ - "seqs": List[FeatureData[Sequence]] - }, + inputs={"seqs": List[FeatureData[Sequence]]}, parameters={ - 'collection': Str % Choices( - ['viral', 'minusb', 'standard', 'standard8', - 'standard16', 'pluspf', 'pluspf8', 'pluspf16', - 'pluspfp', 'pluspfp8', 'pluspfp16', 'eupathdb'], + "collection": Str + % Choices( + [ + "viral", + "minusb", + "standard", + "standard8", + "standard16", + "pluspf", + "pluspf8", + "pluspf16", + "pluspfp", + "pluspfp8", + "pluspfp16", + "eupathdb", + ], ), - 'threads': Int % Range(1, None), - 'kmer_len': Int % Range(1, None), - 'minimizer_len': Int % Range(1, None), - 'minimizer_spaces': Int % Range(1, None), - 'no_masking': Bool, - 'max_db_size': Int % Range(0, None), - 'use_ftp': Bool, - 'load_factor': Float % Range(0, 1), - 'fast_build': Bool, - 'read_len': List[Int % Range(1, None)], + "threads": Int % Range(1, None), + "kmer_len": Int % Range(1, None), + "minimizer_len": Int % Range(1, None), + "minimizer_spaces": Int % Range(1, None), + "no_masking": Bool, + "max_db_size": Int % Range(0, None), + "use_ftp": Bool, + "load_factor": Float % Range(0, 1), + "fast_build": Bool, + "read_len": List[Int % Range(1, None)], }, outputs=[ - ('kraken2_database', Kraken2DB), - ('bracken_database', BrackenDB), + ("kraken2_database", Kraken2DB), + ("bracken_database", BrackenDB), ], input_descriptions={ "seqs": "Sequences to be added to the Kraken 2 database." }, parameter_descriptions={ - 'collection': 'Name of the database collection to be fetched. ' - 'Please check https://benlangmead.github.io/aws-' - 'indexes/k2 for the description of the available ' - 'options.', - 'threads': 'Number of threads. Only applicable when building a ' - 'custom database.', - 'kmer_len': 'K-mer length in bp/aa.', - 'minimizer_len': 'Minimizer length in bp/aa.', - 'minimizer_spaces': 'Number of characters in minimizer that are ' - 'ignored in comparisons.', - 'no_masking': 'Avoid masking low-complexity sequences prior to ' - 'building; masking requires dustmasker or segmasker ' - 'to be installed in PATH', - 'max_db_size': 'Maximum number of bytes for Kraken 2 hash table; ' - 'if the estimator determines more would normally be ' - 'needed, the reference library will be downsampled ' - 'to fit.', - 'use_ftp': 'Use FTP for downloading instead of RSYNC.', - 'load_factor': 'Proportion of the hash table to be populated.', - 'fast_build': 'Do not require database to be deterministically ' - 'built when using multiple threads. This is faster, ' - 'but does introduce variability in minimizer/LCA pairs.', - 'read_len': 'Ideal read lengths to be used while building the Bracken ' - 'database.' + "collection": "Name of the database collection to be fetched. " + "Please check https://benlangmead.github.io/aws-" + "indexes/k2 for the description of the available " + "options.", + "threads": "Number of threads. Only applicable when building a " + "custom database.", + "kmer_len": "K-mer length in bp/aa.", + "minimizer_len": "Minimizer length in bp/aa.", + "minimizer_spaces": "Number of characters in minimizer that are " + "ignored in comparisons.", + "no_masking": "Avoid masking low-complexity sequences prior to " + "building; masking requires dustmasker or segmasker " + "to be installed in PATH", + "max_db_size": "Maximum number of bytes for Kraken 2 hash table; " + "if the estimator determines more would normally be " + "needed, the reference library will be downsampled " + "to fit.", + "use_ftp": "Use FTP for downloading instead of RSYNC.", + "load_factor": "Proportion of the hash table to be populated.", + "fast_build": "Do not require database to be deterministically " + "built when using multiple threads. This is faster, " + "but does introduce variability in minimizer/LCA pairs.", + "read_len": "Ideal read lengths to be used while building the Bracken " + "database.", }, output_descriptions={ - 'kraken2_database': 'Kraken2 database.', - 'bracken_database': 'Bracken database.' - }, - name='Build Kraken 2 database.', - description='This method builds a Kraken 2/Bracken databases from ' - 'provided DNA sequences or simply fetches pre-built ' - 'versions from an online resource.', - citations=[citations["wood2019"], citations["lu2017"]] + "kraken2_database": "Kraken2 database.", + "bracken_database": "Bracken database.", + }, + name="Build Kraken 2 database.", + description="This method builds a Kraken 2/Bracken databases from " + "provided DNA sequences or simply fetches pre-built " + "versions from an online resource.", + citations=[citations["wood2019"], citations["lu2017"]], ) plugin.methods.register_function( @@ -311,37 +322,33 @@ plugin.methods.register_function( function=q2_moshpit.kraken2.kraken2_to_features, - inputs={ - 'reports': SampleData[Kraken2Reports] - }, + inputs={"reports": SampleData[Kraken2Reports]}, parameters={ - 'coverage_threshold': Float % Range(0, 100, inclusive_end=True) + "coverage_threshold": Float % Range(0, 100, inclusive_end=True) }, outputs=[ - ('table', FeatureTable[PresenceAbsence]), - ('taxonomy', FeatureData[Taxonomy]) + ("table", FeatureTable[PresenceAbsence]), + ("taxonomy", FeatureData[Taxonomy]), ], - input_descriptions={ - 'reports': 'Per-sample Kraken 2 reports.' - }, + input_descriptions={"reports": "Per-sample Kraken 2 reports."}, parameter_descriptions={ - 'coverage_threshold': 'The minimum percent coverage required to' - ' produce a feature.' + "coverage_threshold": "The minimum percent coverage required to " + "produce a feature." }, output_descriptions={ - 'table': 'A presence/absence table of selected features. The features' - ' are not of even ranks, but will be the most specific rank' - ' available.', - 'taxonomy': 'Infra-clade ranks are ignored ' - 'unless they are strain-level. Missing internal ranks ' - 'are annotated by their next most specific rank, ' - 'with the exception of k__Bacteria and k__Archaea which ' - 'match their domain\'s name.', - }, - name='Select downstream features from Kraken 2', - description='Convert a Kraken 2 report, which is an annotated NCBI ' - 'taxonomy tree into generic artifacts for downstream ' - 'analyses.' + "table": "A presence/absence table of selected features. The features " + "are not of even ranks, but will be the most specific rank " + "available.", + "taxonomy": "Infra-clade ranks are ignored " + "unless they are strain-level. Missing internal ranks " + "are annotated by their next most specific rank, " + "with the exception of k__Bacteria and k__Archaea which " + "match their domain's name.", + }, + name="Select downstream features from Kraken 2", + description="Convert a Kraken 2 report, which is an annotated NCBI " + "taxonomy tree into generic artifacts for downstream " + "analyses.", ) plugin.methods.register_function( @@ -366,7 +373,7 @@ # 'taxonomic assignments of its contigs. ' }, output_descriptions={ - 'taxonomy': 'Infra-clade ranks are ignored' + 'taxonomy': 'Infra-clade ranks are ignored ' 'unless they are strain-level. Missing internal ranks ' 'are annotated by their next most specific rank, ' 'with the exception of k__Bacteria and k__Archaea which ' @@ -385,13 +392,13 @@ 'diamond_db': ReferenceDB[Diamond], }, parameters={ - 'num_cpus': Int, - 'db_in_memory': Bool, + "num_cpus": Int, + "db_in_memory": Bool, }, input_descriptions={ 'sequences': 'Sequence data of the contigs we want to ' 'search for hits using the Diamond Database', - 'diamond_db': 'The filepath to an artifact containing the' + 'diamond_db': 'The filepath to an artifact containing the ' 'Diamond database', }, parameter_descriptions={ @@ -408,26 +415,148 @@ ], name='Run eggNOG search using diamond aligner', description="This method performs the steps by which we find our " - "possible target sequences to annotate using the diamond " - "search functionality from the eggnog `emapper.py` script", + "possible target sequences to annotate using the diamond " + "search functionality from the eggnog `emapper.py` script", ) plugin.methods.register_function( function=q2_moshpit.eggnog.eggnog_annotate, inputs={ - 'eggnog_hits': SampleData[BLAST6], - 'eggnog_db': ReferenceDB[Eggnog], + "eggnog_hits": SampleData[BLAST6], + "eggnog_db": ReferenceDB[Eggnog], }, parameters={ - 'db_in_memory': Bool, + "db_in_memory": Bool, }, parameter_descriptions={ - 'db_in_memory': 'Read eggnog database into memory. The ' - 'eggnog database is very large(>44GB), so this ' - 'option should only be used on clusters or other ' - 'machines with enough memory.', + "db_in_memory": "Read eggnog database into memory. The " + "eggnog database is very large(>44GB), so this " + "option should only be used on clusters or other " + "machines with enough memory.", }, - outputs=[('ortholog_annotations', FeatureData[NOG])], - name='Annotate orthologs against eggNOG database', + outputs=[("ortholog_annotations", FeatureData[NOG])], + name="Annotate orthologs against eggNOG database", description="Apply eggnog mapper to annotate seed orthologs.", ) + +busco_params = { + "mode": Str % Choices(["genome"]), + "lineage_dataset": Str, + "augustus": Bool, + "augustus_parameters": Str, + "augustus_species": Str, + "auto_lineage": Bool, + "auto_lineage_euk": Bool, + "auto_lineage_prok": Bool, + "cpu": Int % Range(1, None), + "config": Str, + "contig_break": Int % Range(0, None), + "datasets_version": Str, + "download": List[Str], + "download_base_url": Str, + "download_path": Str, + "evalue": Float % Range(0, None, inclusive_start=False), + "force": Bool, + "limit": Int % Range(1, 20), + "help": Bool, + "list_datasets": Bool, + "long": Bool, + "metaeuk_parameters": Str, + "metaeuk_rerun_parameters": Str, + "miniprot": Bool, + "offline": Bool, + "quiet": Bool, + "restart": Bool, + "scaffold_composition": Bool, + "tar": Bool, + "update_data": Bool, + "version": Bool, +} +busco_param_descriptions = { + "mode": "Specify which BUSCO analysis mode to run." + "Currently only the 'genome' or 'geno' option is supported, " + "for genome assemblies. In the future modes for transcriptome " + "assemblies and for annotated gene sets (proteins) will be made " + "available.", + "lineage_dataset": "Specify the name of the BUSCO lineage to be used. " + "To see all possible options run `busco " + "--list-datasets`.", + "augustus": "Use augustus gene predictor for eukaryote runs.", + "augustus_parameters": "Pass additional arguments to Augustus. " + "All arguments should be contained within a single " + "string with no white space, with each argument " + "separated by a comma. " + "Example: '--PARAM1=VALUE1,--PARAM2=VALUE2'.", + "augustus_species": "Specify a species for Augustus training.", + "auto_lineage": "Run auto-lineage to find optimum lineage path.", + "auto_lineage_euk": "Run auto-placement just on eukaryote tree to find " + "optimum lineage path.", + "auto_lineage_prok": "Run auto-lineage just on non-eukaryote trees to " + "find optimum lineage path.", + "cpu": "Specify the number (N=integer) of threads/cores to use.", + "config": "Provide a config file.", + "contig_break": "Number of contiguous Ns to signify a break between " + "contigs. Default is n=10. " + "See https://gitlab.com/ezlab/busco/-/issues/691 for a " + "more detailed explanation.", + "datasets_version": "Specify the version of BUSCO datasets, e.g. odb10.", + "download": "Download dataset. Possible values are a specific dataset " + "name, 'all', 'prokaryota', 'eukaryota', or 'virus'. " + "If used together with other command line arguments, " + "make sure to place this last. Example: '[dataset ...]'.", + "download_base_url": "Set the url to the remote BUSCO dataset location.", + "download_path": "Specify local filepath for storing BUSCO dataset " + "downloads.", + "evalue": "E-value cutoff for BLAST searches. " + "Allowed formats, 0.001 or 1e-03, Default: 1e-03.", + "force": "Force rewriting of existing files. Must be used when output " + "files with the provided name already exist.", + "help": "Show this help message and exit.", + "limit": "How many candidate regions (contig or transcript) to consider " + "per BUSCO. Default: 3.", + "list_datasets": "Print the list of available BUSCO datasets.", + "long": "Optimization Augustus self-training mode (Default: Off); " + "adds considerably to the run time, " + "but can improve results for some non-model organisms.", + "metaeuk_parameters": "Pass additional arguments to Metaeuk for the first " + "run. All arguments should be contained within a " + "single string with no white space, with each " + "argument separated by a comma. " + "Example: `--PARAM1=VALUE1,--PARAM2=VALUE2`.", + "metaeuk_rerun_parameters": "Pass additional arguments to Metaeuk for the " + "second run. All arguments should be " + "contained within a single string with no " + "white space, with each argument separated by " + "a comma. " + "Example: `--PARAM1=VALUE1,--PARAM2=VALUE2`.", + "miniprot": "Use miniprot gene predictor for eukaryote runs.", + "offline": "To indicate that BUSCO cannot attempt to download files.", + "quiet": "Disable the info logs, displays only errors.", + "restart": "Continue a run that had already partially completed.", + "scaffold_composition": "Writes ACGTN content per scaffold to a file " + "`scaffold_composition.txt`.", + "tar": "Compress some subdirectories with many files to save space.", + "update_data": "Download and replace with last versions all lineages " + "datasets and files necessary to their automated " + "selection.", + "version": "Show this version and exit.", +} + + +plugin.visualizers.register_function( + function=q2_moshpit.busco.evaluate_busco, + inputs={ + "bins": SampleData[MAGs], + }, + parameters=busco_params, + input_descriptions={ + "bins": "MAGs to be analyzed.", + }, + parameter_descriptions=busco_param_descriptions, + name="Evaluate quality of the generated MAGs using BUSCO.", + description="This method uses BUSCO " + "(Benchmarking Universal Single-Copy Ortholog assessment " + "tool) to assess the quality of assembled MAGs and generates " + "visualizations summarizing the results.", + citations=[citations["manni_busco_2021"]], +) diff --git a/q2_moshpit/tests/test_utils.py b/q2_moshpit/tests/test_utils.py index a19d2263..77f9f37c 100644 --- a/q2_moshpit/tests/test_utils.py +++ b/q2_moshpit/tests/test_utils.py @@ -22,6 +22,18 @@ def fake_processing_func(key, val): return [_construct_param(key), str(val)] +def fake_processing_func_no_falsy_filtering(key, val): + """ + NOTE: There is a need for a function that does this since + `_process_common_input_params` already filter falsy values. + If a second filter is applied then some parameters are omitted. + """ + if isinstance(val, bool): + return [_construct_param(key)] + else: + return [_construct_param(key), str(val)] + + class TestUtils(TestPluginBase): package = 'q2_moshpit.tests' @@ -61,6 +73,46 @@ def test_process_common_inputs_mix(self): exp = ['--arg2', 'some-value', '--arg4'] self.assertListEqual(obs, exp) + def test_process_common_inputs_mix_with_falsy_values(self): + data = { + "a": 0, + "b": 1, + "c": 0.0, + "d": 3.14, + "e": "", + "f": "Hello", + "g": None, + "h": 42, + "i": 0.0, + "j": [], + "k": "World", + "l": False, + "m": True, + } + observed = _process_common_input_params( + fake_processing_func_no_falsy_filtering, data + ) + expected = [ + "--a", + "0", + "--b", + "1", + "--c", + "0.0", + "--d", + "3.14", + "--f", + "Hello", + "--h", + "42", + "--i", + "0.0", + "--k", + "World", + "--m", + ] + self.assertSetEqual(set(observed), set(expected)) + if __name__ == '__main__': unittest.main() diff --git a/setup.py b/setup.py index 60a9d4db..f132fb4c 100644 --- a/setup.py +++ b/setup.py @@ -25,16 +25,25 @@ ['q2-moshpit=q2_moshpit.plugin_setup:plugin'] }, package_data={ - 'q2_moshpit': ['citations.bib', 'tests/data/*'], + 'q2_moshpit': [ + 'citations.bib', + 'tests/data/*', + "assets/busco/*", + "assets/busco/js/*", + "assets/busco/css/*", + ], 'q2_moshpit.usage_examples': ['tests/data/*'], 'q2_moshpit.metabat2.tests': [ 'data/*', 'data/bins/*/*', 'data/contigs/*', 'data/depth/*', 'data/maps/*', 'data/bins-small/*/*', 'data/bins-no-uuid/*/*' ], - 'q2_moshpit.checkm.tests': [ - 'data/*', 'data/bins/*', 'data/bins/*/*', - 'data/checkm_reports/*/*/*', 'data/plots/*/*/*' + "q2_moshpit.busco.tests": [ + "data/*", + "data/busco_output/*", + "data/busco_output/sample1/*", + "data/busco_output/sample2/*", + "data/busco_output/sample3/*", ], 'q2_moshpit.eggnog': [ 'tests/data/*',