diff --git a/q2_moshpit/assets/busco/index.html b/q2_moshpit/assets/busco/index.html index be2d478d..aaa0c742 100644 --- a/q2_moshpit/assets/busco/index.html +++ b/q2_moshpit/assets/busco/index.html @@ -6,34 +6,16 @@ // temporary hack to make it look good with Bootstrap 5 removeBS3refs(); - - - - + + + + {% endblock %} {% block content %} - +
-
+
Plot description

@@ -41,52 +23,61 @@

Plot description
samples. "BUSCO attempts to provide a quantitative assessment of the completeness in terms of the expected gene content of a genome assembly, transcriptome, or annotated gene set. The results are - simplified into categories of Complete and single-copy, Complete and - duplicated, Fragmented, or Missing BUSCOs. BUSCO completeness results + simplified into categories of complete and single-copy, complete and + duplicated, fragmented, or missing BUSCOs. BUSCO completeness results make sense only in the context of the biology of your organism". Visit the - - BUSCO User Guide + BUSCO User Guide for more information.

-

- Hoover over the graph to obtain information about the lineage dataset - used for each bin, and the number of genes in each BUSCO category. -

The right barplot shows assembly statistics calculated for each bin using BBTools. Specifically, it displays the statistics computed by the stats.sh procedure from BBMap. View the - + source code and documentation of stats.sh for more information.

-

- Choose the assembly statistic that you wish to display from the drop-down manu below the graphs. - Hoover over the graph to show the numerical values that each bar represents. -

+
+ +
+
Plot Controls
+
+

+ To prevent data overload, the samples are grouped into collapsible sections below. You can + expand/collapse each section by clicking on the little arrow on the right side of the section header. + Each section will contain a maximum of 100 bins. +

+ If you want to find statistics for a specific sample, you can pick your sample from the dropdown below - + this will find the appropriate section and expand it. +

+ You can hover over the bars to obtain information about the lineage dataset used for each bin, + and the number of genes in each BUSCO category. Additionally, you can choose the assembly statistic + that you wish to display from the dropdown menu included in every section. +

+
+
+ Find sample: +
+
+ + +
@@ -96,43 +87,161 @@
Plot description
- {% if vega_plots_overview is defined %} -
-
-
+
+
+
+
- {% else %} -

Unable to generate the completeness plot

- {% endif %}
-{% if vega_plots_overview is defined %} - -{% endif %} {% endblock %} {% block footer %} {% set loading_selector = +{% endblock %} {% block footer %} {% set loading_selector = '#loading' %} {% include 'js-error-handler.html' %} {% endblock %} diff --git a/q2_moshpit/busco/busco.py b/q2_moshpit/busco/busco.py index fbcbae99..d57bc644 100644 --- a/q2_moshpit/busco/busco.py +++ b/q2_moshpit/busco/busco.py @@ -5,14 +5,12 @@ # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- - - import os import tempfile -import q2_moshpit.busco.utils from q2_moshpit.busco.utils import ( - _parse_busco_params, - _render_html, + _parse_busco_params, _render_html, _run_busco, + _collect_summaries_and_save, _draw_busco_plots, + _zip_busco_plots, ) from q2_moshpit._utils import _process_common_input_params from q2_types.per_sample_sequences._format import MultiMAGSequencesDirFmt @@ -71,7 +69,7 @@ def evaluate_busco( # Run busco for every sample. Returns dictionary to report files. # Result NOT included in final output busco_results_dir = os.path.join(tmp, "busco_output") - path_to_run_summaries = q2_moshpit.busco.utils._run_busco( + path_to_run_summaries = _run_busco( output_dir=busco_results_dir, mags=bins, params=common_args, @@ -82,7 +80,7 @@ def evaluate_busco( all_summaries_path = os.path.join( output_dir, "all_batch_summaries.csv" ) - all_summaries_df = q2_moshpit.busco.utils._collect_summaries_and_save( + all_summaries_df = _collect_summaries_and_save( all_summaries_path=all_summaries_path, path_to_run_summaries=path_to_run_summaries, ) @@ -90,7 +88,7 @@ def evaluate_busco( # Draw BUSCO plots for all samples # Result NOT included in final output plots_dir = os.path.join(tmp, "plots") - paths_to_plots = q2_moshpit.busco.utils._draw_busco_plots( + paths_to_plots = _draw_busco_plots( path_to_run_summaries=path_to_run_summaries, plots_dir=plots_dir ) @@ -98,7 +96,7 @@ def evaluate_busco( # Zip graphs for user download # Result included in final output (file for download) zip_name = os.path.join(output_dir, "busco_plots.zip") - q2_moshpit.busco.utils._zip_busco_plots( + _zip_busco_plots( paths_to_plots=paths_to_plots, zip_path=zip_name ) diff --git a/q2_moshpit/busco/utils.py b/q2_moshpit/busco/utils.py index d3a1161e..227a27fc 100644 --- a/q2_moshpit/busco/utils.py +++ b/q2_moshpit/busco/utils.py @@ -1,3 +1,4 @@ +import json import os import q2templates from shutil import copytree @@ -44,6 +45,27 @@ def _parse_busco_params(arg_key, arg_val) -> List[str]: return [f"--{arg_key}", str(arg_val)] +def _partition_dataframe(df, max_rows): + groups = [group for _, group in df.groupby('sample_id')] + partitions = [] + temp = [] + total_rows = 0 + + for group in groups: + if total_rows + len(group) > max_rows: + partitions.append(pd.concat(temp, ignore_index=True)) + temp = [group] + total_rows = len(group) + else: + temp.append(group) + total_rows += len(group) + + if temp: + partitions.append(pd.concat(temp, ignore_index=True)) + + return partitions + + def _draw_busco_plots_for_render( df: pd.DataFrame, width: int = None, @@ -108,6 +130,15 @@ def _draw_busco_plots_for_render( domain = ["single", "duplicated", "fragmented", "missing"] range_ = ["#1E90FF", "#87CEFA", "#FFA500", "#FF7F50"] + # Get the first 10 sample ids + if len(df['sample_id'].unique()) <= 10: + default_regex = "" + else: + default_regex = df['sample_id'].unique()[0:10] + default_regex = '$|^'.join(default_regex) + default_regex = '^' + default_regex + "$" + + # Make BUSCO bar plots (the plots on the left) busco_plot = ( alt.Chart(busco_plot_data) .mark_bar() @@ -202,7 +233,7 @@ def _draw_busco_plots_for_render( ) # Return - return output_plot.to_json() + return output_plot.to_dict() def _run_busco( @@ -420,17 +451,36 @@ def _render_html( all_summaries_df (pd.DataFrame): Data frame composed of the individual run summaries. """ - # Prepare context for jinja2 template - context = { - "vega_plots_overview": _draw_busco_plots_for_render( - all_summaries_df, + # Partition DataFrame + dfs = _partition_dataframe(all_summaries_df, max_rows=100) + + context = {} + counter_left = 1 + for i, df in enumerate(dfs): + sample_count = df['sample_id'].nunique() + counter_right = counter_left + sample_count - 1 + sample_counter = {"from": counter_left, "to": counter_right} + counter_left += sample_count + subcontext = _draw_busco_plots_for_render( + df, width=600, height=30, titleFontSize=20, labelFontSize=17, spacing=20 - ), - } + ) + context.update( + {f"sample{i}": { + "subcontext": subcontext, + "sample_counter": sample_counter, + "sample_ids": df['sample_id'].unique().tolist(), + }} + ) + + vega_out_fp = os.path.join(output_dir, "vega.json") + with open(vega_out_fp, 'w') as json_file: + vega_json = json.dumps(context) + json_file.write(vega_json) # Copy BUSCO results from tmp dir to output_dir moshpit_path = os.path.dirname( # Path to parent dir, q2_moshpit @@ -445,7 +495,7 @@ def _render_html( ) # Render - q2templates.render(index, output_dir, context=context) + q2templates.render(index, output_dir, context={"vega_json": vega_json}) # Remove unwanted files # until Bootstrap 3 is replaced with v5, remove the v3 scripts as