diff --git a/q2_moshpit/assets/busco/index.html b/q2_moshpit/assets/busco/index.html
index be2d478d..aaa0c742 100644
--- a/q2_moshpit/assets/busco/index.html
+++ b/q2_moshpit/assets/busco/index.html
@@ -6,34 +6,16 @@
// temporary hack to make it look good with Bootstrap 5
removeBS3refs();
-
-
-
-
+
+
+
+
{% endblock %} {% block content %}
-
+
-
+
Plot description
@@ -41,52 +23,61 @@
Plot description
samples. "BUSCO attempts to provide a quantitative assessment
of the completeness in terms of the expected gene content of a genome
assembly, transcriptome, or annotated gene set. The results are
- simplified into categories of Complete and single-copy, Complete and
- duplicated, Fragmented, or Missing BUSCOs. BUSCO completeness results
+ simplified into categories of complete and single-copy, complete and
+ duplicated, fragmented, or missing BUSCOs. BUSCO completeness results
make sense only in the context of the biology of your organism". Visit the
-
- BUSCO User Guide
+ BUSCO User Guide
for more information.
-
- Hoover over the graph to obtain information about the lineage dataset
- used for each bin, and the number of genes in each BUSCO category.
-
The right barplot shows assembly statistics calculated for each bin using BBTools.
Specifically, it displays the statistics computed by the stats.sh procedure from BBMap.
View the
-
+
source code and documentation
of stats.sh for more information.
-
- Choose the assembly statistic that you wish to display from the drop-down manu below the graphs.
- Hoover over the graph to show the numerical values that each bar represents.
-
+ To prevent data overload, the samples are grouped into collapsible sections below. You can
+ expand/collapse each section by clicking on the little arrow on the right side of the section header.
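+ Each section will contain a maximum of 100 bins, unless a single sample has more bins than that
+ (the bins of one sample are always kept together in the same section).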
+
+ If you want to find statistics for a specific sample, you can pick your sample from the dropdown below -
+ this will find the appropriate section and expand it.
+
+ You can hover over the bars to obtain information about the lineage dataset used for each bin,
+ and the number of genes in each BUSCO category. Additionally, you can choose the assembly statistic
+ that you wish to display from the dropdown menu included in every section.
+
+
+
+ Find sample:
+
+
+
+
+
+
+
+
@@ -96,43 +87,161 @@
Plot description
- {% if vega_plots_overview is defined %}
-
-
-
+
+
+
+
- {% else %}
-
Unable to generate the completeness plot
- {% endif %}
-{% if vega_plots_overview is defined %}
-
-{% endif %} {% endblock %} {% block footer %} {% set loading_selector =
+{% endblock %} {% block footer %} {% set loading_selector =
'#loading' %} {% include 'js-error-handler.html' %} {% endblock %}
diff --git a/q2_moshpit/busco/busco.py b/q2_moshpit/busco/busco.py
index fbcbae99..d57bc644 100644
--- a/q2_moshpit/busco/busco.py
+++ b/q2_moshpit/busco/busco.py
@@ -5,14 +5,12 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
-
-
import os
import tempfile
-import q2_moshpit.busco.utils
from q2_moshpit.busco.utils import (
- _parse_busco_params,
- _render_html,
+ _parse_busco_params, _render_html, _run_busco,
+ _collect_summaries_and_save, _draw_busco_plots,
+ _zip_busco_plots,
)
from q2_moshpit._utils import _process_common_input_params
from q2_types.per_sample_sequences._format import MultiMAGSequencesDirFmt
@@ -71,7 +69,7 @@ def evaluate_busco(
# Run busco for every sample. Returns dictionary to report files.
# Result NOT included in final output
busco_results_dir = os.path.join(tmp, "busco_output")
- path_to_run_summaries = q2_moshpit.busco.utils._run_busco(
+ path_to_run_summaries = _run_busco(
output_dir=busco_results_dir,
mags=bins,
params=common_args,
@@ -82,7 +80,7 @@ def evaluate_busco(
all_summaries_path = os.path.join(
output_dir, "all_batch_summaries.csv"
)
- all_summaries_df = q2_moshpit.busco.utils._collect_summaries_and_save(
+ all_summaries_df = _collect_summaries_and_save(
all_summaries_path=all_summaries_path,
path_to_run_summaries=path_to_run_summaries,
)
@@ -90,7 +88,7 @@ def evaluate_busco(
# Draw BUSCO plots for all samples
# Result NOT included in final output
plots_dir = os.path.join(tmp, "plots")
- paths_to_plots = q2_moshpit.busco.utils._draw_busco_plots(
+ paths_to_plots = _draw_busco_plots(
path_to_run_summaries=path_to_run_summaries,
plots_dir=plots_dir
)
@@ -98,7 +96,7 @@ def evaluate_busco(
# Zip graphs for user download
# Result included in final output (file for download)
zip_name = os.path.join(output_dir, "busco_plots.zip")
- q2_moshpit.busco.utils._zip_busco_plots(
+ _zip_busco_plots(
paths_to_plots=paths_to_plots,
zip_path=zip_name
)
diff --git a/q2_moshpit/busco/utils.py b/q2_moshpit/busco/utils.py
index d3a1161e..227a27fc 100644
--- a/q2_moshpit/busco/utils.py
+++ b/q2_moshpit/busco/utils.py
@@ -1,3 +1,4 @@
+import json
import os
import q2templates
from shutil import copytree
@@ -44,6 +45,27 @@ def _parse_busco_params(arg_key, arg_val) -> List[str]:
return [f"--{arg_key}", str(arg_val)]
+def _partition_dataframe(df, max_rows):
+    """Split a summary table into partitions made of whole samples.
+
+    Rows are grouped by the ``sample_id`` column and consecutive groups are
+    packed together for as long as the running row count does not exceed
+    ``max_rows``. A sample is never split across partitions, so a single
+    sample with more than ``max_rows`` rows ends up alone in a partition
+    that is larger than ``max_rows``.
+
+    Args:
+        df (pd.DataFrame): Table with a ``sample_id`` column.
+        max_rows (int): Soft limit on the number of rows per partition.
+
+    Returns:
+        list: DataFrames (each with a fresh 0..n-1 index), in the order
+            produced by ``df.groupby('sample_id')``. Empty if ``df`` has
+            no rows.
+    """
+    partitions = []
+    current = []
+    current_rows = 0
+
+    for _, group in df.groupby('sample_id'):
+        # Only close the running partition if it already holds something:
+        # pd.concat() raises ValueError on an empty list, which used to
+        # happen when the very first sample alone exceeded max_rows.
+        if current and current_rows + len(group) > max_rows:
+            partitions.append(pd.concat(current, ignore_index=True))
+            current = []
+            current_rows = 0
+        current.append(group)
+        current_rows += len(group)
+
+    if current:
+        partitions.append(pd.concat(current, ignore_index=True))
+
+    return partitions
+
+
def _draw_busco_plots_for_render(
df: pd.DataFrame,
width: int = None,
@@ -108,6 +130,15 @@ def _draw_busco_plots_for_render(
domain = ["single", "duplicated", "fragmented", "missing"]
range_ = ["#1E90FF", "#87CEFA", "#FFA500", "#FF7F50"]
+    # Build a regex matching exactly the first 10 sample ids; it is left
+    # empty when there are 10 or fewer samples
+ if len(df['sample_id'].unique()) <= 10:
+ default_regex = ""
+ else:
+ default_regex = df['sample_id'].unique()[0:10]
+ default_regex = '$|^'.join(default_regex)
+ default_regex = '^' + default_regex + "$"
+
+ # Make BUSCO bar plots (the plots on the left)
busco_plot = (
alt.Chart(busco_plot_data)
.mark_bar()
@@ -202,7 +233,7 @@ def _draw_busco_plots_for_render(
)
# Return
- return output_plot.to_json()
+ return output_plot.to_dict()
def _run_busco(
@@ -420,17 +451,36 @@ def _render_html(
all_summaries_df (pd.DataFrame): Data frame composed of the individual
run summaries.
"""
- # Prepare context for jinja2 template
- context = {
- "vega_plots_overview": _draw_busco_plots_for_render(
- all_summaries_df,
+ # Partition DataFrame
+ dfs = _partition_dataframe(all_summaries_df, max_rows=100)
+
+ context = {}
+ counter_left = 1
+ for i, df in enumerate(dfs):
+ sample_count = df['sample_id'].nunique()
+ counter_right = counter_left + sample_count - 1
+ sample_counter = {"from": counter_left, "to": counter_right}
+ counter_left += sample_count
+ subcontext = _draw_busco_plots_for_render(
+ df,
width=600,
height=30,
titleFontSize=20,
labelFontSize=17,
spacing=20
- ),
- }
+ )
+ context.update(
+ {f"sample{i}": {
+ "subcontext": subcontext,
+ "sample_counter": sample_counter,
+ "sample_ids": df['sample_id'].unique().tolist(),
+ }}
+ )
+
+ vega_out_fp = os.path.join(output_dir, "vega.json")
+ with open(vega_out_fp, 'w') as json_file:
+ vega_json = json.dumps(context)
+ json_file.write(vega_json)
# Copy BUSCO results from tmp dir to output_dir
moshpit_path = os.path.dirname( # Path to parent dir, q2_moshpit
@@ -445,7 +495,7 @@ def _render_html(
)
# Render
- q2templates.render(index, output_dir, context=context)
+ q2templates.render(index, output_dir, context={"vega_json": vega_json})
# Remove unwanted files
# until Bootstrap 3 is replaced with v5, remove the v3 scripts as