From 245839ada8ea43801e401308cfa0f862e9c0760d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Le=C3=B3n-Perin=C3=A1n?= Date: Fri, 15 Sep 2023 22:00:32 +0200 Subject: [PATCH 01/20] added python report generation (automated report) --- spacemake/report/__init__.py | 0 spacemake/report/classes/__init__.py | 0 .../report/classes/automated_analysis.py | 468 ++++++++++++++++++ spacemake/snakemake/main.smk | 34 +- 4 files changed, 489 insertions(+), 13 deletions(-) create mode 100644 spacemake/report/__init__.py create mode 100644 spacemake/report/classes/__init__.py create mode 100644 spacemake/report/classes/automated_analysis.py diff --git a/spacemake/report/__init__.py b/spacemake/report/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/spacemake/report/classes/__init__.py b/spacemake/report/classes/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/spacemake/report/classes/automated_analysis.py b/spacemake/report/classes/automated_analysis.py new file mode 100644 index 00000000..3a267c18 --- /dev/null +++ b/spacemake/report/classes/automated_analysis.py @@ -0,0 +1,468 @@ +import base64 +import io +import logging +import os + +import anndata as ad +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import scanpy as sc +from jinja2 import Template +from scipy.sparse import csr_matrix +from spacemake.config import ConfigFile +from spacemake.project_df import ProjectDF +from spacemake.util import message_aggregation + +absolute_path = os.path.dirname(__file__) + +cpalette = { + "grey": "#999999", + "light_orange": "#E69F00", + "light_blue": "#56B4E9", + "green": "#009E73", + "yellow": "#F0E442", + "blue": "#0072B2", + "orange": "#D55E00", + "pink": "#CC79A7", +} + +clrs = { + "umis": cpalette["light_orange"], + "genes": cpalette["light_blue"], + "reads": cpalette["green"], + "pcr": cpalette["pink"], + "pct_counts_mt": "black", +} + + +logger_name = "spacemake.snakemake.scripts.n_intersect_sequences" +logger = logging.getLogger(logger_name) + + +def setup_parser(parser): + """ + Set up command-line arguments for the script. + + :param parser: Argument parser object. + :type parser: argparse.ArgumentParser + :returns: Updated argument parser object. + :rtype: argparse.ArgumentParser + """ + # These allow to get the run_mode_variables.* from the config file + # and pbf_metrics.px_by_um via project_df.puck_barcode_file_metrics + parser.add_argument( + "--project-id", + type=str, + help="the project_id in spacemake project_df.csv", + required=True, + ) + parser.add_argument( + "--sample-id", + type=str, + help="the sample_id in spacemake project_df.csv", + required=True, + ) + parser.add_argument( + "--run-mode", + type=str, + help="one of the run_mode from the project+sample in project_df.csv", + required=True, + ) + parser.add_argument( + "--puck-barcode-file-id-qc", + type=str, + help="a path to the puck_barcode_file_id_qc", + required=True, + ) + # These have the paths to the input files used for generating plots + parser.add_argument( + "--obs-df", + type=str, + help="path to the csv file that stores properties per observation (per cell)", + required=True, + ) + parser.add_argument( + "--var-df", + type=str, + help="path to the csv file that stores properties per variable (per gene)", + required=True, + ) + parser.add_argument( + "--neighborhood-enrichment", + type=str, + help="path to the csv file with the neighborhood enrichment results", + required=False, + default="", + ) + # These additionally configure the plotting functionality + parser.add_argument( + "--is-spatial", + type=str, + help="Whether the current sample is spatial or not", + required=True, + choices=["True", "False"], + default="False", + ) + # This specifies the UMI filter for the current report + parser.add_argument( + "--umi-cutoff", + type=str, + help="one of the UMI cutoffs from the --run-mode", + required=True, + ) + # This specifies where the output file will be generated + parser.add_argument( + "--out-html-report", + type=str, + help="where the HTML report will be saved", + required=True, + ) + + return parser + + +def plot_metric(values, axis, nbins=100, color="#000000"): + # decide linear or logarithmic scale + min_difference = values.max() - values.min() + + hist, bins = np.histogram(values, bins=nbins) + if np.abs(min_difference) < 100: + axis.bar(bins[:-1], hist, color=color) + + else: + logbins = np.logspace(np.log10(bins[0] + 1), np.log10(bins[-1]), nbins) + axis.hist(values, bins=logbins, color=color) + axis.set_xscale("log") + + axis.spines[["right", "top"]].set_visible(False) + + +def plot_to_base64(fig): + my_stringIObytes = io.BytesIO() + fig.savefig(my_stringIObytes, format="jpg") + my_stringIObytes.seek(0) + return base64.b64encode(my_stringIObytes.read()).decode() + + +def generate_automated_analysis_metadata( + project_id: str, + sample_id: str, + run_mode: str, + puck_barcode_file_id_qc: str, + obs_df_path: str, + var_df_path: str, + neighborhood_enrichment: str, + is_spatial: bool = False, + umi_cutoff: int = 0, +): + report = { + "table_overview": [], + "data_complete": False, + "n_cells": 0, + "umi_cutoff": 0, + "n_genes": 0, + "analyzed_clustering_resolutions": [], + "is_spatial": False, + "plots": [], + } + main_plots = { + "histogram_metrics_spatial_units": "", + "umi_spatial_unit_original": "", + "umi_spatial_unit_scaled": "", + } + + # Loading data + obs_df = pd.read_csv(obs_df_path) + var_df = pd.read_csv(var_df_path) + + # Loading project_df for metadata + config = ConfigFile.from_yaml("config.yaml") + project_df = ProjectDF("project_df.csv", config=config) + + # Create anndata for scanpy plotting + empty_ad = csr_matrix((len(obs_df), len(var_df)), dtype=int) + adata = ad.AnnData(empty_ad) + adata.obs = obs_df + adata.var = var_df + adata.obsm["X_umap"] = adata.obs[["umap_0", "umap_1"]].values + + report["n_genes"] = len(var_df) + report["n_cells"] = len(obs_df) + + report["umi_cutoff"] = umi_cutoff + report["data_complete"] = True if np.any(obs_df.columns.str.startswith("leiden")) else False + report["is_spatial"] = is_spatial + + report["plots"] = main_plots + + # Plot Histogram of metrics over spatial units + fig, axes = plt.subplots(3, 2, figsize=(7, 6)) + plot_metric(obs_df["n_reads"], axes[0, 0], 100, clrs["reads"]) + axes[0, 0].set_ylabel("# of\nspatial units") + axes[0, 0].set_xlabel("# of reads") + plot_metric(obs_df["reads_per_counts"], axes[0, 1], 100, clrs["pcr"]) + axes[0, 1].set_ylabel("# of\nspatial units") + axes[0, 1].set_xlabel("# of reads/UMI") + plot_metric(obs_df["n_genes_by_counts"], axes[1, 0], 100, clrs["genes"]) + axes[1, 0].set_ylabel("# of\nspatial units") + axes[1, 0].set_xlabel("# of genes") + plot_metric(obs_df["total_counts"], axes[1, 1], 100, clrs["umis"]) + axes[1, 1].set_ylabel("# of\nspatial units") + axes[1, 1].set_xlabel("# of UMIs") + plot_metric(obs_df["pct_counts_mt"], axes[2, 0], 100, clrs["pct_counts_mt"]) + axes[2, 0].set_ylabel("# of\nspatial units") + axes[2, 0].set_xlabel("% of mitochondrial counts") + axes[2, 1].axis("off") + plt.tight_layout() + main_plots["histogram_metrics_spatial_units"] = plot_to_base64(fig) + + # Variables for spatial + px_by_um, puck_bead_size = 1, 1 + x_breaks, y_breaks = None, None + x_mm_breaks, y_mm_breaks = None, None + puck_width_um = 1 + + # Distribution of UMIs in 2D + if is_spatial: + # Loading metadata from spatial pucks and run mode + puck_metrics = ( + project_df.get_puck_barcode_file_metrics( + project_id=project_id, + sample_id=sample_id, + puck_barcode_file_id=puck_barcode_file_id_qc, + ), + )[0] + + puck_settings = project_df.get_puck_variables( + project_id, sample_id, return_empty=True + ) + + run_mode_vars = project_df.config.get_run_mode(run_mode).variables + + print(puck_metrics) + print(puck_settings) + print(run_mode_vars) + + px_by_um = puck_metrics["px_by_um"] + mesh_spot_diameter_um = run_mode_vars["mesh_spot_diameter_um"] + meshed = run_mode_vars["mesh_data"] + spot_diameter_um = puck_settings["spot_diameter_um"] + + adata.obsm["spatial"] = adata.obs[["x_pos", "y_pos"]].values + + # Set limits and axes units for the spatial plots + x_limits = adata.obsm["spatial"][:, 0].min(), adata.obsm["spatial"][:, 0].max() + y_limits = adata.obsm["spatial"][:, 1].min(), adata.obsm["spatial"][:, 1].max() + puck_width_um = (x_limits[1] - x_limits[0]) / px_by_um + + ratio = (x_limits[1] - x_limits[0]) / (y_limits[1] - y_limits[0]) + + scale_factor = 2 if puck_width_um < 3000 else 3 + mm_dist = max(10**scale_factor, round(puck_width_um / 3, scale_factor)) + mm_diff = mm_dist / 1000 + + def_plot_bead_size = 0.5 if report["n_cells"] > 5000 else 0.75 + def_plot_bead_size = 0.1 if report["n_cells"] > 10000 else def_plot_bead_size + def_plot_bead_size = 0.05 if report["n_cells"] > 25000 else def_plot_bead_size + + puck_bead_size = max( + def_plot_bead_size, mesh_spot_diameter_um if meshed else spot_diameter_um + ) + x_mm_breaks = np.arange(0, puck_width_um, mm_dist) + x_mm_breaks = [f"{round(i, 1)} mm" for i in x_mm_breaks * mm_diff / mm_dist] + y_mm_breaks = np.arange(0, puck_width_um / ratio, mm_dist) + y_mm_breaks = [f"{round(i, 1)} mm" for i in y_mm_breaks * mm_diff / mm_dist] + + x_breaks = np.arange(x_limits[0], x_limits[1], px_by_um * mm_dist) + y_breaks = np.arange(y_limits[0], y_limits[1], px_by_um * mm_dist) + + # Plot spatial UMI (original) + fig, axes = plt.subplots(1, 1, figsize=(5, 5)) + sc.pl.spatial( + adata, + img_key=None, + size=puck_bead_size * 1.5, + spot_size=px_by_um, + color="total_counts", + ax=axes, + show=False, + cmap="inferno", + ) + axes.spines[["right", "top"]].set_visible(False) + axes.set_xticks(x_breaks) + axes.set_xticklabels(x_mm_breaks) + axes.set_yticks(y_breaks) + axes.set_yticklabels(y_mm_breaks) + axes.set_ylabel("") + axes.set_xlabel("") + + main_plots["umi_spatial_unit_original"] = plot_to_base64(fig) + + # Plot spatial UMI (rescaled) + + fig, axes = plt.subplots(1, 1, figsize=(5, 5)) + sc.pl.spatial( + adata, + img_key=None, + size=puck_bead_size * 1.5, + spot_size=px_by_um, + color="total_counts", + ax=axes, + show=False, + cmap="inferno", + vmax=np.quantile(adata.obs["total_counts"], 0.9), + ) + axes.spines[["right", "top"]].set_visible(False) + axes.set_xticks(x_breaks) + axes.set_xticklabels(x_mm_breaks) + axes.set_yticks(y_breaks) + axes.set_yticklabels(y_mm_breaks) + axes.set_ylabel("") + axes.set_xlabel("") + + main_plots["umi_spatial_unit_scaled"] = plot_to_base64(fig) + + if report["data_complete"]: + resolutions = np.unique( + pd.Series(obs_df.columns[obs_df.columns.str.startswith("leiden")]) + .apply(lambda x: x.split("_")[1]) + .values + ) + for i, resolution in enumerate(resolutions): + _current_analyzed_clustering_resolution = { + "resolution": "", + "resolution_id": "", + "plots": {"umap": "", "spatial": "", "neighbor_enrichment": ""}, + } + + _current_analyzed_clustering_resolution['resolution'] = f'{resolution} resolution' + _current_analyzed_clustering_resolution['resolution_id'] = f'rid_{i}' + + # Plot UMAP for current resolution + adata.obs[f"leiden_{resolution}"] = pd.Categorical( + adata.obs[f"leiden_{resolution}"] + ) + fig, axes = plt.subplots(1, 1, figsize=(5, 5)) + sc.pl.umap(adata, color=f"leiden_{resolution}", ax=axes, show=False) + axes.spines[["right", "top", "left", "bottom"]].set_visible(False) + axes.set_ylabel("UMAP 0") + axes.set_xlabel("UMAP 1") + _current_analyzed_clustering_resolution["plots"]["umap"] = plot_to_base64( + fig + ) + + # Spatial plot and neighborhood enrichment (if spatial) + if is_spatial: + # Plot Spatial clusters + fig, axes = plt.subplots(1, 1, figsize=(5, 5)) + sc.pl.spatial( + adata, + img_key=None, + size=puck_bead_size * 1.5, + spot_size=px_by_um, + color=f"leiden_{resolution}", + ax=axes, + show=False, + ) # marker can be "o" or "h" if is hexagonal + axes.spines[["right", "top"]].set_visible(False) + axes.set_xticks(x_breaks) + axes.set_xticklabels(x_mm_breaks) + axes.set_yticks(y_breaks) + axes.set_yticklabels(y_mm_breaks) + axes.set_ylabel("") + axes.set_xlabel("") + _current_analyzed_clustering_resolution["plots"][ + "spatial" + ] = plot_to_base64(fig) + + # Plot Neighborhood enrichment + nhood_enrich = pd.read_csv(neighborhood_enrichment) + nhood_enrich_current_res = nhood_enrich[ + nhood_enrich["resolution"].astype(str) == str(resolution) + ] + ne_data = np.zeros( + ( + nhood_enrich_current_res["cluster_a"].max() + 1, + nhood_enrich_current_res["cluster_b"].max() + 1, + ) + ) + ne_data[ + nhood_enrich_current_res["cluster_a"], + nhood_enrich_current_res["cluster_b"], + ] = nhood_enrich_current_res["zscore"] + + fig, axes = plt.subplots(1, 1, figsize=(4, 4)) + plmat = axes.matshow( + ne_data, cmap="inferno", vmin=-50, vmax=100, origin="lower" + ) + cbar = plt.colorbar(plmat, fraction=0.046) + cbar.set_label("Neighbor enrichment score") + axes.set_xlabel("cluster identity") + axes.set_ylabel("cluster identity") + axes.spines[["right", "top"]].set_visible(False) + axes.xaxis.tick_bottom() + plt.tight_layout() + _current_analyzed_clustering_resolution["plots"][ + "neighbor_enrichment" + ] = plot_to_base64(fig) + + report["analyzed_clustering_resolutions"].append( + _current_analyzed_clustering_resolution.copy() + ) + + report["table_overview"] = [ + {"name": "UMI filter", "value": umi_cutoff}, + {"name": "# genes in data", "value": report["n_genes"]}, + {"name": "# of spots in data", "value": report["n_cells"]}, + {"name": "median UMI", "value": obs_df["total_counts"].median()}, + {"name": "median genes", "value": obs_df["n_genes_by_counts"].median()}, + {"name": "puck width (um)", "value": puck_width_um}, + ] + + return report + + +def generate_html_report(data, template_file): + with open(template_file, "r") as template_data: + template_content = template_data.read() + template = Template(template_content) + + html_report = template.render(report=data) + + return html_report + + +@message_aggregation(logger_name) +def cmdline(): + """cmdline.""" + import argparse + + parser = argparse.ArgumentParser( + allow_abbrev=False, + description="generate spacemake's 'automated analysis' with python", + ) + parser = setup_parser(parser) + + args = parser.parse_args() + template_file = os.path.join(absolute_path, "../templates/automated_analysis.html") + + report_metadata = generate_automated_analysis_metadata( + args.project_id, + args.sample_id, + args.run_mode, + args.puck_barcode_file_id_qc, + args.obs_df, + args.var_df, + args.neighborhood_enrichment, + args.is_spatial, + args.umi_cutoff, + ) + html_report = generate_html_report(report_metadata, template_file) + + with open(args.out_html_report, "w") as output: + output.write(html_report) + + +if __name__ == "__main__": + cmdline() \ No newline at end of file diff --git a/spacemake/snakemake/main.smk b/spacemake/snakemake/main.smk index 0b2fa5be..52bf30ba 100644 --- a/spacemake/snakemake/main.smk +++ b/spacemake/snakemake/main.smk @@ -650,7 +650,7 @@ rule create_automated_analysis_processed_data_files: puck_barcode_file_id=wildcards.puck_barcode_file_id_qc), script: 'scripts/automated_analysis_create_processed_data_files.py' - + rule create_automated_report: input: **automated_analysis_processed_data_files, @@ -658,21 +658,29 @@ rule create_automated_report: output: automated_report params: - run_mode_variables = lambda wildcards: - project_df.config.get_run_mode(wildcards.run_mode).variables, - puck_variables = lambda wildcards: - project_df.get_puck_variables(wildcards.project_id, wildcards.sample_id, - return_empty=True), - pbf_metrics = lambda wildcards: project_df.get_puck_barcode_file_metrics( - project_id = wildcards.project_id, - sample_id = wildcards.sample_id, - puck_barcode_file_id = wildcards.puck_barcode_file_id_qc), is_spatial = lambda wildcards: project_df.is_spatial(wildcards.project_id, wildcards.sample_id, puck_barcode_file_id=wildcards.puck_barcode_file_id_qc), - r_shared_scripts= repo_dir + '/scripts/shared_functions.R' - script: - 'scripts/automated_analysis_create_report.Rmd' + run: + from spacemake.report.classes.automated_analysis import generate_automat + + template_file = os.path.join(spacemake_dir, "report/templates/automated_analysis.html") + + report_metadata = generate_automated_analysis_metadata( + wildcards.project_id, + wildcards.sample_id, + wildcards.run_mode, + wildcards.puck_barcode_file_id_qc, + input['obs_df']', + input['var_df']', + input['nhood_enrichment'], + params['is_spatial'], + wildcards.umi_cutoff, + ) + html_report = generate_html_report(report_metadata, template_file) + + with open(output[0], "w") as output: + output.write(html_report) rule split_final_bam: input: From f70efaa79ee5ac540cf81137f3efe394af08b87b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Le=C3=B3n-Perin=C3=A1n?= Date: Sun, 17 Sep 2023 19:43:02 +0200 Subject: [PATCH 02/20] update automated analysis logger --- spacemake/report/classes/automated_analysis.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/spacemake/report/classes/automated_analysis.py b/spacemake/report/classes/automated_analysis.py index 3a267c18..91adfcb3 100644 --- a/spacemake/report/classes/automated_analysis.py +++ b/spacemake/report/classes/automated_analysis.py @@ -36,7 +36,7 @@ } -logger_name = "spacemake.snakemake.scripts.n_intersect_sequences" +logger_name = "spacemake.report.automated_analysis" logger = logging.getLogger(logger_name) @@ -343,11 +343,12 @@ def generate_automated_analysis_metadata( adata.obs[f"leiden_{resolution}"] = pd.Categorical( adata.obs[f"leiden_{resolution}"] ) - fig, axes = plt.subplots(1, 1, figsize=(5, 5)) + fig, axes = plt.subplots(1, 1, figsize=(8, 5)) sc.pl.umap(adata, color=f"leiden_{resolution}", ax=axes, show=False) axes.spines[["right", "top", "left", "bottom"]].set_visible(False) axes.set_ylabel("UMAP 0") axes.set_xlabel("UMAP 1") + plt.tight_layout() _current_analyzed_clustering_resolution["plots"]["umap"] = plot_to_base64( fig ) @@ -355,7 +356,7 @@ def generate_automated_analysis_metadata( # Spatial plot and neighborhood enrichment (if spatial) if is_spatial: # Plot Spatial clusters - fig, axes = plt.subplots(1, 1, figsize=(5, 5)) + fig, axes = plt.subplots(1, 1, figsize=(8, 5)) sc.pl.spatial( adata, img_key=None, @@ -372,6 +373,7 @@ def generate_automated_analysis_metadata( axes.set_yticklabels(y_mm_breaks) axes.set_ylabel("") axes.set_xlabel("") + plt.tight_layout() _current_analyzed_clustering_resolution["plots"][ "spatial" ] = plot_to_base64(fig) From 02cfa10552714067612f7d9074a578af52f36f4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Le=C3=B3n-Perin=C3=A1n?= Date: Sun, 17 Sep 2023 19:43:37 +0200 Subject: [PATCH 03/20] add qc_sequencing in python --- spacemake/report/classes/qc_sequencing.py | 829 ++++++++++++++++++++++ 1 file changed, 829 insertions(+) create mode 100644 spacemake/report/classes/qc_sequencing.py diff --git a/spacemake/report/classes/qc_sequencing.py b/spacemake/report/classes/qc_sequencing.py new file mode 100644 index 00000000..feb2e3ed --- /dev/null +++ b/spacemake/report/classes/qc_sequencing.py @@ -0,0 +1,829 @@ +import base64 +import io +import logging +import os +from typing import Union + +import anndata as ad +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import scanpy as sc +from jinja2 import Template +from scipy.sparse import csr_matrix +from spacemake.config import ConfigFile +from spacemake.project_df import ProjectDF +from spacemake.util import message_aggregation + +absolute_path = os.path.dirname(__file__) + +cpalette = { + "grey": "#999999", + "light_orange": "#E69F00", + "light_blue": "#56B4E9", + "green": "#009E73", + "yellow": "#F0E442", + "blue": "#0072B2", + "orange": "#D55E00", + "pink": "#CC79A7", +} + +clrs = { + "umis": cpalette["light_orange"], + "genes": cpalette["light_blue"], + "reads": cpalette["green"], + "pcr": cpalette["pink"], + "pct_counts_mt": "black", +} + +SAMPLEINFO_VARS = [ + "species", + "sequencing_date", + "investigator", + "experiment", +] +SPATIAL_METRICS = [ + "n_genes_by_counts", + "total_counts", + "pct_counts_mt", + "n_reads", + "reads_per_counts", + "n_joined", + "exact_entropy", + "exact_compression", +] +SPATIAL_METRICS_TITLES = { + "n_genes_by_counts": "# of genes per spatial unit", + "total_counts": "# of UMIs per spatial unit", + "pct_counts_mt": "# % mt counts per spatial unit", + "n_reads": "# of reads per spatial unit", + "reads_per_counts": "reads/UMI per spatial unit", + "n_joined": "# beads joined per spatial unit", + "exact_entropy": "Shannon entropy per spatial unit", + "exact_compression": "barcode length after compression per spatial unit", +} + +logger_name = "spacemake.report.qc_sequencing" +logger = logging.getLogger(logger_name) + +# TODO: check the IDs for the plots grouped by tabs +# TODO: add the nucleotide quartiles for the non-meshed modes +# TODO: add legend to the entropy plots + +# How to run, example: +# python /home/dleonpe/spacemake_project/repos/spacemake/spacemake/report/classes/qc_sequencing.py \ +# --project-id fc_sts_76 --sample-id fc_sts_76_3 --run-modes fc_sts_novaseq_SP_mesh_7um \ +# --dge-summary-paths /home/dleonpe/spacemake_project/data/1_spacemake_mouse_new/projects/fc_sts_76/processed_data/fc_sts_76_3/illumina/complete_data/dge/dge.all.polyA_adapter_trimmed.mm_included.spatial_beads.mesh_7_hexagon_fc_010_L3_tile_2157.obs.csv \ +# --complete-data-root /home/dleonpe/spacemake_project/data/1_spacemake_mouse_new/projects/fc_sts_76/processed_data/fc_sts_76_3/illumina/complete_data \ +# --split-reads-read-type /home/dleonpe/spacemake_project/data/1_spacemake_mouse_new/projects/fc_sts_76/processed_data/fc_sts_76_3/illumina/complete_data/split_reads.polyA_adapter_trimmed/read_type_num.txt \ +# --is-spatial True --out-html-report /home/dleonpe/spacemake_project/repos/spacemake/spacemake/report/classes/fc_sts_76_3_qc_sequencing_new.html \ +# --puck-barcode-file-id-qc fc_010_L3_tile_2157 + + +def setup_parser(parser): + """ + Set up command-line arguments for the script. + + :param parser: Argument parser object. + :type parser: argparse.ArgumentParser + :returns: Updated argument parser object. + :rtype: argparse.ArgumentParser + """ + # These allow to get the run_mode_variables.* from the config file + # and pbf_metrics.px_by_um via project_df.puck_barcode_file_metrics + parser.add_argument( + "--project-id", + type=str, + help="the project_id in spacemake project_df.csv", + required=True, + ) + parser.add_argument( + "--sample-id", + type=str, + help="the sample_id in spacemake project_df.csv", + required=True, + ) + parser.add_argument( + "--puck-barcode-file-id-qc", + type=str, + help="a path to the puck_barcode_file_id_qc", + required=True, + ) + # These have the paths to the input files used for generating plots + parser.add_argument( + "--dge-summary-paths", + type=str, + nargs="+", + help="paths to the dge summary files, must be in same number and order as --run-modes", + required=True, + ) + parser.add_argument( + "--run-modes", + type=str, + nargs="+", + help="run modes of the sample for which the report will be generated", + required=False, + default=None, + ) + parser.add_argument( + "--complete-data-root", + type=str, + help="path to where the sample data is stored", + required=True, + ) + parser.add_argument( + "--split-reads-read-type", + type=str, + nargs="+", + help="path to the 'read_type_num' file(s) generated by read annotator. One per mapping strategy.", + required=True, + ) + # These additionally configure the plotting functionality + parser.add_argument( + "--is-spatial", + type=str, + help="Whether the current sample is spatial or not", + required=True, + choices=["True", "False"], + default="False", + ) + # This specifies where the output file will be generated + parser.add_argument( + "--out-html-report", + type=str, + help="where the HTML report will be saved", + required=True, + ) + + return parser + + +def reverse_readline(filename, buf_size=8192): + """A generator that returns the lines of a file in reverse order""" + with open(filename, "rb") as fh: + segment = None + offset = 0 + fh.seek(0, os.SEEK_END) + file_size = remaining_size = fh.tell() + while remaining_size > 0: + offset = min(file_size, offset + buf_size) + fh.seek(file_size - offset) + buffer = fh.read(min(remaining_size, buf_size)).decode(encoding="utf-8") + remaining_size -= buf_size + lines = buffer.split("\n") + # The first line of the buffer is probably not a complete line so + # we'll save it and append it to the last line of the next buffer + # we read + if segment is not None: + # If the previous chunk starts right from the beginning of line + # do not concat the segment to the last line of new chunk. + # Instead, yield the segment first + if buffer[-1] != "\n": + lines[-1] += segment + else: + yield segment + segment = lines[0] + for index in range(len(lines) - 1, 0, -1): + if lines[index]: + yield lines[index] + # Don't yield None if the file was empty + if segment is not None: + yield segment + + +def read_star_log_file(log_file): + file = os.path.join(log_file) + + if not os.path.isfile(file): + return (1, 1) + + star_stats = { + "input_reads": 0, + "uniq_mapped_reads": 0, + "multi_mapped_reads": 0, + "too_many_mapped_reads": 0, + "too_many_mapped_reads": 0, + "unmapped_too_short": 0, + "unmapped_other": 0, + "chimeric": 0, + } + + log_name_stat = { + "Number of input reads": "input_reads", + "Uniquely mapped reads number": "uniq_mapped_reads", + "Number of reads mapped to multiple loci": "multi_mapped_reads", + "Number of reads mapped to too many loci": "too_many_mapped_reads", + "Number of reads unmapped: too many mismatches": "unmapped_mismatches", + "Number of reads unmapped: too short": "unmapped_too_short", + "Number of reads unmapped: other": "unmapped_other", + "Number of chimeric reads": "chimeric", + "Average mapped length": "avg_mapped_length" + } + + with open(file) as fi: + idx = 0 + for line in fi: + _log_id = line.strip().split("|")[0].strip() + if _log_id in log_name_stat.keys(): + if _log_id == "Average mapped length": + star_stats[log_name_stat[_log_id]] = line.strip().split("|")[1] + else: + star_stats[log_name_stat[_log_id]] = round(int(line.strip().split("|")[1])/1e6, 2) + # Do not convert to millions the Average mapped length + + idx += 1 + + return star_stats + + +def read_split_reads_file(split_reads_file): + file = os.path.join(split_reads_file) + + read_types = dict() + with open(file) as fi: + for line in fi: + line = line.strip().split(" ") + read_types[line[0]] = round(int(line[1])/1e6, 2) + + return read_types + + +def read_bowtie_log_file(log_file): + file = os.path.join(log_file) + bowtie_stats = { + "input_reads": 0, + "unique_aligned": 0, + "multimapper": 0, + "unaligned": 0, + } + log_line_stat = { + 5: "input_reads", + 3: "unaligned", + 2: "unique_aligned", + 1: "multimapper", + } + max_line = max(log_line_stat.keys()) + + idx = 0 + for line in reverse_readline(file): + if idx in log_line_stat.keys(): + bowtie_stats[log_line_stat[idx]] = round(int(line.strip().split(" ")[0])/1e6, 2) + idx += 1 + if max_line < idx: + break + + return bowtie_stats + + +def plot_metric(values, axis, nbins=100, color="#000000"): + # decide linear or logarithmic scale + min_difference = values.max() - values.min() + + hist, bins = np.histogram(values, bins=nbins) + if np.abs(min_difference) < 100: + axis.bar(bins[:-1], hist, color=color) + + else: + logbins = np.logspace(np.log10(bins[0] + 1), np.log10(bins[-1]), nbins) + axis.hist(values, bins=logbins, color=color) + axis.set_xscale("log") + + axis.spines[["right", "top"]].set_visible(False) + + +def plot_umi_cutoff(obs_df): + umi_cutoffs = np.arange(10, 20000, 10) + + def summarise_dge_summary(df, umi_cutoff): + df_filter = df[df["total_counts"] > umi_cutoff] + df_filter["n_beads"] = len(df_filter) + df_summary = df_filter[ + [ + "n_reads", + "total_counts", + "n_genes_by_counts", + "reads_per_counts", + "n_beads", + ] + ].median() + + return df_summary + + umi_cutoff_data = pd.DataFrame( + np.vstack( + [ + summarise_dge_summary(obs_df, umi_cutoff).values + for umi_cutoff in umi_cutoffs + ] + ), + columns=[ + "n_reads", + "total_counts", + "n_genes_by_counts", + "reads_per_counts", + "n_beads", + ], + ) + umi_cutoff_data["umi_cutoffs"] = umi_cutoffs + + fig, axes = plt.subplots(2, 3, figsize=(8, 4)) + axes[0, 0].plot( + umi_cutoff_data["umi_cutoffs"], + umi_cutoff_data["n_beads"], + color="black", + linewidth=1, + ) + axes[0, 0].set_ylabel("number of\nspatial units") + axes[0, 0].set_xlabel("minimum UMI") + axes[0, 0].set_yscale("log") + axes[0, 0].set_xscale("log") + axes[0, 0].spines[["right", "top"]].set_visible(False) + + axes[0, 1].plot( + umi_cutoff_data["umi_cutoffs"], + umi_cutoff_data["n_reads"], + color=clrs["reads"], + linewidth=1, + ) + axes[0, 1].set_ylabel("median reads\nper spatial unit") + axes[0, 1].set_xlabel("minimum UMI") + axes[0, 1].set_xscale("log") + axes[0, 1].spines[["right", "top"]].set_visible(False) + + axes[0, 2].plot( + umi_cutoff_data["umi_cutoffs"], + umi_cutoff_data["n_genes_by_counts"], + color=clrs["genes"], + linewidth=1, + ) + axes[0, 2].set_ylabel("median genes\nper spatial unit") + axes[0, 2].set_xlabel("minimum UMI") + axes[0, 2].set_xscale("log") + axes[0, 2].spines[["right", "top"]].set_visible(False) + + axes[1, 0].plot( + umi_cutoff_data["umi_cutoffs"], + umi_cutoff_data["total_counts"], + color=clrs["umis"], + linewidth=1, + ) + axes[1, 0].set_ylabel("median UMIs\nper spatial unit") + axes[1, 0].set_xlabel("minimum UMI") + axes[1, 0].set_xscale("log") + axes[1, 0].spines[["right", "top"]].set_visible(False) + + axes[1, 1].plot( + umi_cutoff_data["umi_cutoffs"], + umi_cutoff_data["reads_per_counts"], + color=clrs["pcr"], + linewidth=1, + ) + axes[1, 1].set_ylabel("median reads/UMI\nper spatial unit") + axes[1, 1].set_xlabel("minimum UMI") + axes[1, 1].set_xscale("log") + axes[1, 1].spines[["right", "top"]].set_visible(False) + axes[1, 2].axis("off") + + plt.tight_layout() + return fig, axes + + +def plot_histogram_beads(obs_df): + # One for each run mode + fig, axes = plt.subplots(2, 2, figsize=(7, 3.5)) + plot_metric(obs_df["n_reads"], axes[0, 0], 100, clrs["reads"]) + axes[0, 0].set_ylabel("# of\nspatial units") + axes[0, 0].set_xlabel("# of reads") + + reads_per_counts = obs_df["reads_per_counts"] + reads_per_counts = np.nan_to_num(reads_per_counts) + plot_metric(reads_per_counts, axes[0, 1], 100, clrs["pcr"]) + axes[0, 1].set_ylabel("# of\nspatial units") + axes[0, 1].set_xlabel("# of reads/UMI") + + plot_metric(obs_df["n_genes_by_counts"], axes[1, 0], 100, clrs["genes"]) + axes[1, 0].set_ylabel("# of\nspatial units") + axes[1, 0].set_xlabel("# of genes") + + plot_metric(obs_df["total_counts"], axes[1, 1], 100, clrs["umis"]) + axes[1, 1].set_ylabel("# of\nspatial units") + axes[1, 1].set_xlabel("# of UMIs") + + plt.tight_layout() + return fig, axes + + +def plot_entropy_compression(obs_df, nbins=30): + fig, axes = plt.subplots(2, 1, figsize=(7, 4)) + axes[0].hist( + obs_df["theoretical_entropy"].values, + color=cpalette["grey"], + bins=nbins, + edgecolor="black", + label="Theoretical", + ) + axes[0].hist( + obs_df["exact_entropy"].values, + color=cpalette["orange"], + bins=nbins, + edgecolor="black", + alpha=0.7, + label="Observed", + ) + axes[0].set_xlabel("Shannon entropy of barcodes") + axes[0].set_ylabel("# of barcodes") + axes[0].spines[["right", "top"]].set_visible(False) + + axes[1].hist( + obs_df["theoretical_compression"].values, + color=cpalette["grey"], + bins=nbins, + edgecolor="black", + label="Theoretical", + ) + axes[1].hist( + obs_df["exact_compression"].values, + color=cpalette["orange"], + bins=nbins, + edgecolor="black", + alpha=0.7, + label="Observed", + ) + axes[1].set_xlabel("Length of barcodes after compression") + axes[1].set_ylabel("# of barcodes") + axes[1].spines[["right", "top"]].set_visible(False) + axes[1].set_xlim(left=0) + return fig, axes + + +def plot_spatial_qc_metric( + adata, + metric="total_counts", + puck_bead_size=1, + px_by_um=1, + x_breaks=None, + y_breaks=None, + x_mm_breaks=None, + y_mm_breaks=None, +): + fig, axes = plt.subplots(1, 1, figsize=(5, 5)) + sc.pl.spatial( + adata, + img_key=None, + size=puck_bead_size * 1.5, + spot_size=px_by_um, + color=metric, + ax=axes, + title=SPATIAL_METRICS_TITLES[metric], + show=False, + cmap="inferno", + ) + axes.spines[["right", "top"]].set_visible(False) + axes.set_xticks(x_breaks) + axes.set_xticklabels(x_mm_breaks) + axes.set_yticks(y_breaks) + axes.set_yticklabels(y_mm_breaks) + axes.set_ylabel("") + axes.set_xlabel("") + + return fig, axes + + +def plot_to_base64(fig): + my_stringIObytes = io.BytesIO() + fig.savefig(my_stringIObytes, format="jpg", dpi=300) + my_stringIObytes.seek(0) + return base64.b64encode(my_stringIObytes.read()).decode() + + +def generate_table_mapping_statistics( + complete_data_root: str, split_reads_read_type: str +): + # Initialize empty lists to store the filenames + bowtie_log_files = [] + star_log_files = [] + + # Iterate over the files in the folder + for filename in os.listdir(complete_data_root): + # Check if the file ends with .bam.log + if filename.endswith(".bam.log") and "final" not in filename: + bowtie_log_files.append(filename) + # Check if the file ends with .Log.final.out + elif filename.endswith(".Log.final.out") and filename != "star.Log.final.out": + star_log_files.append(filename) + + bowtie_logs = [] + for bowtie_log_file in bowtie_log_files: + bowtie_log = read_bowtie_log_file( + os.path.join(complete_data_root, bowtie_log_file) + ) + bowtie_log["name"] = bowtie_log_file.split(".")[0] + bowtie_log["mapper"] = "bowtie2" + bowtie_logs.append(bowtie_log) + + star_logs = [] + for star_log_file in star_log_files: + star_log = read_star_log_file(os.path.join(complete_data_root, star_log_file)) + star_log["name"] = star_log_file.split(".")[1] + star_log["mapper"] = "STAR" + star_logs.append(star_log) + + # we sort the mapping statistics by the number of input reads, merge into a single list + all_logs = star_logs + bowtie_logs + + idx_log = np.argsort([log["input_reads"] for log in all_logs])[::-1] + all_logs = [all_logs[idx] for idx in idx_log] + + reads_type = read_split_reads_file(split_reads_read_type) + reads_type["name"] = "reads_type" + all_logs += [reads_type] + + return all_logs + + +def load_dge_summary(obs_df_path, with_adata=True): + obs_df = pd.read_csv(obs_df_path) + empty_ad = csr_matrix((len(obs_df), 1), dtype=int) + adata = ad.AnnData(empty_ad) + adata.obs = obs_df + if with_adata: + return obs_df, adata + else: + return obs_df + + +def generate_qc_sequencing_metadata( + project_id: str, + sample_id: str, + dge_summary_paths: Union[ + list, dict + ], # from snakemake_helper_funtions.get_qc_sheet_input_files + complete_data_root: str, + split_reads_read_type: Union[str, list], + puck_barcode_file_id_qc: str, + is_spatial: bool = False, + run_modes: list = None, +): + if isinstance(dge_summary_paths, str): + dge_summary_paths = [dge_summary_paths] + if isinstance(run_modes, str): + run_modes = [run_modes] + if ( + isinstance(run_modes, list) + and isinstance(dge_summary_paths, list) + and len(run_modes) == len(dge_summary_paths) + ): + dge_summary_paths = { + run_mode: dge_summary_path + for run_mode, dge_summary_path in zip(run_modes, dge_summary_paths) + } + elif ( + isinstance(run_modes, list) + and isinstance(dge_summary_paths, list) + and len(run_modes) != len(dge_summary_paths) + ): + raise ValueError( + "'run_modes' and 'dge_summary_paths' must have the same length" + ) + + report = { + "type": "qc_report", + "sampleinfo": [], + "runmodes": [], + "mappingstats": [], + "summarisedmetrics": [], + "is_spatial": is_spatial, + "plots": [], + } + main_plots = { + "kneeplot": [], + "umicutoff": [], + "histogrambeads": [], + "nucleotidedistribution": [], + "shannonentropy": [], + "spatialqc": [], + } + report["plots"] = main_plots + + # Loading project_df for metadata + config = ConfigFile.from_yaml("config.yaml") + project_df = ProjectDF("project_df.csv", config=config) + + # Table: sample info + sample_info = project_df.get_sample_info(project_id, sample_id) + report["sampleinfo"] = {} + report["sampleinfo"]["project_id"] = project_id + report["sampleinfo"]["sample_id"] = sample_id + report["sampleinfo"]["puck_barcode_file_id"] = puck_barcode_file_id_qc + report["sampleinfo"].update({svar: sample_info[svar] for svar in SAMPLEINFO_VARS}) + + # Table: run modes + if run_modes is None: + run_modes = sample_info["run_mode"] + + for run_mode in run_modes: + run_mode_info = {} + run_mode_info["variables"] = project_df.config.get_run_mode(run_mode).variables + run_mode_info["name"] = run_mode + report["runmodes"].append(run_mode_info) + + # Table: mapping statistics + if isinstance(split_reads_read_type, str): + split_reads_read_type = [split_reads_read_type] + + for s in split_reads_read_type: + # assumes that the directory of s is the name of the mapping strategy + mappingstat = {'name': os.path.basename(os.path.dirname(s)), 'all_stats': generate_table_mapping_statistics(complete_data_root, s)} + report["mappingstats"].append(mappingstat) + + # Loading all dge data summaries + # Table: summarised metrics over beads + dge_summaries = {} + for run_mode in run_modes: + dge_summaries[run_mode] = {} + summarised_metrics = {"name": run_mode, "variables": None} + _dge_summary = load_dge_summary(dge_summary_paths[run_mode]) + dge_summaries[run_mode]["df"] = _dge_summary[0] + dge_summaries[run_mode]["adata"] = _dge_summary[1] + + obs_df = dge_summaries[run_mode]["df"] + metrics_beads = ( + obs_df[["n_genes_by_counts", "reads_per_counts", "n_reads", "total_counts"]] + .median() + .to_dict() + ) + metrics_beads["n_beads"] = len(obs_df) + metrics_beads["sum_reads"] = obs_df["n_reads"].sum() + summarised_metrics["variables"] = metrics_beads + report["summarisedmetrics"].append(summarised_metrics) + + # Plots + # 'Knee'-plot + for run_mode, dge_summary in dge_summaries.items(): + kneeplot = {"name": run_mode, "plot": None} + obs_df = dge_summary["df"] + fig, axis = plt.subplots(1, 1, figsize=(5, 3)) + axis.plot( + np.arange(len(obs_df)), np.cumsum(obs_df["n_reads"].values), color="black", linewidth=1 + ) + axis.set_ylabel("Cumulative\nsum of reads") + axis.set_xlabel("Beads sorted by number of reads") + axis.spines[["right", "top"]].set_visible(False) + plt.tight_layout() + kneeplot["plot"] = plot_to_base64(fig) + report["plots"]["kneeplot"].append(kneeplot) + + # UMI cutoff plots + for run_mode, dge_summary in dge_summaries.items(): + umicutoff = {"name": run_mode, "plot": None} + obs_df = dge_summary["df"] + fig, axes = plot_umi_cutoff(obs_df) + umicutoff["plot"] = plot_to_base64(fig) + report["plots"]["umicutoff"].append(umicutoff) + + # Histogram beads + for run_mode, dge_summary in dge_summaries.items(): + histogrambeads = {"name": run_mode, "plot": None} + obs_df = dge_summary["df"] + fig, axes = plot_histogram_beads(obs_df) + histogrambeads["plot"] = plot_to_base64(fig) + report["plots"]["histogrambeads"].append(histogrambeads) + + # TODO: + # Nucleotide distribution per beads + + # Shannon entropy + for run_mode, dge_summary in dge_summaries.items(): + shannonentropy = {"name": run_mode, "plot": None} + obs_df = dge_summary["df"] + fig, axes = plot_entropy_compression(obs_df) + shannonentropy["plot"] = plot_to_base64(fig) + report["plots"]["shannonentropy"].append(shannonentropy) + + # Return here if not spatial + if not is_spatial: + return report + + # Proceed with Spatial QC plots + # Variables for spatial + px_by_um, puck_bead_size = 1, 1 + x_breaks, y_breaks = None, None + x_mm_breaks, y_mm_breaks = None, None + puck_width_um = 1 + + for run_mode, dge_summary in dge_summaries.items(): + # Loading adata for scanpy plotting + adata = dge_summary["adata"] + adata.obsm["spatial"] = adata.obs[["x_pos", "y_pos"]].values + report["n_cells"] = len(adata) + + # Loading metadata from spatial pucks and run mode + puck_metrics = ( + project_df.get_puck_barcode_file_metrics( + project_id=project_id, + sample_id=sample_id, + puck_barcode_file_id=puck_barcode_file_id_qc, + ), + )[0] + + puck_settings = project_df.get_puck_variables( + project_id, sample_id, return_empty=True + ) + + run_mode_vars = project_df.config.get_run_mode(run_mode).variables + + px_by_um = puck_metrics["px_by_um"] + mesh_spot_diameter_um = run_mode_vars["mesh_spot_diameter_um"] + meshed = run_mode_vars["mesh_data"] + spot_diameter_um = puck_settings["spot_diameter_um"] + + # Set limits and axes units for the spatial plots + x_limits = adata.obsm["spatial"][:, 0].min(), adata.obsm["spatial"][:, 0].max() + y_limits = adata.obsm["spatial"][:, 1].min(), adata.obsm["spatial"][:, 1].max() + puck_width_um = (x_limits[1] - x_limits[0]) / px_by_um + + ratio = (x_limits[1] - x_limits[0]) / (y_limits[1] - y_limits[0]) + + scale_factor = 2 if puck_width_um < 3000 else 3 + mm_dist = max(10**scale_factor, round(puck_width_um / 3, scale_factor)) + mm_diff = mm_dist / 1000 + + def_plot_bead_size = 0.5 if report["n_cells"] > 5000 else 0.75 + def_plot_bead_size = 0.1 if report["n_cells"] > 10000 else def_plot_bead_size + def_plot_bead_size = 0.05 if report["n_cells"] > 25000 else def_plot_bead_size + + puck_bead_size = max( + def_plot_bead_size, mesh_spot_diameter_um if meshed else spot_diameter_um + ) + x_mm_breaks = np.arange(0, puck_width_um, mm_dist) + x_mm_breaks = [f"{round(i, 1)} mm" for i in x_mm_breaks * mm_diff / mm_dist] + y_mm_breaks = np.arange(0, puck_width_um / ratio, mm_dist) + y_mm_breaks = [f"{round(i, 1)} mm" for i in y_mm_breaks * mm_diff / mm_dist] + + x_breaks = np.arange(x_limits[0], x_limits[1], px_by_um * mm_dist) + y_breaks = np.arange(y_limits[0], y_limits[1], px_by_um * mm_dist) + + # Plot spatial metrics + spatialqc = {"name": run_mode, "plots": []} + for spatial_metric in SPATIAL_METRICS: + fig, axes = plot_spatial_qc_metric( + adata, + spatial_metric, + puck_bead_size=puck_bead_size, + px_by_um=px_by_um, + x_breaks=x_breaks, + y_breaks=y_breaks, + x_mm_breaks=x_mm_breaks, + y_mm_breaks=y_mm_breaks, + ) + spatialqc["plots"].append(plot_to_base64(fig)) + report["plots"]["spatialqc"].append(spatialqc) + + return report + + +def generate_html_report(data, template_file): + with open(template_file, "r") as template_data: + template_content = template_data.read() + template = Template(template_content) + + html_report = template.render(report=data) + + return html_report + + +@message_aggregation(logger_name) +def cmdline(): + """cmdline.""" + import argparse + + parser = argparse.ArgumentParser( + allow_abbrev=False, + description="generate spacemake's 'QC sequencing' with python", + ) + parser = setup_parser(parser) + + args = parser.parse_args() + template_file = os.path.join(absolute_path, "../templates/qc_sequencing.html") + + # TODO: implement dge_summary_paths + + report_metadata = generate_qc_sequencing_metadata( + args.project_id, + args.sample_id, + args.dge_summary_paths, + args.complete_data_root, + args.split_reads_read_type, + args.puck_barcode_file_id_qc, + args.is_spatial, + args.run_modes, + ) + + html_report = generate_html_report(report_metadata, template_file) + + with open(args.out_html_report, "w") as output: + output.write(html_report) + + +if __name__ == "__main__": + cmdline() From cd91ae07392ff2ed5e3c6e32e3180178173bcaae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Le=C3=B3n-Perin=C3=A1n?= Date: Mon, 18 Sep 2023 11:08:46 +0200 Subject: [PATCH 04/20] reorganize report folder structure --- spacemake/report/classes/__init__.py | 0 .../report/classes/automated_analysis.py | 470 ---------- spacemake/report/classes/qc_sequencing.py | 829 ------------------ 3 files changed, 1299 deletions(-) delete mode 100644 spacemake/report/classes/__init__.py delete mode 100644 spacemake/report/classes/automated_analysis.py delete mode 100644 spacemake/report/classes/qc_sequencing.py diff --git a/spacemake/report/classes/__init__.py b/spacemake/report/classes/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/spacemake/report/classes/automated_analysis.py b/spacemake/report/classes/automated_analysis.py deleted file mode 100644 index 91adfcb3..00000000 --- a/spacemake/report/classes/automated_analysis.py +++ /dev/null @@ -1,470 +0,0 @@ -import base64 -import io -import logging -import os - -import anndata as ad -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -import scanpy as sc -from jinja2 import Template -from scipy.sparse import csr_matrix -from spacemake.config import ConfigFile -from spacemake.project_df import ProjectDF -from spacemake.util import message_aggregation - -absolute_path = os.path.dirname(__file__) - -cpalette = { - "grey": "#999999", - "light_orange": "#E69F00", - "light_blue": "#56B4E9", - "green": "#009E73", - "yellow": "#F0E442", - "blue": "#0072B2", - "orange": "#D55E00", - "pink": "#CC79A7", -} - -clrs = { - "umis": cpalette["light_orange"], - "genes": cpalette["light_blue"], - "reads": cpalette["green"], - "pcr": cpalette["pink"], - "pct_counts_mt": "black", -} - - -logger_name = "spacemake.report.automated_analysis" -logger = logging.getLogger(logger_name) - - -def setup_parser(parser): - """ - Set up command-line arguments for the script. - - :param parser: Argument parser object. - :type parser: argparse.ArgumentParser - :returns: Updated argument parser object. - :rtype: argparse.ArgumentParser - """ - # These allow to get the run_mode_variables.* from the config file - # and pbf_metrics.px_by_um via project_df.puck_barcode_file_metrics - parser.add_argument( - "--project-id", - type=str, - help="the project_id in spacemake project_df.csv", - required=True, - ) - parser.add_argument( - "--sample-id", - type=str, - help="the sample_id in spacemake project_df.csv", - required=True, - ) - parser.add_argument( - "--run-mode", - type=str, - help="one of the run_mode from the project+sample in project_df.csv", - required=True, - ) - parser.add_argument( - "--puck-barcode-file-id-qc", - type=str, - help="a path to the puck_barcode_file_id_qc", - required=True, - ) - # These have the paths to the input files used for generating plots - parser.add_argument( - "--obs-df", - type=str, - help="path to the csv file that stores properties per observation (per cell)", - required=True, - ) - parser.add_argument( - "--var-df", - type=str, - help="path to the csv file that stores properties per variable (per gene)", - required=True, - ) - parser.add_argument( - "--neighborhood-enrichment", - type=str, - help="path to the csv file with the neighborhood enrichment results", - required=False, - default="", - ) - # These additionally configure the plotting functionality - parser.add_argument( - "--is-spatial", - type=str, - help="Whether the current sample is spatial or not", - required=True, - choices=["True", "False"], - default="False", - ) - # This specifies the UMI filter for the current report - parser.add_argument( - "--umi-cutoff", - type=str, - help="one of the UMI cutoffs from the --run-mode", - required=True, - ) - # This specifies where the output file will be generated - parser.add_argument( - "--out-html-report", - type=str, - help="where the HTML report will be saved", - required=True, - ) - - return parser - - -def plot_metric(values, axis, nbins=100, color="#000000"): - # decide linear or logarithmic scale - min_difference = values.max() - values.min() - - hist, bins = np.histogram(values, bins=nbins) - if np.abs(min_difference) < 100: - axis.bar(bins[:-1], hist, color=color) - - else: - logbins = np.logspace(np.log10(bins[0] + 1), np.log10(bins[-1]), nbins) - axis.hist(values, bins=logbins, color=color) - axis.set_xscale("log") - - axis.spines[["right", "top"]].set_visible(False) - - -def plot_to_base64(fig): - my_stringIObytes = io.BytesIO() - fig.savefig(my_stringIObytes, format="jpg") - my_stringIObytes.seek(0) - return base64.b64encode(my_stringIObytes.read()).decode() - - -def generate_automated_analysis_metadata( - project_id: str, - sample_id: str, - run_mode: str, - puck_barcode_file_id_qc: str, - obs_df_path: str, - var_df_path: str, - neighborhood_enrichment: str, - is_spatial: bool = False, - umi_cutoff: int = 0, -): - report = { - "table_overview": [], - "data_complete": False, - "n_cells": 0, - "umi_cutoff": 0, - "n_genes": 0, - "analyzed_clustering_resolutions": [], - "is_spatial": False, - "plots": [], - } - main_plots = { - "histogram_metrics_spatial_units": "", - "umi_spatial_unit_original": "", - "umi_spatial_unit_scaled": "", - } - - # Loading data - obs_df = pd.read_csv(obs_df_path) - var_df = pd.read_csv(var_df_path) - - # Loading project_df for metadata - config = ConfigFile.from_yaml("config.yaml") - project_df = ProjectDF("project_df.csv", config=config) - - # Create anndata for scanpy plotting - empty_ad = csr_matrix((len(obs_df), len(var_df)), dtype=int) - adata = ad.AnnData(empty_ad) - adata.obs = obs_df - adata.var = var_df - adata.obsm["X_umap"] = adata.obs[["umap_0", "umap_1"]].values - - report["n_genes"] = len(var_df) - report["n_cells"] = len(obs_df) - - report["umi_cutoff"] = umi_cutoff - report["data_complete"] = True if np.any(obs_df.columns.str.startswith("leiden")) else False - report["is_spatial"] = is_spatial - - report["plots"] = main_plots - - # Plot Histogram of metrics over spatial units - fig, axes = plt.subplots(3, 2, figsize=(7, 6)) - plot_metric(obs_df["n_reads"], axes[0, 0], 100, clrs["reads"]) - axes[0, 0].set_ylabel("# of\nspatial units") - axes[0, 0].set_xlabel("# of reads") - plot_metric(obs_df["reads_per_counts"], axes[0, 1], 100, clrs["pcr"]) - axes[0, 1].set_ylabel("# of\nspatial units") - axes[0, 1].set_xlabel("# of reads/UMI") - plot_metric(obs_df["n_genes_by_counts"], axes[1, 0], 100, clrs["genes"]) - axes[1, 0].set_ylabel("# of\nspatial units") - axes[1, 0].set_xlabel("# of genes") - plot_metric(obs_df["total_counts"], axes[1, 1], 100, clrs["umis"]) - axes[1, 1].set_ylabel("# of\nspatial units") - axes[1, 1].set_xlabel("# of UMIs") - plot_metric(obs_df["pct_counts_mt"], axes[2, 0], 100, clrs["pct_counts_mt"]) - axes[2, 0].set_ylabel("# of\nspatial units") - axes[2, 0].set_xlabel("% of mitochondrial counts") - axes[2, 1].axis("off") - plt.tight_layout() - main_plots["histogram_metrics_spatial_units"] = plot_to_base64(fig) - - # Variables for spatial - px_by_um, puck_bead_size = 1, 1 - x_breaks, y_breaks = None, None - x_mm_breaks, y_mm_breaks = None, None - puck_width_um = 1 - - # Distribution of UMIs in 2D - if is_spatial: - # Loading metadata from spatial pucks and run mode - puck_metrics = ( - project_df.get_puck_barcode_file_metrics( - project_id=project_id, - sample_id=sample_id, - puck_barcode_file_id=puck_barcode_file_id_qc, - ), - )[0] - - puck_settings = project_df.get_puck_variables( - project_id, sample_id, return_empty=True - ) - - run_mode_vars = project_df.config.get_run_mode(run_mode).variables - - print(puck_metrics) - print(puck_settings) - print(run_mode_vars) - - px_by_um = puck_metrics["px_by_um"] - mesh_spot_diameter_um = run_mode_vars["mesh_spot_diameter_um"] - meshed = run_mode_vars["mesh_data"] - spot_diameter_um = puck_settings["spot_diameter_um"] - - adata.obsm["spatial"] = adata.obs[["x_pos", "y_pos"]].values - - # Set limits and axes units for the spatial plots - x_limits = adata.obsm["spatial"][:, 0].min(), adata.obsm["spatial"][:, 0].max() - y_limits = adata.obsm["spatial"][:, 1].min(), adata.obsm["spatial"][:, 1].max() - puck_width_um = (x_limits[1] - x_limits[0]) / px_by_um - - ratio = (x_limits[1] - x_limits[0]) / (y_limits[1] - y_limits[0]) - - scale_factor = 2 if puck_width_um < 3000 else 3 - mm_dist = max(10**scale_factor, round(puck_width_um / 3, scale_factor)) - mm_diff = mm_dist / 1000 - - def_plot_bead_size = 0.5 if report["n_cells"] > 5000 else 0.75 - def_plot_bead_size = 0.1 if report["n_cells"] > 10000 else def_plot_bead_size - def_plot_bead_size = 0.05 if report["n_cells"] > 25000 else def_plot_bead_size - - puck_bead_size = max( - def_plot_bead_size, mesh_spot_diameter_um if meshed else spot_diameter_um - ) - x_mm_breaks = np.arange(0, puck_width_um, mm_dist) - x_mm_breaks = [f"{round(i, 1)} mm" for i in x_mm_breaks * mm_diff / mm_dist] - y_mm_breaks = np.arange(0, puck_width_um / ratio, mm_dist) - y_mm_breaks = [f"{round(i, 1)} mm" for i in y_mm_breaks * mm_diff / mm_dist] - - x_breaks = np.arange(x_limits[0], x_limits[1], px_by_um * mm_dist) - y_breaks = np.arange(y_limits[0], y_limits[1], px_by_um * mm_dist) - - # Plot spatial UMI (original) - fig, axes = plt.subplots(1, 1, figsize=(5, 5)) - sc.pl.spatial( - adata, - img_key=None, - size=puck_bead_size * 1.5, - spot_size=px_by_um, - color="total_counts", - ax=axes, - show=False, - cmap="inferno", - ) - axes.spines[["right", "top"]].set_visible(False) - axes.set_xticks(x_breaks) - axes.set_xticklabels(x_mm_breaks) - axes.set_yticks(y_breaks) - axes.set_yticklabels(y_mm_breaks) - axes.set_ylabel("") - axes.set_xlabel("") - - main_plots["umi_spatial_unit_original"] = plot_to_base64(fig) - - # Plot spatial UMI (rescaled) - - fig, axes = plt.subplots(1, 1, figsize=(5, 5)) - sc.pl.spatial( - adata, - img_key=None, - size=puck_bead_size * 1.5, - spot_size=px_by_um, - color="total_counts", - ax=axes, - show=False, - cmap="inferno", - vmax=np.quantile(adata.obs["total_counts"], 0.9), - ) - axes.spines[["right", "top"]].set_visible(False) - axes.set_xticks(x_breaks) - axes.set_xticklabels(x_mm_breaks) - axes.set_yticks(y_breaks) - axes.set_yticklabels(y_mm_breaks) - axes.set_ylabel("") - axes.set_xlabel("") - - main_plots["umi_spatial_unit_scaled"] = plot_to_base64(fig) - - if report["data_complete"]: - resolutions = np.unique( - pd.Series(obs_df.columns[obs_df.columns.str.startswith("leiden")]) - .apply(lambda x: x.split("_")[1]) - .values - ) - for i, resolution in enumerate(resolutions): - _current_analyzed_clustering_resolution = { - "resolution": "", - "resolution_id": "", - "plots": {"umap": "", "spatial": "", "neighbor_enrichment": ""}, - } - - _current_analyzed_clustering_resolution['resolution'] = f'{resolution} resolution' - _current_analyzed_clustering_resolution['resolution_id'] = f'rid_{i}' - - # Plot UMAP for current resolution - adata.obs[f"leiden_{resolution}"] = pd.Categorical( - adata.obs[f"leiden_{resolution}"] - ) - fig, axes = plt.subplots(1, 1, figsize=(8, 5)) - sc.pl.umap(adata, color=f"leiden_{resolution}", ax=axes, show=False) - axes.spines[["right", "top", "left", "bottom"]].set_visible(False) - axes.set_ylabel("UMAP 0") - axes.set_xlabel("UMAP 1") - plt.tight_layout() - _current_analyzed_clustering_resolution["plots"]["umap"] = plot_to_base64( - fig - ) - - # Spatial plot and neighborhood enrichment (if spatial) - if is_spatial: - # Plot Spatial clusters - fig, axes = plt.subplots(1, 1, figsize=(8, 5)) - sc.pl.spatial( - adata, - img_key=None, - size=puck_bead_size * 1.5, - spot_size=px_by_um, - color=f"leiden_{resolution}", - ax=axes, - show=False, - ) # marker can be "o" or "h" if is hexagonal - axes.spines[["right", "top"]].set_visible(False) - axes.set_xticks(x_breaks) - axes.set_xticklabels(x_mm_breaks) - axes.set_yticks(y_breaks) - axes.set_yticklabels(y_mm_breaks) - axes.set_ylabel("") - axes.set_xlabel("") - plt.tight_layout() - _current_analyzed_clustering_resolution["plots"][ - "spatial" - ] = plot_to_base64(fig) - - # Plot Neighborhood enrichment - nhood_enrich = pd.read_csv(neighborhood_enrichment) - nhood_enrich_current_res = nhood_enrich[ - nhood_enrich["resolution"].astype(str) == str(resolution) - ] - ne_data = np.zeros( - ( - nhood_enrich_current_res["cluster_a"].max() + 1, - nhood_enrich_current_res["cluster_b"].max() + 1, - ) - ) - ne_data[ - nhood_enrich_current_res["cluster_a"], - nhood_enrich_current_res["cluster_b"], - ] = nhood_enrich_current_res["zscore"] - - fig, axes = plt.subplots(1, 1, figsize=(4, 4)) - plmat = axes.matshow( - ne_data, cmap="inferno", vmin=-50, vmax=100, origin="lower" - ) - cbar = plt.colorbar(plmat, fraction=0.046) - cbar.set_label("Neighbor enrichment score") - axes.set_xlabel("cluster identity") - axes.set_ylabel("cluster identity") - axes.spines[["right", "top"]].set_visible(False) - axes.xaxis.tick_bottom() - plt.tight_layout() - _current_analyzed_clustering_resolution["plots"][ - "neighbor_enrichment" - ] = plot_to_base64(fig) - - report["analyzed_clustering_resolutions"].append( - _current_analyzed_clustering_resolution.copy() - ) - - report["table_overview"] = [ - {"name": "UMI filter", "value": umi_cutoff}, - {"name": "# genes in data", "value": report["n_genes"]}, - {"name": "# of spots in data", "value": report["n_cells"]}, - {"name": "median UMI", "value": obs_df["total_counts"].median()}, - {"name": "median genes", "value": obs_df["n_genes_by_counts"].median()}, - {"name": "puck width (um)", "value": puck_width_um}, - ] - - return report - - -def generate_html_report(data, template_file): - with open(template_file, "r") as template_data: - template_content = template_data.read() - template = Template(template_content) - - html_report = template.render(report=data) - - return html_report - - -@message_aggregation(logger_name) -def cmdline(): - """cmdline.""" - import argparse - - parser = argparse.ArgumentParser( - allow_abbrev=False, - description="generate spacemake's 'automated analysis' with python", - ) - parser = setup_parser(parser) - - args = parser.parse_args() - template_file = os.path.join(absolute_path, "../templates/automated_analysis.html") - - report_metadata = generate_automated_analysis_metadata( - args.project_id, - args.sample_id, - args.run_mode, - args.puck_barcode_file_id_qc, - args.obs_df, - args.var_df, - args.neighborhood_enrichment, - args.is_spatial, - args.umi_cutoff, - ) - html_report = generate_html_report(report_metadata, template_file) - - with open(args.out_html_report, "w") as output: - output.write(html_report) - - -if __name__ == "__main__": - cmdline() \ No newline at end of file diff --git a/spacemake/report/classes/qc_sequencing.py b/spacemake/report/classes/qc_sequencing.py deleted file mode 100644 index feb2e3ed..00000000 --- a/spacemake/report/classes/qc_sequencing.py +++ /dev/null @@ -1,829 +0,0 @@ -import base64 -import io -import logging -import os -from typing import Union - -import anndata as ad -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -import scanpy as sc -from jinja2 import Template -from scipy.sparse import csr_matrix -from spacemake.config import ConfigFile -from spacemake.project_df import ProjectDF -from spacemake.util import message_aggregation - -absolute_path = os.path.dirname(__file__) - -cpalette = { - "grey": "#999999", - "light_orange": "#E69F00", - "light_blue": "#56B4E9", - "green": "#009E73", - "yellow": "#F0E442", - "blue": "#0072B2", - "orange": "#D55E00", - "pink": "#CC79A7", -} - -clrs = { - "umis": cpalette["light_orange"], - "genes": cpalette["light_blue"], - "reads": cpalette["green"], - "pcr": cpalette["pink"], - "pct_counts_mt": "black", -} - -SAMPLEINFO_VARS = [ - "species", - "sequencing_date", - "investigator", - "experiment", -] -SPATIAL_METRICS = [ - "n_genes_by_counts", - "total_counts", - "pct_counts_mt", - "n_reads", - "reads_per_counts", - "n_joined", - "exact_entropy", - "exact_compression", -] -SPATIAL_METRICS_TITLES = { - "n_genes_by_counts": "# of genes per spatial unit", - "total_counts": "# of UMIs per spatial unit", - "pct_counts_mt": "# % mt counts per spatial unit", - "n_reads": "# of reads per spatial unit", - "reads_per_counts": "reads/UMI per spatial unit", - "n_joined": "# beads joined per spatial unit", - "exact_entropy": "Shannon entropy per spatial unit", - "exact_compression": "barcode length after compression per spatial unit", -} - -logger_name = "spacemake.report.qc_sequencing" -logger = logging.getLogger(logger_name) - -# TODO: check the IDs for the plots grouped by tabs -# TODO: add the nucleotide quartiles for the non-meshed modes -# TODO: add legend to the entropy plots - -# How to run, example: -# python /home/dleonpe/spacemake_project/repos/spacemake/spacemake/report/classes/qc_sequencing.py \ -# --project-id fc_sts_76 --sample-id fc_sts_76_3 --run-modes fc_sts_novaseq_SP_mesh_7um \ -# --dge-summary-paths /home/dleonpe/spacemake_project/data/1_spacemake_mouse_new/projects/fc_sts_76/processed_data/fc_sts_76_3/illumina/complete_data/dge/dge.all.polyA_adapter_trimmed.mm_included.spatial_beads.mesh_7_hexagon_fc_010_L3_tile_2157.obs.csv \ -# --complete-data-root /home/dleonpe/spacemake_project/data/1_spacemake_mouse_new/projects/fc_sts_76/processed_data/fc_sts_76_3/illumina/complete_data \ -# --split-reads-read-type /home/dleonpe/spacemake_project/data/1_spacemake_mouse_new/projects/fc_sts_76/processed_data/fc_sts_76_3/illumina/complete_data/split_reads.polyA_adapter_trimmed/read_type_num.txt \ -# --is-spatial True --out-html-report /home/dleonpe/spacemake_project/repos/spacemake/spacemake/report/classes/fc_sts_76_3_qc_sequencing_new.html \ -# --puck-barcode-file-id-qc fc_010_L3_tile_2157 - - -def setup_parser(parser): - """ - Set up command-line arguments for the script. - - :param parser: Argument parser object. - :type parser: argparse.ArgumentParser - :returns: Updated argument parser object. - :rtype: argparse.ArgumentParser - """ - # These allow to get the run_mode_variables.* from the config file - # and pbf_metrics.px_by_um via project_df.puck_barcode_file_metrics - parser.add_argument( - "--project-id", - type=str, - help="the project_id in spacemake project_df.csv", - required=True, - ) - parser.add_argument( - "--sample-id", - type=str, - help="the sample_id in spacemake project_df.csv", - required=True, - ) - parser.add_argument( - "--puck-barcode-file-id-qc", - type=str, - help="a path to the puck_barcode_file_id_qc", - required=True, - ) - # These have the paths to the input files used for generating plots - parser.add_argument( - "--dge-summary-paths", - type=str, - nargs="+", - help="paths to the dge summary files, must be in same number and order as --run-modes", - required=True, - ) - parser.add_argument( - "--run-modes", - type=str, - nargs="+", - help="run modes of the sample for which the report will be generated", - required=False, - default=None, - ) - parser.add_argument( - "--complete-data-root", - type=str, - help="path to where the sample data is stored", - required=True, - ) - parser.add_argument( - "--split-reads-read-type", - type=str, - nargs="+", - help="path to the 'read_type_num' file(s) generated by read annotator. One per mapping strategy.", - required=True, - ) - # These additionally configure the plotting functionality - parser.add_argument( - "--is-spatial", - type=str, - help="Whether the current sample is spatial or not", - required=True, - choices=["True", "False"], - default="False", - ) - # This specifies where the output file will be generated - parser.add_argument( - "--out-html-report", - type=str, - help="where the HTML report will be saved", - required=True, - ) - - return parser - - -def reverse_readline(filename, buf_size=8192): - """A generator that returns the lines of a file in reverse order""" - with open(filename, "rb") as fh: - segment = None - offset = 0 - fh.seek(0, os.SEEK_END) - file_size = remaining_size = fh.tell() - while remaining_size > 0: - offset = min(file_size, offset + buf_size) - fh.seek(file_size - offset) - buffer = fh.read(min(remaining_size, buf_size)).decode(encoding="utf-8") - remaining_size -= buf_size - lines = buffer.split("\n") - # The first line of the buffer is probably not a complete line so - # we'll save it and append it to the last line of the next buffer - # we read - if segment is not None: - # If the previous chunk starts right from the beginning of line - # do not concat the segment to the last line of new chunk. - # Instead, yield the segment first - if buffer[-1] != "\n": - lines[-1] += segment - else: - yield segment - segment = lines[0] - for index in range(len(lines) - 1, 0, -1): - if lines[index]: - yield lines[index] - # Don't yield None if the file was empty - if segment is not None: - yield segment - - -def read_star_log_file(log_file): - file = os.path.join(log_file) - - if not os.path.isfile(file): - return (1, 1) - - star_stats = { - "input_reads": 0, - "uniq_mapped_reads": 0, - "multi_mapped_reads": 0, - "too_many_mapped_reads": 0, - "too_many_mapped_reads": 0, - "unmapped_too_short": 0, - "unmapped_other": 0, - "chimeric": 0, - } - - log_name_stat = { - "Number of input reads": "input_reads", - "Uniquely mapped reads number": "uniq_mapped_reads", - "Number of reads mapped to multiple loci": "multi_mapped_reads", - "Number of reads mapped to too many loci": "too_many_mapped_reads", - "Number of reads unmapped: too many mismatches": "unmapped_mismatches", - "Number of reads unmapped: too short": "unmapped_too_short", - "Number of reads unmapped: other": "unmapped_other", - "Number of chimeric reads": "chimeric", - "Average mapped length": "avg_mapped_length" - } - - with open(file) as fi: - idx = 0 - for line in fi: - _log_id = line.strip().split("|")[0].strip() - if _log_id in log_name_stat.keys(): - if _log_id == "Average mapped length": - star_stats[log_name_stat[_log_id]] = line.strip().split("|")[1] - else: - star_stats[log_name_stat[_log_id]] = round(int(line.strip().split("|")[1])/1e6, 2) - # Do not convert to millions the Average mapped length - - idx += 1 - - return star_stats - - -def read_split_reads_file(split_reads_file): - file = os.path.join(split_reads_file) - - read_types = dict() - with open(file) as fi: - for line in fi: - line = line.strip().split(" ") - read_types[line[0]] = round(int(line[1])/1e6, 2) - - return read_types - - -def read_bowtie_log_file(log_file): - file = os.path.join(log_file) - bowtie_stats = { - "input_reads": 0, - "unique_aligned": 0, - "multimapper": 0, - "unaligned": 0, - } - log_line_stat = { - 5: "input_reads", - 3: "unaligned", - 2: "unique_aligned", - 1: "multimapper", - } - max_line = max(log_line_stat.keys()) - - idx = 0 - for line in reverse_readline(file): - if idx in log_line_stat.keys(): - bowtie_stats[log_line_stat[idx]] = round(int(line.strip().split(" ")[0])/1e6, 2) - idx += 1 - if max_line < idx: - break - - return bowtie_stats - - -def plot_metric(values, axis, nbins=100, color="#000000"): - # decide linear or logarithmic scale - min_difference = values.max() - values.min() - - hist, bins = np.histogram(values, bins=nbins) - if np.abs(min_difference) < 100: - axis.bar(bins[:-1], hist, color=color) - - else: - logbins = np.logspace(np.log10(bins[0] + 1), np.log10(bins[-1]), nbins) - axis.hist(values, bins=logbins, color=color) - axis.set_xscale("log") - - axis.spines[["right", "top"]].set_visible(False) - - -def plot_umi_cutoff(obs_df): - umi_cutoffs = np.arange(10, 20000, 10) - - def summarise_dge_summary(df, umi_cutoff): - df_filter = df[df["total_counts"] > umi_cutoff] - df_filter["n_beads"] = len(df_filter) - df_summary = df_filter[ - [ - "n_reads", - "total_counts", - "n_genes_by_counts", - "reads_per_counts", - "n_beads", - ] - ].median() - - return df_summary - - umi_cutoff_data = pd.DataFrame( - np.vstack( - [ - summarise_dge_summary(obs_df, umi_cutoff).values - for umi_cutoff in umi_cutoffs - ] - ), - columns=[ - "n_reads", - "total_counts", - "n_genes_by_counts", - "reads_per_counts", - "n_beads", - ], - ) - umi_cutoff_data["umi_cutoffs"] = umi_cutoffs - - fig, axes = plt.subplots(2, 3, figsize=(8, 4)) - axes[0, 0].plot( - umi_cutoff_data["umi_cutoffs"], - umi_cutoff_data["n_beads"], - color="black", - linewidth=1, - ) - axes[0, 0].set_ylabel("number of\nspatial units") - axes[0, 0].set_xlabel("minimum UMI") - axes[0, 0].set_yscale("log") - axes[0, 0].set_xscale("log") - axes[0, 0].spines[["right", "top"]].set_visible(False) - - axes[0, 1].plot( - umi_cutoff_data["umi_cutoffs"], - umi_cutoff_data["n_reads"], - color=clrs["reads"], - linewidth=1, - ) - axes[0, 1].set_ylabel("median reads\nper spatial unit") - axes[0, 1].set_xlabel("minimum UMI") - axes[0, 1].set_xscale("log") - axes[0, 1].spines[["right", "top"]].set_visible(False) - - axes[0, 2].plot( - umi_cutoff_data["umi_cutoffs"], - umi_cutoff_data["n_genes_by_counts"], - color=clrs["genes"], - linewidth=1, - ) - axes[0, 2].set_ylabel("median genes\nper spatial unit") - axes[0, 2].set_xlabel("minimum UMI") - axes[0, 2].set_xscale("log") - axes[0, 2].spines[["right", "top"]].set_visible(False) - - axes[1, 0].plot( - umi_cutoff_data["umi_cutoffs"], - umi_cutoff_data["total_counts"], - color=clrs["umis"], - linewidth=1, - ) - axes[1, 0].set_ylabel("median UMIs\nper spatial unit") - axes[1, 0].set_xlabel("minimum UMI") - axes[1, 0].set_xscale("log") - axes[1, 0].spines[["right", "top"]].set_visible(False) - - axes[1, 1].plot( - umi_cutoff_data["umi_cutoffs"], - umi_cutoff_data["reads_per_counts"], - color=clrs["pcr"], - linewidth=1, - ) - axes[1, 1].set_ylabel("median reads/UMI\nper spatial unit") - axes[1, 1].set_xlabel("minimum UMI") - axes[1, 1].set_xscale("log") - axes[1, 1].spines[["right", "top"]].set_visible(False) - axes[1, 2].axis("off") - - plt.tight_layout() - return fig, axes - - -def plot_histogram_beads(obs_df): - # One for each run mode - fig, axes = plt.subplots(2, 2, figsize=(7, 3.5)) - plot_metric(obs_df["n_reads"], axes[0, 0], 100, clrs["reads"]) - axes[0, 0].set_ylabel("# of\nspatial units") - axes[0, 0].set_xlabel("# of reads") - - reads_per_counts = obs_df["reads_per_counts"] - reads_per_counts = np.nan_to_num(reads_per_counts) - plot_metric(reads_per_counts, axes[0, 1], 100, clrs["pcr"]) - axes[0, 1].set_ylabel("# of\nspatial units") - axes[0, 1].set_xlabel("# of reads/UMI") - - plot_metric(obs_df["n_genes_by_counts"], axes[1, 0], 100, clrs["genes"]) - axes[1, 0].set_ylabel("# of\nspatial units") - axes[1, 0].set_xlabel("# of genes") - - plot_metric(obs_df["total_counts"], axes[1, 1], 100, clrs["umis"]) - axes[1, 1].set_ylabel("# of\nspatial units") - axes[1, 1].set_xlabel("# of UMIs") - - plt.tight_layout() - return fig, axes - - -def plot_entropy_compression(obs_df, nbins=30): - fig, axes = plt.subplots(2, 1, figsize=(7, 4)) - axes[0].hist( - obs_df["theoretical_entropy"].values, - color=cpalette["grey"], - bins=nbins, - edgecolor="black", - label="Theoretical", - ) - axes[0].hist( - obs_df["exact_entropy"].values, - color=cpalette["orange"], - bins=nbins, - edgecolor="black", - alpha=0.7, - label="Observed", - ) - axes[0].set_xlabel("Shannon entropy of barcodes") - axes[0].set_ylabel("# of barcodes") - axes[0].spines[["right", "top"]].set_visible(False) - - axes[1].hist( - obs_df["theoretical_compression"].values, - color=cpalette["grey"], - bins=nbins, - edgecolor="black", - label="Theoretical", - ) - axes[1].hist( - obs_df["exact_compression"].values, - color=cpalette["orange"], - bins=nbins, - edgecolor="black", - alpha=0.7, - label="Observed", - ) - axes[1].set_xlabel("Length of barcodes after compression") - axes[1].set_ylabel("# of barcodes") - axes[1].spines[["right", "top"]].set_visible(False) - axes[1].set_xlim(left=0) - return fig, axes - - -def plot_spatial_qc_metric( - adata, - metric="total_counts", - puck_bead_size=1, - px_by_um=1, - x_breaks=None, - y_breaks=None, - x_mm_breaks=None, - y_mm_breaks=None, -): - fig, axes = plt.subplots(1, 1, figsize=(5, 5)) - sc.pl.spatial( - adata, - img_key=None, - size=puck_bead_size * 1.5, - spot_size=px_by_um, - color=metric, - ax=axes, - title=SPATIAL_METRICS_TITLES[metric], - show=False, - cmap="inferno", - ) - axes.spines[["right", "top"]].set_visible(False) - axes.set_xticks(x_breaks) - axes.set_xticklabels(x_mm_breaks) - axes.set_yticks(y_breaks) - axes.set_yticklabels(y_mm_breaks) - axes.set_ylabel("") - axes.set_xlabel("") - - return fig, axes - - -def plot_to_base64(fig): - my_stringIObytes = io.BytesIO() - fig.savefig(my_stringIObytes, format="jpg", dpi=300) - my_stringIObytes.seek(0) - return base64.b64encode(my_stringIObytes.read()).decode() - - -def generate_table_mapping_statistics( - complete_data_root: str, split_reads_read_type: str -): - # Initialize empty lists to store the filenames - bowtie_log_files = [] - star_log_files = [] - - # Iterate over the files in the folder - for filename in os.listdir(complete_data_root): - # Check if the file ends with .bam.log - if filename.endswith(".bam.log") and "final" not in filename: - bowtie_log_files.append(filename) - # Check if the file ends with .Log.final.out - elif filename.endswith(".Log.final.out") and filename != "star.Log.final.out": - star_log_files.append(filename) - - bowtie_logs = [] - for bowtie_log_file in bowtie_log_files: - bowtie_log = read_bowtie_log_file( - os.path.join(complete_data_root, bowtie_log_file) - ) - bowtie_log["name"] = bowtie_log_file.split(".")[0] - bowtie_log["mapper"] = "bowtie2" - bowtie_logs.append(bowtie_log) - - star_logs = [] - for star_log_file in star_log_files: - star_log = read_star_log_file(os.path.join(complete_data_root, star_log_file)) - star_log["name"] = star_log_file.split(".")[1] - star_log["mapper"] = "STAR" - star_logs.append(star_log) - - # we sort the mapping statistics by the number of input reads, merge into a single list - all_logs = star_logs + bowtie_logs - - idx_log = np.argsort([log["input_reads"] for log in all_logs])[::-1] - all_logs = [all_logs[idx] for idx in idx_log] - - reads_type = read_split_reads_file(split_reads_read_type) - reads_type["name"] = "reads_type" - all_logs += [reads_type] - - return all_logs - - -def load_dge_summary(obs_df_path, with_adata=True): - obs_df = pd.read_csv(obs_df_path) - empty_ad = csr_matrix((len(obs_df), 1), dtype=int) - adata = ad.AnnData(empty_ad) - adata.obs = obs_df - if with_adata: - return obs_df, adata - else: - return obs_df - - -def generate_qc_sequencing_metadata( - project_id: str, - sample_id: str, - dge_summary_paths: Union[ - list, dict - ], # from snakemake_helper_funtions.get_qc_sheet_input_files - complete_data_root: str, - split_reads_read_type: Union[str, list], - puck_barcode_file_id_qc: str, - is_spatial: bool = False, - run_modes: list = None, -): - if isinstance(dge_summary_paths, str): - dge_summary_paths = [dge_summary_paths] - if isinstance(run_modes, str): - run_modes = [run_modes] - if ( - isinstance(run_modes, list) - and isinstance(dge_summary_paths, list) - and len(run_modes) == len(dge_summary_paths) - ): - dge_summary_paths = { - run_mode: dge_summary_path - for run_mode, dge_summary_path in zip(run_modes, dge_summary_paths) - } - elif ( - isinstance(run_modes, list) - and isinstance(dge_summary_paths, list) - and len(run_modes) != len(dge_summary_paths) - ): - raise ValueError( - "'run_modes' and 'dge_summary_paths' must have the same length" - ) - - report = { - "type": "qc_report", - "sampleinfo": [], - "runmodes": [], - "mappingstats": [], - "summarisedmetrics": [], - "is_spatial": is_spatial, - "plots": [], - } - main_plots = { - "kneeplot": [], - "umicutoff": [], - "histogrambeads": [], - "nucleotidedistribution": [], - "shannonentropy": [], - "spatialqc": [], - } - report["plots"] = main_plots - - # Loading project_df for metadata - config = ConfigFile.from_yaml("config.yaml") - project_df = ProjectDF("project_df.csv", config=config) - - # Table: sample info - sample_info = project_df.get_sample_info(project_id, sample_id) - report["sampleinfo"] = {} - report["sampleinfo"]["project_id"] = project_id - report["sampleinfo"]["sample_id"] = sample_id - report["sampleinfo"]["puck_barcode_file_id"] = puck_barcode_file_id_qc - report["sampleinfo"].update({svar: sample_info[svar] for svar in SAMPLEINFO_VARS}) - - # Table: run modes - if run_modes is None: - run_modes = sample_info["run_mode"] - - for run_mode in run_modes: - run_mode_info = {} - run_mode_info["variables"] = project_df.config.get_run_mode(run_mode).variables - run_mode_info["name"] = run_mode - report["runmodes"].append(run_mode_info) - - # Table: mapping statistics - if isinstance(split_reads_read_type, str): - split_reads_read_type = [split_reads_read_type] - - for s in split_reads_read_type: - # assumes that the directory of s is the name of the mapping strategy - mappingstat = {'name': os.path.basename(os.path.dirname(s)), 'all_stats': generate_table_mapping_statistics(complete_data_root, s)} - report["mappingstats"].append(mappingstat) - - # Loading all dge data summaries - # Table: summarised metrics over beads - dge_summaries = {} - for run_mode in run_modes: - dge_summaries[run_mode] = {} - summarised_metrics = {"name": run_mode, "variables": None} - _dge_summary = load_dge_summary(dge_summary_paths[run_mode]) - dge_summaries[run_mode]["df"] = _dge_summary[0] - dge_summaries[run_mode]["adata"] = _dge_summary[1] - - obs_df = dge_summaries[run_mode]["df"] - metrics_beads = ( - obs_df[["n_genes_by_counts", "reads_per_counts", "n_reads", "total_counts"]] - .median() - .to_dict() - ) - metrics_beads["n_beads"] = len(obs_df) - metrics_beads["sum_reads"] = obs_df["n_reads"].sum() - summarised_metrics["variables"] = metrics_beads - report["summarisedmetrics"].append(summarised_metrics) - - # Plots - # 'Knee'-plot - for run_mode, dge_summary in dge_summaries.items(): - kneeplot = {"name": run_mode, "plot": None} - obs_df = dge_summary["df"] - fig, axis = plt.subplots(1, 1, figsize=(5, 3)) - axis.plot( - np.arange(len(obs_df)), np.cumsum(obs_df["n_reads"].values), color="black", linewidth=1 - ) - axis.set_ylabel("Cumulative\nsum of reads") - axis.set_xlabel("Beads sorted by number of reads") - axis.spines[["right", "top"]].set_visible(False) - plt.tight_layout() - kneeplot["plot"] = plot_to_base64(fig) - report["plots"]["kneeplot"].append(kneeplot) - - # UMI cutoff plots - for run_mode, dge_summary in dge_summaries.items(): - umicutoff = {"name": run_mode, "plot": None} - obs_df = dge_summary["df"] - fig, axes = plot_umi_cutoff(obs_df) - umicutoff["plot"] = plot_to_base64(fig) - report["plots"]["umicutoff"].append(umicutoff) - - # Histogram beads - for run_mode, dge_summary in dge_summaries.items(): - histogrambeads = {"name": run_mode, "plot": None} - obs_df = dge_summary["df"] - fig, axes = plot_histogram_beads(obs_df) - histogrambeads["plot"] = plot_to_base64(fig) - report["plots"]["histogrambeads"].append(histogrambeads) - - # TODO: - # Nucleotide distribution per beads - - # Shannon entropy - for run_mode, dge_summary in dge_summaries.items(): - shannonentropy = {"name": run_mode, "plot": None} - obs_df = dge_summary["df"] - fig, axes = plot_entropy_compression(obs_df) - shannonentropy["plot"] = plot_to_base64(fig) - report["plots"]["shannonentropy"].append(shannonentropy) - - # Return here if not spatial - if not is_spatial: - return report - - # Proceed with Spatial QC plots - # Variables for spatial - px_by_um, puck_bead_size = 1, 1 - x_breaks, y_breaks = None, None - x_mm_breaks, y_mm_breaks = None, None - puck_width_um = 1 - - for run_mode, dge_summary in dge_summaries.items(): - # Loading adata for scanpy plotting - adata = dge_summary["adata"] - adata.obsm["spatial"] = adata.obs[["x_pos", "y_pos"]].values - report["n_cells"] = len(adata) - - # Loading metadata from spatial pucks and run mode - puck_metrics = ( - project_df.get_puck_barcode_file_metrics( - project_id=project_id, - sample_id=sample_id, - puck_barcode_file_id=puck_barcode_file_id_qc, - ), - )[0] - - puck_settings = project_df.get_puck_variables( - project_id, sample_id, return_empty=True - ) - - run_mode_vars = project_df.config.get_run_mode(run_mode).variables - - px_by_um = puck_metrics["px_by_um"] - mesh_spot_diameter_um = run_mode_vars["mesh_spot_diameter_um"] - meshed = run_mode_vars["mesh_data"] - spot_diameter_um = puck_settings["spot_diameter_um"] - - # Set limits and axes units for the spatial plots - x_limits = adata.obsm["spatial"][:, 0].min(), adata.obsm["spatial"][:, 0].max() - y_limits = adata.obsm["spatial"][:, 1].min(), adata.obsm["spatial"][:, 1].max() - puck_width_um = (x_limits[1] - x_limits[0]) / px_by_um - - ratio = (x_limits[1] - x_limits[0]) / (y_limits[1] - y_limits[0]) - - scale_factor = 2 if puck_width_um < 3000 else 3 - mm_dist = max(10**scale_factor, round(puck_width_um / 3, scale_factor)) - mm_diff = mm_dist / 1000 - - def_plot_bead_size = 0.5 if report["n_cells"] > 5000 else 0.75 - def_plot_bead_size = 0.1 if report["n_cells"] > 10000 else def_plot_bead_size - def_plot_bead_size = 0.05 if report["n_cells"] > 25000 else def_plot_bead_size - - puck_bead_size = max( - def_plot_bead_size, mesh_spot_diameter_um if meshed else spot_diameter_um - ) - x_mm_breaks = np.arange(0, puck_width_um, mm_dist) - x_mm_breaks = [f"{round(i, 1)} mm" for i in x_mm_breaks * mm_diff / mm_dist] - y_mm_breaks = np.arange(0, puck_width_um / ratio, mm_dist) - y_mm_breaks = [f"{round(i, 1)} mm" for i in y_mm_breaks * mm_diff / mm_dist] - - x_breaks = np.arange(x_limits[0], x_limits[1], px_by_um * mm_dist) - y_breaks = np.arange(y_limits[0], y_limits[1], px_by_um * mm_dist) - - # Plot spatial metrics - spatialqc = {"name": run_mode, "plots": []} - for spatial_metric in SPATIAL_METRICS: - fig, axes = plot_spatial_qc_metric( - adata, - spatial_metric, - puck_bead_size=puck_bead_size, - px_by_um=px_by_um, - x_breaks=x_breaks, - y_breaks=y_breaks, - x_mm_breaks=x_mm_breaks, - y_mm_breaks=y_mm_breaks, - ) - spatialqc["plots"].append(plot_to_base64(fig)) - report["plots"]["spatialqc"].append(spatialqc) - - return report - - -def generate_html_report(data, template_file): - with open(template_file, "r") as template_data: - template_content = template_data.read() - template = Template(template_content) - - html_report = template.render(report=data) - - return html_report - - -@message_aggregation(logger_name) -def cmdline(): - """cmdline.""" - import argparse - - parser = argparse.ArgumentParser( - allow_abbrev=False, - description="generate spacemake's 'QC sequencing' with python", - ) - parser = setup_parser(parser) - - args = parser.parse_args() - template_file = os.path.join(absolute_path, "../templates/qc_sequencing.html") - - # TODO: implement dge_summary_paths - - report_metadata = generate_qc_sequencing_metadata( - args.project_id, - args.sample_id, - args.dge_summary_paths, - args.complete_data_root, - args.split_reads_read_type, - args.puck_barcode_file_id_qc, - args.is_spatial, - args.run_modes, - ) - - html_report = generate_html_report(report_metadata, template_file) - - with open(args.out_html_report, "w") as output: - output.write(html_report) - - -if __name__ == "__main__": - cmdline() From a180996a32402489110726ab6e77168729c3ce0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Le=C3=B3n-Perin=C3=A1n?= Date: Mon, 18 Sep 2023 11:09:41 +0200 Subject: [PATCH 05/20] add html report templates --- .gitignore | 1 + .../report/templates/automated_analysis.html | 1813 +++++++++++++++ spacemake/report/templates/qc_sequencing.html | 1964 +++++++++++++++++ 3 files changed, 3778 insertions(+) create mode 100644 spacemake/report/templates/automated_analysis.html create mode 100644 spacemake/report/templates/qc_sequencing.html diff --git a/.gitignore b/.gitignore index a7fcf1a3..f0ca30a0 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ samples.yaml *.png *.rds *.html +!spacemake/report/templates/*.html *.csv *.pdf *.txt.gz diff --git a/spacemake/report/templates/automated_analysis.html b/spacemake/report/templates/automated_analysis.html new file mode 100644 index 00000000..e65094a7 --- /dev/null +++ b/spacemake/report/templates/automated_analysis.html @@ -0,0 +1,1813 @@ + + + + + + + + + + + + + + spacemake {{ report.type }} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+

spacemake report

+
+
+
+
+
+ +
+

Automated analysis

+
+

Overview

+ + + + + + + {% for overview_var in report.table_overview %} + + + + + {% endfor %} + +
+ Data info +
+ {{ overview_var.name }} + + {{ overview_var.value }} +
+
+
+
+

Histogram of metrics over spatial units

+

+ +

Distribution of UMIs in 2D

+

Original data = original UMI counts

+

Scaled data = the top 10% of UMIs (ordered high to low) are set to the minimum UMI of the first 10% UMIs, for visualisation purposes.

+
+

Original

+

+
+
+

Scaled

+

+
+
+
+

Automated analysis with different clustering resolutions

+ {% if report.data_complete %} + {% for analyzed_clustering_resolution in report.analyzed_clustering_resolutions %} +
+

{{ analyzed_clustering_resolution.resolution }}

+

+ {% if report.is_spatial %} +

+

+ {% endif %} +
+ {% endfor %} + {% else %} + +

+ {% endif %} +
+ +
+
+
+ + + + + + + + + + + + + \ No newline at end of file diff --git a/spacemake/report/templates/qc_sequencing.html b/spacemake/report/templates/qc_sequencing.html new file mode 100644 index 00000000..45022294 --- /dev/null +++ b/spacemake/report/templates/qc_sequencing.html @@ -0,0 +1,1964 @@ + + + + + + + + + + + + + + spacemake {{ report.type }} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+

spacemake report

+
+
+
+
+
+ +

Sequencing QC

+
+

Overview

+
+

Sample info

+ + + {% for overview_var in report.sampleinfo %} + + + + + {% endfor %} + +
+ {{ overview_var }} + + {{ report.sampleinfo[overview_var] }} +
+
+
+

Run modes

+

This sample was processed using the following run modes, and run mode variables

+ {% for run_mode in report.runmodes %} + + + + + + + + + + + + + {% for var in run_mode.variables %} + + + + + {% endfor %} + +
+ +
+ Run modes +
+
+ + {{ run_mode.name }} +
+ {{ var }} + + {{ run_mode.variables[var] }} +
+ {% endfor %} +
+
+

Mapping statistics

+ {% for mapping_mode in report.mappingstats %} + + + + + + + + + + + + {% for mapping_mode_vars in mapping_mode.all_stats %} + + {% for var in mapping_mode_vars %} + + + + + {% endfor %} + + {% endfor %} + + + + + + + + +
+ +
+ Mapping mode +
+
+ + {{ mapping_mode.name }} +
+ {{ var }} + + {{ mapping_mode_vars[var] }} +
+ Note: +
+ All values, except avg_mapped_length, shown in millions +
+ {% endfor %} +
+
+

Summarised metrics over beads

+ {% for summarisedmetrics in report.summarisedmetrics %} + + + + + + + + + + + + + {% for var in summarisedmetrics.variables %} + + + + + {% endfor %} + +
+ +
+ Run modes +
+
+ + {{ summarisedmetrics.name }} +
+ {{ var }} + + {{ summarisedmetrics.variables[var] }} +
+ {% endfor %} +
+
+

QC plots

+

Each of the QC plots we show on a per run mode basis, to see if there are any downstream differences based on the run mode variable settings.

+
+

'Knee'-plot

+

Below we plot a so called ‘Knee-plot’: on the y-axis is the Cummulative sum of reads, on the x-axis are the bead barcodes sorted by number of reads. For single-cell samples, this plot tells you roughly how many beads are in the sample.

+ {% for kneeplot in report.plots.kneeplot %} +
+

{{ kneeplot.name }}

+

+
+ {% endfor %} +
+
+

UMI-cutoff plots

+ {% for umicutoff in report.plots.umicutoff %} +
+

{{ umicutoff.name }}

+

+
+ {% endfor %} +
+
+

Histogram of metrics over beads

+

Next we show mertics such as number of UMIs, genes, reads and pcr per physical spot. We further distinguish between each run mode, showing one histogram for each.

+ {% for histogrambeads in report.plots.histogrambeads %} +
+

{{ histogrambeads.name }}

+

+
+ {% endfor %} +
+
+

Nucleotide distribution per beads

+

Next we bin the data based on reads into quartile. For each run_mode the data is divided into 4 beads, by reads. This means, that the first bin will contain beads which account 25% of the reads, the second will contain beads which account for the second 25% of reads and so on.

+

For each run mode we plot the nucleotide distribution per quartile.

+

Only not-meshed run_mode(s) are shown

+ {% for nucleotidedistribution in report.plots.nucleotidedistribution %} +
+

{{ nucleotidedistribution.name }}

+

+
+ {% endfor %} +
+
+

Shannon entropy and string compression

+ {% for shannonentropy in report.plots.shannonentropy %} +
+

{{ shannonentropy.name }}

+

+
+ {% endfor %} +
+ {% if report.is_spatial %} +
+

Spatial QC

+ {% for spatialqc in report.plots.spatialqc %} +
+

{{ spatialqc.name }}

+ {% for plot_spatialqc in spatialqc.plots %} +

+ {% endfor %} +
+ {% endfor %} +
+ {% endif %} +
+
+
+ + + + + + + + + + + + + \ No newline at end of file From 64bbed5807add6d8ee0dc8727e9057cecfdc2fec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Le=C3=B3n-Perin=C3=A1n?= Date: Mon, 18 Sep 2023 11:10:03 +0200 Subject: [PATCH 06/20] add python-based qc_sequencing report generator --- spacemake/report/qc_sequencing.py | 885 ++++++++++++++++++++++++++++++ 1 file changed, 885 insertions(+) create mode 100644 spacemake/report/qc_sequencing.py diff --git a/spacemake/report/qc_sequencing.py b/spacemake/report/qc_sequencing.py new file mode 100644 index 00000000..d98f081f --- /dev/null +++ b/spacemake/report/qc_sequencing.py @@ -0,0 +1,885 @@ +import base64 +import io +import logging +import os +from typing import Union + +import anndata as ad +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import scanpy as sc +from jinja2 import Template +from scipy.sparse import csr_matrix +from spacemake.config import ConfigFile +from spacemake.project_df import ProjectDF +from spacemake.util import message_aggregation + +absolute_path = os.path.dirname(__file__) + +cpalette = { + "grey": "#999999", + "light_orange": "#E69F00", + "light_blue": "#56B4E9", + "green": "#009E73", + "yellow": "#F0E442", + "blue": "#0072B2", + "orange": "#D55E00", + "pink": "#CC79A7", +} + +clrs = { + "umis": cpalette["light_orange"], + "genes": cpalette["light_blue"], + "reads": cpalette["green"], + "pcr": cpalette["pink"], + "pct_counts_mt": "black", +} + +nucl_clrs = { + "A": "#F5C900", + "C": "#F55D59", + "T": "#3AA861", + "G": "#7772F5", + "N": "#999999", +} + +SAMPLEINFO_VARS = [ + "species", + "sequencing_date", + "investigator", + "experiment", +] +SPATIAL_METRICS = [ + "n_genes_by_counts", + "total_counts", + "pct_counts_mt", + "n_reads", + "reads_per_counts", + "n_joined", + "exact_entropy", + "exact_compression", +] +SPATIAL_METRICS_TITLES = { + "n_genes_by_counts": "# of genes per spatial unit", + "total_counts": "# of UMIs per spatial unit", + "pct_counts_mt": "# % mt counts per spatial unit", + "n_reads": "# of reads per spatial unit", + "reads_per_counts": "reads/UMI per spatial unit", + "n_joined": "# beads joined per spatial unit", + "exact_entropy": "Shannon entropy per spatial unit", + "exact_compression": "barcode length after compression per spatial unit", +} + +logger_name = "spacemake.report.qc_sequencing" +logger = logging.getLogger(logger_name) + + +def setup_parser(parser): + """ + Set up command-line arguments for the script. + + :param parser: Argument parser object. + :type parser: argparse.ArgumentParser + :returns: Updated argument parser object. + :rtype: argparse.ArgumentParser + """ + # These allow to get the run_mode_variables.* from the config file + # and pbf_metrics.px_by_um via project_df.puck_barcode_file_metrics + parser.add_argument( + "--project-id", + type=str, + help="the project_id in spacemake project_df.csv", + required=True, + ) + parser.add_argument( + "--sample-id", + type=str, + help="the sample_id in spacemake project_df.csv", + required=True, + ) + parser.add_argument( + "--puck-barcode-file-id-qc", + type=str, + help="a path to the puck_barcode_file_id_qc", + required=True, + ) + # These have the paths to the input files used for generating plots + parser.add_argument( + "--dge-summary-paths", + type=str, + nargs="+", + help="paths to the dge summary files, must be in same number and order as --run-modes", + required=True, + ) + parser.add_argument( + "--run-modes", + type=str, + nargs="+", + help="run modes of the sample for which the report will be generated", + required=False, + default=None, + ) + parser.add_argument( + "--complete-data-root", + type=str, + help="path to where the sample data is stored", + required=True, + ) + parser.add_argument( + "--split-reads-read-type", + type=str, + nargs="+", + help="path to the 'read_type_num' file(s) generated by read annotator. One per mapping strategy.", + required=True, + ) + # These additionally configure the plotting functionality + parser.add_argument( + "--is-spatial", + type=str, + help="Whether the current sample is spatial or not", + required=True, + choices=["True", "False"], + default="False", + ) + # This specifies where the output file will be generated + parser.add_argument( + "--out-html-report", + type=str, + help="where the HTML report will be saved", + required=True, + ) + + return parser + + +def reverse_readline(filename, buf_size=8192): + """A generator that returns the lines of a file in reverse order""" + with open(filename, "rb") as fh: + segment = None + offset = 0 + fh.seek(0, os.SEEK_END) + file_size = remaining_size = fh.tell() + while remaining_size > 0: + offset = min(file_size, offset + buf_size) + fh.seek(file_size - offset) + buffer = fh.read(min(remaining_size, buf_size)).decode(encoding="utf-8") + remaining_size -= buf_size + lines = buffer.split("\n") + # The first line of the buffer is probably not a complete line so + # we'll save it and append it to the last line of the next buffer + # we read + if segment is not None: + # If the previous chunk starts right from the beginning of line + # do not concat the segment to the last line of new chunk. + # Instead, yield the segment first + if buffer[-1] != "\n": + lines[-1] += segment + else: + yield segment + segment = lines[0] + for index in range(len(lines) - 1, 0, -1): + if lines[index]: + yield lines[index] + # Don't yield None if the file was empty + if segment is not None: + yield segment + + +def read_star_log_file(log_file): + file = os.path.join(log_file) + + if not os.path.isfile(file): + return (1, 1) + + star_stats = { + "input_reads": 0, + "uniq_mapped_reads": 0, + "multi_mapped_reads": 0, + "too_many_mapped_reads": 0, + "too_many_mapped_reads": 0, + "unmapped_too_short": 0, + "unmapped_other": 0, + "chimeric": 0, + } + + log_name_stat = { + "Number of input reads": "input_reads", + "Uniquely mapped reads number": "uniq_mapped_reads", + "Number of reads mapped to multiple loci": "multi_mapped_reads", + "Number of reads mapped to too many loci": "too_many_mapped_reads", + "Number of reads unmapped: too many mismatches": "unmapped_mismatches", + "Number of reads unmapped: too short": "unmapped_too_short", + "Number of reads unmapped: other": "unmapped_other", + "Number of chimeric reads": "chimeric", + "Average mapped length": "avg_mapped_length", + } + + with open(file) as fi: + idx = 0 + for line in fi: + _log_id = line.strip().split("|")[0].strip() + if _log_id in log_name_stat.keys(): + if _log_id == "Average mapped length": + star_stats[log_name_stat[_log_id]] = line.strip().split("|")[1] + else: + star_stats[log_name_stat[_log_id]] = round(int(line.strip().split("|")[1]) / 1e6, 2) + # Do not convert to millions the Average mapped length + + idx += 1 + + return star_stats + + +def read_split_reads_file(split_reads_file): + file = os.path.join(split_reads_file) + + read_types = dict() + with open(file) as fi: + for line in fi: + line = line.strip().split(" ") + read_types[line[0]] = round(int(line[1]) / 1e6, 2) + + return read_types + + +def read_bowtie_log_file(log_file): + file = os.path.join(log_file) + bowtie_stats = { + "input_reads": 0, + "unique_aligned": 0, + "multimapper": 0, + "unaligned": 0, + } + log_line_stat = { + 5: "input_reads", + 3: "unaligned", + 2: "unique_aligned", + 1: "multimapper", + } + max_line = max(log_line_stat.keys()) + + idx = 0 + for line in reverse_readline(file): + if idx in log_line_stat.keys(): + bowtie_stats[log_line_stat[idx]] = round(int(line.strip().split(" ")[0]) / 1e6, 2) + idx += 1 + if max_line < idx: + break + + return bowtie_stats + + +def plot_metric(values, axis, nbins=100, color="#000000"): + # decide linear or logarithmic scale + min_difference = values.max() - values.min() + + hist, bins = np.histogram(values, bins=nbins) + if np.abs(min_difference) < 100: + axis.bar(bins[:-1], hist, color=color) + + else: + logbins = np.logspace(np.log10(bins[0] + 1), np.log10(bins[-1]), nbins) + axis.hist(values, bins=logbins, color=color) + axis.set_xscale("log") + + axis.spines[["right", "top"]].set_visible(False) + + +def plot_umi_cutoff(obs_df): + umi_cutoffs = np.arange(10, 20000, 10) + + def summarise_dge_summary(df, umi_cutoff): + df_filter = df[df["total_counts"] > umi_cutoff] + + df_summary = df_filter[ + [ + "n_reads", + "total_counts", + "n_genes_by_counts", + "reads_per_counts", + ] + ].median() + df_summary["n_beads"] = len(df_filter) + + return df_summary + + umi_cutoff_data = pd.DataFrame( + np.vstack([summarise_dge_summary(obs_df, umi_cutoff).values for umi_cutoff in umi_cutoffs]), + columns=[ + "n_reads", + "total_counts", + "n_genes_by_counts", + "reads_per_counts", + "n_beads", + ], + ) + umi_cutoff_data["umi_cutoffs"] = umi_cutoffs + + fig, axes = plt.subplots(2, 3, figsize=(8, 4)) + axes[0, 0].plot( + umi_cutoff_data["umi_cutoffs"], + umi_cutoff_data["n_beads"], + color="black", + linewidth=1, + ) + axes[0, 0].set_ylabel("number of\nspatial units") + axes[0, 0].set_xlabel("minimum UMI") + axes[0, 0].set_yscale("log") + axes[0, 0].set_xscale("log") + axes[0, 0].spines[["right", "top"]].set_visible(False) + + axes[0, 1].plot( + umi_cutoff_data["umi_cutoffs"], + umi_cutoff_data["n_reads"], + color=clrs["reads"], + linewidth=1, + ) + axes[0, 1].set_ylabel("median reads\nper spatial unit") + axes[0, 1].set_xlabel("minimum UMI") + axes[0, 1].set_xscale("log") + axes[0, 1].spines[["right", "top"]].set_visible(False) + + axes[0, 2].plot( + umi_cutoff_data["umi_cutoffs"], + umi_cutoff_data["n_genes_by_counts"], + color=clrs["genes"], + linewidth=1, + ) + axes[0, 2].set_ylabel("median genes\nper spatial unit") + axes[0, 2].set_xlabel("minimum UMI") + axes[0, 2].set_xscale("log") + axes[0, 2].spines[["right", "top"]].set_visible(False) + + axes[1, 0].plot( + umi_cutoff_data["umi_cutoffs"], + umi_cutoff_data["total_counts"], + color=clrs["umis"], + linewidth=1, + ) + axes[1, 0].set_ylabel("median UMIs\nper spatial unit") + axes[1, 0].set_xlabel("minimum UMI") + axes[1, 0].set_xscale("log") + axes[1, 0].spines[["right", "top"]].set_visible(False) + + axes[1, 1].plot( + umi_cutoff_data["umi_cutoffs"], + umi_cutoff_data["reads_per_counts"], + color=clrs["pcr"], + linewidth=1, + ) + axes[1, 1].set_ylabel("median reads/UMI\nper spatial unit") + axes[1, 1].set_xlabel("minimum UMI") + axes[1, 1].set_xscale("log") + axes[1, 1].spines[["right", "top"]].set_visible(False) + axes[1, 2].axis("off") + + plt.tight_layout() + return fig, axes + + +def plot_histogram_beads(obs_df): + # One for each run mode + fig, axes = plt.subplots(2, 2, figsize=(7, 3.5)) + plot_metric(obs_df["n_reads"], axes[0, 0], 100, clrs["reads"]) + axes[0, 0].set_ylabel("# of\nspatial units") + axes[0, 0].set_xlabel("# of reads") + + reads_per_counts = obs_df["reads_per_counts"] + reads_per_counts = np.nan_to_num(reads_per_counts) + plot_metric(reads_per_counts, axes[0, 1], 100, clrs["pcr"]) + axes[0, 1].set_ylabel("# of\nspatial units") + axes[0, 1].set_xlabel("# of reads/UMI") + + plot_metric(obs_df["n_genes_by_counts"], axes[1, 0], 100, clrs["genes"]) + axes[1, 0].set_ylabel("# of\nspatial units") + axes[1, 0].set_xlabel("# of genes") + + plot_metric(obs_df["total_counts"], axes[1, 1], 100, clrs["umis"]) + axes[1, 1].set_ylabel("# of\nspatial units") + axes[1, 1].set_xlabel("# of UMIs") + + plt.tight_layout() + return fig, axes + + +def plot_nucleotide_distribution_beads(dge_summary): + dge_summary["reads_cumsum"] = dge_summary["n_reads"].cumsum() + dge_summary["quartile"] = pd.cut( + dge_summary["reads_cumsum"], + bins=4, + include_lowest=True, + labels=["Q1", "Q2", "Q3", "Q4"], + ) + dge_summary.drop(columns=["reads_cumsum"], inplace=True) + + # Create a dataframe to show the count of nucleotides/barcode + cell_bc_len = len(dge_summary["cell_bc"].iloc[0]) + nucls = dge_summary["cell_bc"].str.strip().apply(list).apply(pd.Series) + nucls = pd.concat([dge_summary[["cell_bc", "quartile"]], nucls], axis=1) + nucls = nucls.melt(id_vars=["cell_bc", "quartile"], var_name="pos", value_name="nucl") + nucls = nucls.groupby(["pos", "nucl", "quartile"]).size().reset_index(name="nucl_count") + nucls = nucls.pivot_table( + index=["pos", "nucl"], columns="quartile", values="nucl_count", fill_value=0 + ).reset_index() + lbl_df = dge_summary.groupby("quartile").size().reset_index(name="lbl") + lbl_df["lbl"] = lbl_df.apply(lambda row: f"{row['quartile']} (n={row['lbl']})", axis=1) + lbls = dict(zip(lbl_df["quartile"], lbl_df["lbl"])) + + # Create the plot + fig, axes = plt.subplots(2, 2, figsize=(8, 4)) + x = np.arange(1, cell_bc_len + 1) # the label locations + + for name, group in nucls.groupby("nucl"): + axes[0, 0].plot(x, group["Q1"], color=nucl_clrs[name], linewidth=2, label=name) + axes[0, 0].set_xlim(0.1, cell_bc_len + 1.5) + axes[0, 0].set_xticks([]) + axes[0, 0].set_title(lbls["Q1"]) + axes[0, 0].spines[["right", "top", "bottom"]].set_visible(False) + + axes[0, 1].plot(x, group["Q2"], color=nucl_clrs[name], linewidth=2) + axes[0, 1].set_xlim(0.1, cell_bc_len + 1.5) + axes[0, 1].set_xticks([]) + axes[0, 1].set_title(lbls["Q2"]) + axes[0, 1].spines[["right", "top", "bottom"]].set_visible(False) + + axes[1, 0].plot(x, group["Q3"], color=nucl_clrs[name], linewidth=2) + axes[1, 0].set_xlim(0.1, cell_bc_len + 1.5) + axes[1, 0].set_title(lbls["Q3"]) + axes[1, 0].spines[["right", "top"]].set_visible(False) + axes[1, 0].set_xticks(list(set(list(range(1, cell_bc_len + 1, 2)) + [cell_bc_len]))) + + axes[1, 1].plot(x, group["Q4"], color=nucl_clrs[name], linewidth=2) + axes[1, 1].set_xlim(0.1, cell_bc_len + 1.5) + axes[1, 1].set_title(lbls["Q4"]) + axes[1, 1].spines[["right", "top"]].set_visible(False) + axes[1, 1].set_xticks(list(set(list(range(1, cell_bc_len + 1, 2)) + [cell_bc_len]))) + + handles, labels = axes[0, 0].get_legend_handles_labels() + legend = fig.legend( + handles, + labels, + loc="center right", + bbox_to_anchor=(1.1, 0.5), + borderaxespad=0, + title="Nucleotide", + ) + legend.set_frame_on(False) + fig.text(0.5, 0.0, "nucleotide position in the barcode", ha="center") + plt.tight_layout() + + return fig, axes + + +def plot_entropy_compression(obs_df, nbins=30): + fig, axes = plt.subplots(2, 1, figsize=(7, 4)) + axes[0].hist( + obs_df["theoretical_entropy"].values, + color=cpalette["grey"], + bins=nbins, + edgecolor="black", + label="Theoretical", + ) + axes[0].hist( + obs_df["exact_entropy"].values, + color=cpalette["orange"], + bins=nbins, + edgecolor="black", + alpha=0.7, + label="Observed", + ) + axes[0].set_xlabel("Shannon entropy of barcodes") + axes[0].set_ylabel("# of barcodes") + axes[0].spines[["right", "top"]].set_visible(False) + legend0 = axes[0].legend(loc="upper left") + legend0.set_frame_on(False) + + axes[1].hist( + obs_df["theoretical_compression"].values, + color=cpalette["grey"], + bins=nbins, + edgecolor="black", + label="Theoretical", + ) + axes[1].hist( + obs_df["exact_compression"].values, + color=cpalette["orange"], + bins=nbins, + edgecolor="black", + alpha=0.7, + label="Observed", + ) + axes[1].set_xlabel("Length of barcodes after compression") + axes[1].set_ylabel("# of barcodes") + axes[1].spines[["right", "top"]].set_visible(False) + axes[1].set_xlim(left=0) + return fig, axes + + +def plot_spatial_qc_metric( + adata, + metric="total_counts", + puck_bead_size=1, + px_by_um=1, + x_breaks=None, + y_breaks=None, + x_mm_breaks=None, + y_mm_breaks=None, +): + fig, axes = plt.subplots(1, 1, figsize=(5, 5)) + sc.pl.spatial( + adata, + img_key=None, + size=puck_bead_size * 1.5, + spot_size=px_by_um, + color=metric, + ax=axes, + title=SPATIAL_METRICS_TITLES[metric], + show=False, + cmap="inferno", + ) + axes.spines[["right", "top"]].set_visible(False) + axes.set_xticks(x_breaks) + axes.set_xticklabels(x_mm_breaks) + axes.set_yticks(y_breaks) + axes.set_yticklabels(y_mm_breaks) + axes.set_ylabel("") + axes.set_xlabel("") + + return fig, axes + + +def plot_to_base64(fig): + my_stringIObytes = io.BytesIO() + fig.savefig(my_stringIObytes, format="jpg", dpi=300) + plt.close() + my_stringIObytes.seek(0) + return base64.b64encode(my_stringIObytes.read()).decode() + + +def generate_table_mapping_statistics(complete_data_root: str, split_reads_read_type: str): + # Initialize empty lists to store the filenames + bowtie_log_files = [] + star_log_files = [] + + # Iterate over the files in the folder + for filename in os.listdir(complete_data_root): + # Check if the file ends with .bam.log + if filename.endswith(".bam.log") and "final" not in filename: + bowtie_log_files.append(filename) + # Check if the file ends with .Log.final.out + elif filename.endswith(".Log.final.out") and filename != "star.Log.final.out": + star_log_files.append(filename) + + bowtie_logs = [] + for bowtie_log_file in bowtie_log_files: + bowtie_log = read_bowtie_log_file(os.path.join(complete_data_root, bowtie_log_file)) + bowtie_log["name"] = bowtie_log_file.split(".")[0] + bowtie_log["mapper"] = "bowtie2" + bowtie_logs.append(bowtie_log) + + star_logs = [] + for star_log_file in star_log_files: + star_log = read_star_log_file(os.path.join(complete_data_root, star_log_file)) + star_log["name"] = star_log_file.split(".")[1] + star_log["mapper"] = "STAR" + star_logs.append(star_log) + + # we sort the mapping statistics by the number of input reads, merge into a single list + all_logs = star_logs + bowtie_logs + + idx_log = np.argsort([log["input_reads"] for log in all_logs])[::-1] + all_logs = [all_logs[idx] for idx in idx_log] + + reads_type = read_split_reads_file(split_reads_read_type) + reads_type["name"] = "reads_type" + all_logs += [reads_type] + + return all_logs + + +def load_dge_summary(obs_df_path, with_adata=True): + obs_df = pd.read_csv(obs_df_path) + empty_ad = csr_matrix((len(obs_df), 1), dtype=int) + adata = ad.AnnData(empty_ad) + adata.obs = obs_df + if with_adata: + return obs_df, adata + else: + return obs_df + + +def generate_qc_sequencing_metadata( + project_id: str, + sample_id: str, + dge_summary_paths: Union[list, dict], # from snakemake_helper_funtions.get_qc_sheet_input_files + complete_data_root: str, + split_reads_read_type: Union[str, list], + puck_barcode_file_id_qc: str, + is_spatial: bool = False, + run_modes: list = None, +): + if isinstance(dge_summary_paths, str): + dge_summary_paths = [dge_summary_paths] + if isinstance(run_modes, str): + run_modes = [run_modes] + if ( + isinstance(run_modes, list) + and isinstance(dge_summary_paths, list) + and len(run_modes) == len(dge_summary_paths) + ): + dge_summary_paths = { + f'{run_mode}.dge_summary': dge_summary_path for run_mode, dge_summary_path in zip(run_modes, dge_summary_paths) + } + elif ( + isinstance(run_modes, list) + and isinstance(dge_summary_paths, list) + and len(run_modes) != len(dge_summary_paths) + ): + raise ValueError("'run_modes' and 'dge_summary_paths' must have the same length") + + report = { + "type": "qc_report", + "sampleinfo": [], + "runmodes": [], + "mappingstats": [], + "summarisedmetrics": [], + "is_spatial": is_spatial, + "plots": [], + } + main_plots = { + "kneeplot": [], + "umicutoff": [], + "histogrambeads": [], + "nucleotidedistribution": [], + "shannonentropy": [], + "spatialqc": [], + } + report["plots"] = main_plots + + # Loading project_df for metadata + config = ConfigFile.from_yaml("config.yaml") + project_df = ProjectDF("project_df.csv", config=config) + + # Table: sample info + sample_info = project_df.get_sample_info(project_id, sample_id) + report["sampleinfo"] = {} + report["sampleinfo"]["project_id"] = project_id + report["sampleinfo"]["sample_id"] = sample_id + report["sampleinfo"]["puck_barcode_file_id"] = puck_barcode_file_id_qc + report["sampleinfo"].update({svar: sample_info[svar] for svar in SAMPLEINFO_VARS}) + + # Table: run modes + if run_modes is None: + run_modes = sample_info["run_mode"] + + for run_mode in run_modes: + run_mode_info = {} + run_mode_info["variables"] = project_df.config.get_run_mode(run_mode).variables + run_mode_info["name"] = run_mode + report["runmodes"].append(run_mode_info) + + # Table: mapping statistics + if isinstance(split_reads_read_type, str): + split_reads_read_type = [split_reads_read_type] + + for s in split_reads_read_type: + # assumes that the directory of s is the name of the mapping strategy + mappingstat = { + "name": os.path.basename(os.path.dirname(s)), + "all_stats": generate_table_mapping_statistics(complete_data_root, s), + } + report["mappingstats"].append(mappingstat) + + # Loading all dge data summaries + # Table: summarised metrics over beads + dge_summaries = {} + for run_mode in run_modes: + dge_summaries[run_mode] = {} + summarised_metrics = {"name": run_mode, "variables": None} + _dge_summary = load_dge_summary(dge_summary_paths[f'{run_mode}.dge_summary']) + dge_summaries[run_mode]["df"] = _dge_summary[0] + dge_summaries[run_mode]["adata"] = _dge_summary[1] + + obs_df = dge_summaries[run_mode]["df"] + metrics_beads = obs_df[["n_genes_by_counts", "reads_per_counts", "n_reads", "total_counts"]].median().to_dict() + metrics_beads["n_beads"] = len(obs_df) + metrics_beads["sum_reads"] = obs_df["n_reads"].sum() + summarised_metrics["variables"] = metrics_beads + report["summarisedmetrics"].append(summarised_metrics) + + # Plots + # 'Knee'-plot + for run_mode, dge_summary in dge_summaries.items(): + kneeplot = {"name": run_mode, "plot": None} + obs_df = dge_summary["df"] + fig, axis = plt.subplots(1, 1, figsize=(5, 3)) + axis.plot( + np.arange(len(obs_df)), + np.cumsum(obs_df["n_reads"].values), + color="black", + linewidth=1, + ) + axis.set_ylabel("Cumulative\nsum of reads") + axis.set_xlabel("Beads sorted by number of reads") + axis.spines[["right", "top"]].set_visible(False) + plt.tight_layout() + kneeplot["plot"] = plot_to_base64(fig) + report["plots"]["kneeplot"].append(kneeplot) + + # UMI cutoff plots + for run_mode, dge_summary in dge_summaries.items(): + umicutoff = {"name": run_mode, "plot": None} + obs_df = dge_summary["df"] + fig, axes = plot_umi_cutoff(obs_df) + umicutoff["plot"] = plot_to_base64(fig) + report["plots"]["umicutoff"].append(umicutoff) + + # Histogram beads + for run_mode, dge_summary in dge_summaries.items(): + histogrambeads = {"name": run_mode, "plot": None} + obs_df = dge_summary["df"] + fig, axes = plot_histogram_beads(obs_df) + histogrambeads["plot"] = plot_to_base64(fig) + report["plots"]["histogrambeads"].append(histogrambeads) + + # Check if the run_mode is meshed + + # Nucleotide distribution per beads + for run_mode, dge_summary in dge_summaries.items(): + if not project_df.config.get_run_mode(run_mode).variables["mesh_data"]: + nucleotidedistribution = {"name": run_mode, "plot": None} + obs_df = dge_summary["df"] + fig, axes = plot_nucleotide_distribution_beads(obs_df) + nucleotidedistribution["plot"] = plot_to_base64(fig) + report["plots"]["nucleotidedistribution"].append(nucleotidedistribution) + + # Shannon entropy + for run_mode, dge_summary in dge_summaries.items(): + shannonentropy = {"name": run_mode, "plot": None} + obs_df = dge_summary["df"] + fig, axes = plot_entropy_compression(obs_df) + shannonentropy["plot"] = plot_to_base64(fig) + report["plots"]["shannonentropy"].append(shannonentropy) + + # Return here if not spatial + if not is_spatial: + return report + + # Proceed with Spatial QC plots + # Variables for spatial + px_by_um, puck_bead_size = 1, 1 + x_breaks, y_breaks = None, None + x_mm_breaks, y_mm_breaks = None, None + puck_width_um = 1 + + for run_mode, dge_summary in dge_summaries.items(): + # Loading adata for scanpy plotting + adata = dge_summary["adata"] + adata.obsm["spatial"] = adata.obs[["x_pos", "y_pos"]].values + report["n_cells"] = len(adata) + + # Loading metadata from spatial pucks and run mode + puck_metrics = ( + project_df.get_puck_barcode_file_metrics( + project_id=project_id, + sample_id=sample_id, + puck_barcode_file_id=puck_barcode_file_id_qc, + ), + )[0] + + puck_settings = project_df.get_puck_variables(project_id, sample_id, return_empty=True) + + run_mode_vars = project_df.config.get_run_mode(run_mode).variables + + px_by_um = puck_metrics["px_by_um"] + mesh_spot_diameter_um = run_mode_vars["mesh_spot_diameter_um"] + meshed = run_mode_vars["mesh_data"] + spot_diameter_um = puck_settings["spot_diameter_um"] + + # Set limits and axes units for the spatial plots + x_limits = adata.obsm["spatial"][:, 0].min(), adata.obsm["spatial"][:, 0].max() + y_limits = adata.obsm["spatial"][:, 1].min(), adata.obsm["spatial"][:, 1].max() + puck_width_um = (x_limits[1] - x_limits[0]) / px_by_um + + ratio = (x_limits[1] - x_limits[0]) / (y_limits[1] - y_limits[0]) + + scale_factor = 2 if puck_width_um < 3000 else 3 + mm_dist = max(10**scale_factor, round(puck_width_um / 3, scale_factor)) + mm_diff = mm_dist / 1000 + + def_plot_bead_size = 0.5 if report["n_cells"] > 5000 else 0.75 + def_plot_bead_size = 0.1 if report["n_cells"] > 10000 else def_plot_bead_size + def_plot_bead_size = 0.05 if report["n_cells"] > 25000 else def_plot_bead_size + + puck_bead_size = max(def_plot_bead_size, mesh_spot_diameter_um if meshed else spot_diameter_um) + x_mm_breaks = np.arange(0, puck_width_um, mm_dist) + x_mm_breaks = [f"{round(i, 1)} mm" for i in x_mm_breaks * mm_diff / mm_dist] + y_mm_breaks = np.arange(0, puck_width_um / ratio, mm_dist) + y_mm_breaks = [f"{round(i, 1)} mm" for i in y_mm_breaks * mm_diff / mm_dist] + + x_breaks = np.arange(x_limits[0], x_limits[1], px_by_um * mm_dist) + y_breaks = np.arange(y_limits[0], y_limits[1], px_by_um * mm_dist) + + # Plot spatial metrics + spatialqc = {"name": run_mode, "plots": []} + for spatial_metric in SPATIAL_METRICS: + fig, axes = plot_spatial_qc_metric( + adata, + spatial_metric, + puck_bead_size=puck_bead_size, + px_by_um=px_by_um, + x_breaks=x_breaks, + y_breaks=y_breaks, + x_mm_breaks=x_mm_breaks, + y_mm_breaks=y_mm_breaks, + ) + spatialqc["plots"].append(plot_to_base64(fig)) + report["plots"]["spatialqc"].append(spatialqc) + + return report + + +def generate_html_report(data, template_file): + with open(template_file, "r") as template_data: + template_content = template_data.read() + template = Template(template_content) + + html_report = template.render(report=data) + + return html_report + + +@message_aggregation(logger_name) +def cmdline(): + """cmdline.""" + import argparse + + parser = argparse.ArgumentParser( + allow_abbrev=False, + description="generate spacemake's 'QC sequencing' with python", + ) + parser = setup_parser(parser) + + args = parser.parse_args() + template_file = os.path.join(absolute_path, "templates/qc_sequencing.html") + + report_metadata = generate_qc_sequencing_metadata( + args.project_id, + args.sample_id, + args.dge_summary_paths, + args.complete_data_root, + args.split_reads_read_type, + args.puck_barcode_file_id_qc, + args.is_spatial, + args.run_modes, + ) + + html_report = generate_html_report(report_metadata, template_file) + + with open(args.out_html_report, "w") as output: + output.write(html_report) + + +if __name__ == "__main__": + cmdline() From 28e0d979987eb40a6586cba4eeed5cf533d999e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Le=C3=B3n-Perin=C3=A1n?= Date: Mon, 18 Sep 2023 11:10:17 +0200 Subject: [PATCH 07/20] add python-based automated_analysis report --- spacemake/report/automated_analysis.py | 470 +++++++++++++++++++++++++ 1 file changed, 470 insertions(+) create mode 100644 spacemake/report/automated_analysis.py diff --git a/spacemake/report/automated_analysis.py b/spacemake/report/automated_analysis.py new file mode 100644 index 00000000..722a7308 --- /dev/null +++ b/spacemake/report/automated_analysis.py @@ -0,0 +1,470 @@ +import base64 +import io +import logging +import os + +import anndata as ad +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import scanpy as sc +from jinja2 import Template +from scipy.sparse import csr_matrix +from spacemake.config import ConfigFile +from spacemake.project_df import ProjectDF +from spacemake.util import message_aggregation + +absolute_path = os.path.dirname(__file__) + +cpalette = { + "grey": "#999999", + "light_orange": "#E69F00", + "light_blue": "#56B4E9", + "green": "#009E73", + "yellow": "#F0E442", + "blue": "#0072B2", + "orange": "#D55E00", + "pink": "#CC79A7", +} + +clrs = { + "umis": cpalette["light_orange"], + "genes": cpalette["light_blue"], + "reads": cpalette["green"], + "pcr": cpalette["pink"], + "pct_counts_mt": "black", +} + + +logger_name = "spacemake.report.automated_analysis" +logger = logging.getLogger(logger_name) + + +def setup_parser(parser): + """ + Set up command-line arguments for the script. + + :param parser: Argument parser object. + :type parser: argparse.ArgumentParser + :returns: Updated argument parser object. + :rtype: argparse.ArgumentParser + """ + # These allow to get the run_mode_variables.* from the config file + # and pbf_metrics.px_by_um via project_df.puck_barcode_file_metrics + parser.add_argument( + "--project-id", + type=str, + help="the project_id in spacemake project_df.csv", + required=True, + ) + parser.add_argument( + "--sample-id", + type=str, + help="the sample_id in spacemake project_df.csv", + required=True, + ) + parser.add_argument( + "--run-mode", + type=str, + help="one of the run_mode from the project+sample in project_df.csv", + required=True, + ) + parser.add_argument( + "--puck-barcode-file-id-qc", + type=str, + help="a path to the puck_barcode_file_id_qc", + required=True, + ) + # These have the paths to the input files used for generating plots + parser.add_argument( + "--obs-df", + type=str, + help="path to the csv file that stores properties per observation (per cell)", + required=True, + ) + parser.add_argument( + "--var-df", + type=str, + help="path to the csv file that stores properties per variable (per gene)", + required=True, + ) + parser.add_argument( + "--neighborhood-enrichment", + type=str, + help="path to the csv file with the neighborhood enrichment results", + required=False, + default="", + ) + # These additionally configure the plotting functionality + parser.add_argument( + "--is-spatial", + type=str, + help="Whether the current sample is spatial or not", + required=True, + choices=["True", "False"], + default="False", + ) + # This specifies the UMI filter for the current report + parser.add_argument( + "--umi-cutoff", + type=str, + help="one of the UMI cutoffs from the --run-mode", + required=True, + ) + # This specifies where the output file will be generated + parser.add_argument( + "--out-html-report", + type=str, + help="where the HTML report will be saved", + required=True, + ) + + return parser + + +def plot_metric(values, axis, nbins=100, color="#000000"): + # decide linear or logarithmic scale + min_difference = values.max() - values.min() + + hist, bins = np.histogram(values, bins=nbins) + if np.abs(min_difference) < 100: + axis.bar(bins[:-1], hist, color=color) + + else: + logbins = np.logspace(np.log10(bins[0] + 1), np.log10(bins[-1]), nbins) + axis.hist(values, bins=logbins, color=color) + axis.set_xscale("log") + + axis.spines[["right", "top"]].set_visible(False) + + +def plot_to_base64(fig): + my_stringIObytes = io.BytesIO() + fig.savefig(my_stringIObytes, format="jpg") + my_stringIObytes.seek(0) + return base64.b64encode(my_stringIObytes.read()).decode() + + +def generate_automated_analysis_metadata( + project_id: str, + sample_id: str, + run_mode: str, + puck_barcode_file_id_qc: str, + obs_df_path: str, + var_df_path: str, + neighborhood_enrichment: str, + is_spatial: bool = False, + umi_cutoff: int = 0, +): + report = { + "table_overview": [], + "data_complete": False, + "n_cells": 0, + "umi_cutoff": 0, + "n_genes": 0, + "analyzed_clustering_resolutions": [], + "is_spatial": False, + "plots": [], + } + main_plots = { + "histogram_metrics_spatial_units": "", + "umi_spatial_unit_original": "", + "umi_spatial_unit_scaled": "", + } + + # Loading data + obs_df = pd.read_csv(obs_df_path) + var_df = pd.read_csv(var_df_path) + + # Loading project_df for metadata + config = ConfigFile.from_yaml("config.yaml") + project_df = ProjectDF("project_df.csv", config=config) + + # Create anndata for scanpy plotting + empty_ad = csr_matrix((len(obs_df), len(var_df)), dtype=int) + adata = ad.AnnData(empty_ad) + adata.obs = obs_df + adata.var = var_df + adata.obsm["X_umap"] = adata.obs[["umap_0", "umap_1"]].values + + report["n_genes"] = len(var_df) + report["n_cells"] = len(obs_df) + + report["umi_cutoff"] = umi_cutoff + report["data_complete"] = True if np.any(obs_df.columns.str.startswith("leiden")) else False + report["is_spatial"] = is_spatial + + report["plots"] = main_plots + + # Plot Histogram of metrics over spatial units + fig, axes = plt.subplots(3, 2, figsize=(7, 6)) + plot_metric(obs_df["n_reads"], axes[0, 0], 100, clrs["reads"]) + axes[0, 0].set_ylabel("# of\nspatial units") + axes[0, 0].set_xlabel("# of reads") + plot_metric(obs_df["reads_per_counts"], axes[0, 1], 100, clrs["pcr"]) + axes[0, 1].set_ylabel("# of\nspatial units") + axes[0, 1].set_xlabel("# of reads/UMI") + plot_metric(obs_df["n_genes_by_counts"], axes[1, 0], 100, clrs["genes"]) + axes[1, 0].set_ylabel("# of\nspatial units") + axes[1, 0].set_xlabel("# of genes") + plot_metric(obs_df["total_counts"], axes[1, 1], 100, clrs["umis"]) + axes[1, 1].set_ylabel("# of\nspatial units") + axes[1, 1].set_xlabel("# of UMIs") + plot_metric(obs_df["pct_counts_mt"], axes[2, 0], 100, clrs["pct_counts_mt"]) + axes[2, 0].set_ylabel("# of\nspatial units") + axes[2, 0].set_xlabel("% of mitochondrial counts") + axes[2, 1].axis("off") + plt.tight_layout() + main_plots["histogram_metrics_spatial_units"] = plot_to_base64(fig) + + # Variables for spatial + px_by_um, puck_bead_size = 1, 1 + x_breaks, y_breaks = None, None + x_mm_breaks, y_mm_breaks = None, None + puck_width_um = 1 + + # Distribution of UMIs in 2D + if is_spatial: + # Loading metadata from spatial pucks and run mode + puck_metrics = ( + project_df.get_puck_barcode_file_metrics( + project_id=project_id, + sample_id=sample_id, + puck_barcode_file_id=puck_barcode_file_id_qc, + ), + )[0] + + puck_settings = project_df.get_puck_variables( + project_id, sample_id, return_empty=True + ) + + run_mode_vars = project_df.config.get_run_mode(run_mode).variables + + print(puck_metrics) + print(puck_settings) + print(run_mode_vars) + + px_by_um = puck_metrics["px_by_um"] + mesh_spot_diameter_um = run_mode_vars["mesh_spot_diameter_um"] + meshed = run_mode_vars["mesh_data"] + spot_diameter_um = puck_settings["spot_diameter_um"] + + adata.obsm["spatial"] = adata.obs[["x_pos", "y_pos"]].values + + # Set limits and axes units for the spatial plots + x_limits = adata.obsm["spatial"][:, 0].min(), adata.obsm["spatial"][:, 0].max() + y_limits = adata.obsm["spatial"][:, 1].min(), adata.obsm["spatial"][:, 1].max() + puck_width_um = (x_limits[1] - x_limits[0]) / px_by_um + + ratio = (x_limits[1] - x_limits[0]) / (y_limits[1] - y_limits[0]) + + scale_factor = 2 if puck_width_um < 3000 else 3 + mm_dist = max(10**scale_factor, round(puck_width_um / 3, scale_factor)) + mm_diff = mm_dist / 1000 + + def_plot_bead_size = 0.5 if report["n_cells"] > 5000 else 0.75 + def_plot_bead_size = 0.1 if report["n_cells"] > 10000 else def_plot_bead_size + def_plot_bead_size = 0.05 if report["n_cells"] > 25000 else def_plot_bead_size + + puck_bead_size = max( + def_plot_bead_size, mesh_spot_diameter_um if meshed else spot_diameter_um + ) + x_mm_breaks = np.arange(0, puck_width_um, mm_dist) + x_mm_breaks = [f"{round(i, 1)} mm" for i in x_mm_breaks * mm_diff / mm_dist] + y_mm_breaks = np.arange(0, puck_width_um / ratio, mm_dist) + y_mm_breaks = [f"{round(i, 1)} mm" for i in y_mm_breaks * mm_diff / mm_dist] + + x_breaks = np.arange(x_limits[0], x_limits[1], px_by_um * mm_dist) + y_breaks = np.arange(y_limits[0], y_limits[1], px_by_um * mm_dist) + + # Plot spatial UMI (original) + fig, axes = plt.subplots(1, 1, figsize=(5, 5)) + sc.pl.spatial( + adata, + img_key=None, + size=puck_bead_size * 1.5, + spot_size=px_by_um, + color="total_counts", + ax=axes, + show=False, + cmap="inferno", + ) + axes.spines[["right", "top"]].set_visible(False) + axes.set_xticks(x_breaks) + axes.set_xticklabels(x_mm_breaks) + axes.set_yticks(y_breaks) + axes.set_yticklabels(y_mm_breaks) + axes.set_ylabel("") + axes.set_xlabel("") + + main_plots["umi_spatial_unit_original"] = plot_to_base64(fig) + + # Plot spatial UMI (rescaled) + + fig, axes = plt.subplots(1, 1, figsize=(5, 5)) + sc.pl.spatial( + adata, + img_key=None, + size=puck_bead_size * 1.5, + spot_size=px_by_um, + color="total_counts", + ax=axes, + show=False, + cmap="inferno", + vmax=np.quantile(adata.obs["total_counts"], 0.9), + ) + axes.spines[["right", "top"]].set_visible(False) + axes.set_xticks(x_breaks) + axes.set_xticklabels(x_mm_breaks) + axes.set_yticks(y_breaks) + axes.set_yticklabels(y_mm_breaks) + axes.set_ylabel("") + axes.set_xlabel("") + + main_plots["umi_spatial_unit_scaled"] = plot_to_base64(fig) + + if report["data_complete"]: + resolutions = np.unique( + pd.Series(obs_df.columns[obs_df.columns.str.startswith("leiden")]) + .apply(lambda x: x.split("_")[1]) + .values + ) + for i, resolution in enumerate(resolutions): + _current_analyzed_clustering_resolution = { + "resolution": "", + "resolution_id": "", + "plots": {"umap": "", "spatial": "", "neighbor_enrichment": ""}, + } + + _current_analyzed_clustering_resolution['resolution'] = f'{resolution} resolution' + _current_analyzed_clustering_resolution['resolution_id'] = f'rid_{i}' + + # Plot UMAP for current resolution + adata.obs[f"leiden_{resolution}"] = pd.Categorical( + adata.obs[f"leiden_{resolution}"] + ) + fig, axes = plt.subplots(1, 1, figsize=(8, 5)) + sc.pl.umap(adata, color=f"leiden_{resolution}", ax=axes, show=False) + axes.spines[["right", "top", "left", "bottom"]].set_visible(False) + axes.set_ylabel("UMAP 0") + axes.set_xlabel("UMAP 1") + plt.tight_layout() + _current_analyzed_clustering_resolution["plots"]["umap"] = plot_to_base64( + fig + ) + + # Spatial plot and neighborhood enrichment (if spatial) + if is_spatial: + # Plot Spatial clusters + fig, axes = plt.subplots(1, 1, figsize=(8, 5)) + sc.pl.spatial( + adata, + img_key=None, + size=puck_bead_size * 1.5, + spot_size=px_by_um, + color=f"leiden_{resolution}", + ax=axes, + show=False, + ) # marker can be "o" or "h" if is hexagonal + axes.spines[["right", "top"]].set_visible(False) + axes.set_xticks(x_breaks) + axes.set_xticklabels(x_mm_breaks) + axes.set_yticks(y_breaks) + axes.set_yticklabels(y_mm_breaks) + axes.set_ylabel("") + axes.set_xlabel("") + plt.tight_layout() + _current_analyzed_clustering_resolution["plots"][ + "spatial" + ] = plot_to_base64(fig) + + # Plot Neighborhood enrichment + nhood_enrich = pd.read_csv(neighborhood_enrichment) + nhood_enrich_current_res = nhood_enrich[ + nhood_enrich["resolution"].astype(str) == str(resolution) + ] + ne_data = np.zeros( + ( + nhood_enrich_current_res["cluster_a"].max() + 1, + nhood_enrich_current_res["cluster_b"].max() + 1, + ) + ) + ne_data[ + nhood_enrich_current_res["cluster_a"], + nhood_enrich_current_res["cluster_b"], + ] = nhood_enrich_current_res["zscore"] + + fig, axes = plt.subplots(1, 1, figsize=(4, 4)) + plmat = axes.matshow( + ne_data, cmap="inferno", vmin=-50, vmax=100, origin="lower" + ) + cbar = plt.colorbar(plmat, fraction=0.046) + cbar.set_label("Neighbor enrichment score") + axes.set_xlabel("cluster identity") + axes.set_ylabel("cluster identity") + axes.spines[["right", "top"]].set_visible(False) + axes.xaxis.tick_bottom() + plt.tight_layout() + _current_analyzed_clustering_resolution["plots"][ + "neighbor_enrichment" + ] = plot_to_base64(fig) + + report["analyzed_clustering_resolutions"].append( + _current_analyzed_clustering_resolution.copy() + ) + + report["table_overview"] = [ + {"name": "UMI filter", "value": umi_cutoff}, + {"name": "# genes in data", "value": report["n_genes"]}, + {"name": "# of spots in data", "value": report["n_cells"]}, + {"name": "median UMI", "value": obs_df["total_counts"].median()}, + {"name": "median genes", "value": obs_df["n_genes_by_counts"].median()}, + {"name": "puck width (um)", "value": puck_width_um}, + ] + + return report + + +def generate_html_report(data, template_file): + with open(template_file, "r") as template_data: + template_content = template_data.read() + template = Template(template_content) + + html_report = template.render(report=data) + + return html_report + + +@message_aggregation(logger_name) +def cmdline(): + """cmdline.""" + import argparse + + parser = argparse.ArgumentParser( + allow_abbrev=False, + description="generate spacemake's 'automated analysis' with python", + ) + parser = setup_parser(parser) + + args = parser.parse_args() + template_file = os.path.join(absolute_path, "templates/automated_analysis.html") + + report_metadata = generate_automated_analysis_metadata( + args.project_id, + args.sample_id, + args.run_mode, + args.puck_barcode_file_id_qc, + args.obs_df, + args.var_df, + args.neighborhood_enrichment, + args.is_spatial, + args.umi_cutoff, + ) + html_report = generate_html_report(report_metadata, template_file) + + with open(args.out_html_report, "w") as output: + output.write(html_report) + + +if __name__ == "__main__": + cmdline() \ No newline at end of file From d351b7efaca592365d50e4f82fab87fa0a5a6ff1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Le=C3=B3n-Perin=C3=A1n?= Date: Mon, 18 Sep 2023 11:10:32 +0200 Subject: [PATCH 08/20] modify main.smk for python-based report generation --- spacemake/snakemake/main.smk | 38 ++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/spacemake/snakemake/main.smk b/spacemake/snakemake/main.smk index 52bf30ba..f957ba54 100644 --- a/spacemake/snakemake/main.smk +++ b/spacemake/snakemake/main.smk @@ -581,17 +581,7 @@ rule puck_collection_stitching: rule create_qc_sheet: input: unpack(get_qc_sheet_input_files), - ribo_log=parsed_ribo_depletion_log params: - sample_info = lambda wildcards: project_df.get_sample_info( - wildcards.project_id, wildcards.sample_id), - puck_variables = lambda wildcards: - project_df.get_puck_variables(wildcards.project_id, wildcards.sample_id, - return_empty=True), - pbf_metrics = lambda wildcards: project_df.get_puck_barcode_file_metrics( - project_id = wildcards.project_id, - sample_id = wildcards.sample_id, - puck_barcode_file_id = wildcards.puck_barcode_file_id_qc), is_spatial = lambda wildcards: project_df.is_spatial(wildcards.project_id, wildcards.sample_id, puck_barcode_file_id=wildcards.puck_barcode_file_id_qc), @@ -599,8 +589,26 @@ rule create_qc_sheet: wildcards.project_id, wildcards.sample_id) output: qc_sheet - script: - "scripts/qc_sequencing_create_sheet.Rmd" + run: + from spacemake.report.automated_analysis import generate_automated_analysis_metadata + + template_file = os.path.join(spacemake_dir, "report/templates/qc_sequencing.html") + + report_metadata = generate_qc_sequencing_metadata( + wildcards.project_id, + wildcards.sample_id, + input, + complete_data_root, + input['reads_type_out'], + wildcards.puck_barcode_file_id_qc, + parama['is_spatial'], + params['run_modes'], + ) + + html_report = generate_html_report(report_metadata, template_file) + + with open(output[0], "w") as output: + output.write(html_report) rule run_automated_analysis: input: @@ -662,7 +670,7 @@ rule create_automated_report: project_df.is_spatial(wildcards.project_id, wildcards.sample_id, puck_barcode_file_id=wildcards.puck_barcode_file_id_qc), run: - from spacemake.report.classes.automated_analysis import generate_automat + from spacemake.report.automated_analysis import generate_automated_analysis_metadata template_file = os.path.join(spacemake_dir, "report/templates/automated_analysis.html") @@ -671,8 +679,8 @@ rule create_automated_report: wildcards.sample_id, wildcards.run_mode, wildcards.puck_barcode_file_id_qc, - input['obs_df']', - input['var_df']', + input['obs_df'], + input['var_df'], input['nhood_enrichment'], params['is_spatial'], wildcards.umi_cutoff, From 7d2965af39716759cdf22bf19861007d5b04f8fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Le=C3=B3n-Perin=C3=A1n?= Date: Mon, 18 Sep 2023 11:12:31 +0200 Subject: [PATCH 09/20] add report html template files to setup.cfg --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 62112c68..c66824f6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -32,6 +32,7 @@ spacemake = data/*.csv config/*.yaml longread/*.py + report/templates/*.html # [options.data_files] # snakemake = spacemake/snakemake/dropseq.smk From 7065bf856885d9c3b2eec22030a0e0770e861837 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Le=C3=B3n-Perin=C3=A1n?= Date: Mon, 18 Sep 2023 11:19:53 +0200 Subject: [PATCH 10/20] dont plot spatial variables not in dge_summary --- spacemake/report/qc_sequencing.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacemake/report/qc_sequencing.py b/spacemake/report/qc_sequencing.py index d98f081f..3ac8b002 100644 --- a/spacemake/report/qc_sequencing.py +++ b/spacemake/report/qc_sequencing.py @@ -824,6 +824,8 @@ def generate_qc_sequencing_metadata( # Plot spatial metrics spatialqc = {"name": run_mode, "plots": []} for spatial_metric in SPATIAL_METRICS: + if spatial_metric not in adata.obs.columns: + continue fig, axes = plot_spatial_qc_metric( adata, spatial_metric, From d70952fd980afeaf39af60a89a5069ff11eaa68e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Le=C3=B3n-Perin=C3=A1n?= Date: Mon, 18 Sep 2023 11:20:30 +0200 Subject: [PATCH 11/20] reads to millions --- spacemake/report/qc_sequencing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacemake/report/qc_sequencing.py b/spacemake/report/qc_sequencing.py index 3ac8b002..978104a1 100644 --- a/spacemake/report/qc_sequencing.py +++ b/spacemake/report/qc_sequencing.py @@ -704,7 +704,7 @@ def generate_qc_sequencing_metadata( obs_df = dge_summaries[run_mode]["df"] metrics_beads = obs_df[["n_genes_by_counts", "reads_per_counts", "n_reads", "total_counts"]].median().to_dict() metrics_beads["n_beads"] = len(obs_df) - metrics_beads["sum_reads"] = obs_df["n_reads"].sum() + metrics_beads["sum_reads"] = f'{round(obs_df["n_reads"].sum()/1e6, 2)} (1e6)' summarised_metrics["variables"] = metrics_beads report["summarisedmetrics"].append(summarised_metrics) From c04d03bc64fde1780d165f2c96f29dda72aa6368 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Le=C3=B3n-Perin=C3=A1n?= Date: Mon, 18 Sep 2023 11:31:15 +0200 Subject: [PATCH 12/20] bool argument parsing --- spacemake/report/qc_sequencing.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/spacemake/report/qc_sequencing.py b/spacemake/report/qc_sequencing.py index 978104a1..e955adca 100644 --- a/spacemake/report/qc_sequencing.py +++ b/spacemake/report/qc_sequencing.py @@ -71,6 +71,13 @@ "exact_compression": "barcode length after compression per spatial unit", } + +STRTOBOOL = { + "False": False, + "True": True +} + + logger_name = "spacemake.report.qc_sequencing" logger = logging.getLogger(logger_name) @@ -600,6 +607,7 @@ def generate_table_mapping_statistics(complete_data_root: str, split_reads_read_ def load_dge_summary(obs_df_path, with_adata=True): obs_df = pd.read_csv(obs_df_path) + obs_df = obs_df[obs_df['cell_bc'].str.contains('^[0-9]+|^[ACTGAN]+$', regex=True)] empty_ad = csr_matrix((len(obs_df), 1), dtype=int) adata = ad.AnnData(empty_ad) adata.obs = obs_df @@ -873,7 +881,7 @@ def cmdline(): args.complete_data_root, args.split_reads_read_type, args.puck_barcode_file_id_qc, - args.is_spatial, + STRTOBOOL[args.is_spatial], args.run_modes, ) From 5b3c1ba4c152056fd5022ca193bad4258220363c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Le=C3=B3n-Perin=C3=A1n?= Date: Mon, 18 Sep 2023 11:51:03 +0200 Subject: [PATCH 13/20] spatial qc: colormap to magma, vmax=0.9 quantile --- spacemake/report/automated_analysis.py | 10 +++------- spacemake/report/qc_sequencing.py | 3 ++- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/spacemake/report/automated_analysis.py b/spacemake/report/automated_analysis.py index 722a7308..38b5ff11 100644 --- a/spacemake/report/automated_analysis.py +++ b/spacemake/report/automated_analysis.py @@ -240,10 +240,6 @@ def generate_automated_analysis_metadata( run_mode_vars = project_df.config.get_run_mode(run_mode).variables - print(puck_metrics) - print(puck_settings) - print(run_mode_vars) - px_by_um = puck_metrics["px_by_um"] mesh_spot_diameter_um = run_mode_vars["mesh_spot_diameter_um"] meshed = run_mode_vars["mesh_data"] @@ -287,7 +283,7 @@ def generate_automated_analysis_metadata( color="total_counts", ax=axes, show=False, - cmap="inferno", + cmap="magma", ) axes.spines[["right", "top"]].set_visible(False) axes.set_xticks(x_breaks) @@ -310,7 +306,7 @@ def generate_automated_analysis_metadata( color="total_counts", ax=axes, show=False, - cmap="inferno", + cmap="magma", vmax=np.quantile(adata.obs["total_counts"], 0.9), ) axes.spines[["right", "top"]].set_visible(False) @@ -396,7 +392,7 @@ def generate_automated_analysis_metadata( fig, axes = plt.subplots(1, 1, figsize=(4, 4)) plmat = axes.matshow( - ne_data, cmap="inferno", vmin=-50, vmax=100, origin="lower" + ne_data, cmap="magma", vmin=-50, vmax=100, origin="lower" ) cbar = plt.colorbar(plmat, fraction=0.046) cbar.set_label("Neighbor enrichment score") diff --git a/spacemake/report/qc_sequencing.py b/spacemake/report/qc_sequencing.py index e955adca..412a7b68 100644 --- a/spacemake/report/qc_sequencing.py +++ b/spacemake/report/qc_sequencing.py @@ -543,7 +543,8 @@ def plot_spatial_qc_metric( ax=axes, title=SPATIAL_METRICS_TITLES[metric], show=False, - cmap="inferno", + vmax=np.quantile(adata.obs[metric], 0.9), + cmap="magma", ) axes.spines[["right", "top"]].set_visible(False) axes.set_xticks(x_breaks) From 29ded8cecab6409e5ab16c5dca01572a081ee43a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Le=C3=B3n-Perin=C3=A1n?= Date: Mon, 18 Sep 2023 11:52:14 +0200 Subject: [PATCH 14/20] obs_df['cell_bc'] as str in qc_sequencing --- spacemake/report/qc_sequencing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacemake/report/qc_sequencing.py b/spacemake/report/qc_sequencing.py index 412a7b68..ad7a9c3d 100644 --- a/spacemake/report/qc_sequencing.py +++ b/spacemake/report/qc_sequencing.py @@ -608,6 +608,7 @@ def generate_table_mapping_statistics(complete_data_root: str, split_reads_read_ def load_dge_summary(obs_df_path, with_adata=True): obs_df = pd.read_csv(obs_df_path) + obs_df['cell_bc'] = obs_df['cell_bc'].astype(str) obs_df = obs_df[obs_df['cell_bc'].str.contains('^[0-9]+|^[ACTGAN]+$', regex=True)] empty_ad = csr_matrix((len(obs_df), 1), dtype=int) adata = ad.AnnData(empty_ad) From 6d24e64785a540235cc83b242c50889d7df0caaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Le=C3=B3n-Perin=C3=A1n?= Date: Mon, 18 Sep 2023 11:55:48 +0200 Subject: [PATCH 15/20] ignore nan when computing quantile --- spacemake/report/qc_sequencing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacemake/report/qc_sequencing.py b/spacemake/report/qc_sequencing.py index ad7a9c3d..e3c1a0bc 100644 --- a/spacemake/report/qc_sequencing.py +++ b/spacemake/report/qc_sequencing.py @@ -543,7 +543,7 @@ def plot_spatial_qc_metric( ax=axes, title=SPATIAL_METRICS_TITLES[metric], show=False, - vmax=np.quantile(adata.obs[metric], 0.9), + vmax=np.nanquantile(adata.obs[metric], 0.9), cmap="magma", ) axes.spines[["right", "top"]].set_visible(False) From 87d67ec67dff3faff745e68b8b8c80c7498719fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Le=C3=B3n-Perin=C3=A1n?= Date: Mon, 18 Sep 2023 11:56:13 +0200 Subject: [PATCH 16/20] ignore nan computing quantile (auto analysis) --- spacemake/report/automated_analysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacemake/report/automated_analysis.py b/spacemake/report/automated_analysis.py index 38b5ff11..7c0e55c5 100644 --- a/spacemake/report/automated_analysis.py +++ b/spacemake/report/automated_analysis.py @@ -307,7 +307,7 @@ def generate_automated_analysis_metadata( ax=axes, show=False, cmap="magma", - vmax=np.quantile(adata.obs["total_counts"], 0.9), + vmax=np.nanquantile(adata.obs["total_counts"], 0.9), ) axes.spines[["right", "top"]].set_visible(False) axes.set_xticks(x_breaks) From 05b03dc0e08aa242fb6fbc5e08d2e9d526475133 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Le=C3=B3n-Perin=C3=A1n?= Date: Mon, 18 Sep 2023 15:00:10 +0200 Subject: [PATCH 17/20] python report: add saturation analysis --- spacemake/report/saturation_analysis.py | 416 ++++ .../report/templates/saturation_analysis.html | 1811 +++++++++++++++++ spacemake/snakemake/downsample.smk | 20 +- 3 files changed, 2245 insertions(+), 2 deletions(-) create mode 100644 spacemake/report/saturation_analysis.py create mode 100644 spacemake/report/templates/saturation_analysis.html diff --git a/spacemake/report/saturation_analysis.py b/spacemake/report/saturation_analysis.py new file mode 100644 index 00000000..bd1ab49a --- /dev/null +++ b/spacemake/report/saturation_analysis.py @@ -0,0 +1,416 @@ +import base64 +import io +import logging +import os +from typing import Union + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from itertools import cycle +from jinja2 import Template +from scipy.stats import gaussian_kde +from spacemake.config import ConfigFile +from spacemake.project_df import ProjectDF +from spacemake.util import message_aggregation + +absolute_path = os.path.dirname(__file__) + +cpalette = { + "grey": "#999999", + "light_orange": "#E69F00", + "light_blue": "#56B4E9", + "green": "#009E73", + "yellow": "#F0E442", + "blue": "#0072B2", + "orange": "#D55E00", + "pink": "#CC79A7", +} + +clrs = { + "umis": cpalette["light_orange"], + "genes": cpalette["light_blue"], + "reads": cpalette["green"], + "pcr": cpalette["pink"], + "pct_counts_mt": "black", +} + +SAMPLEINFO_VARS = [ + "species", + "sequencing_date", + "investigator", + "experiment", + "barcode_flavor", + "sequencing_date", + "puck" +] + +parula_dict = { + 1: "#352a87", + 2: "#2058b0", + 3: "#1f7eb8", + 4: "#28a7d7", + 5: "#38d7e3", + 6: "#99d4d0", + 7: "#aacca1", + 8: "#bbcc74", + 9: "#cbcc49", + 10: "#e0d317" +} + +PCT_DOWNSAMPLE_TO_PLOT = [20, 40, 60, 80, 100] +DOWNSAMPLE_PCTS = list(range(10,110,10)) + +# Run example: +# python /home/dleonpe/spacemake_project/repos/spacemake/spacemake/report/saturation_analysis.py \ +# --project-id fc_sts_76 \ +# --sample-id fc_sts_76_3 \ +# --run-modes fc_sts_novaseq_SP_mesh_7um fc_sts_novaseq_SP_mesh_7um fc_sts_novaseq_SP_mesh_7um fc_sts_novaseq_SP_mesh_7um fc_sts_novaseq_SP_mesh_7um fc_sts_novaseq_SP_mesh_7um fc_sts_novaseq_SP_mesh_7um fc_sts_novaseq_SP_mesh_7um fc_sts_novaseq_SP_mesh_7um fc_sts_novaseq_SP_mesh_7um \ +# --downsampled-dge-summary /data/rajewsky/home/dleonpe/projects/openst_paper/data/1_spacemake_mouse_new/projects/fc_sts_76/processed_data/fc_sts_76_3/illumina/downsampled_data/*/dge/dge.all.polyA_adapter_trimmed.mm_included.spatial_beads.mesh_7_hexagon_fc_010_L3_tile_2157.obs.csv /home/dleonpe/spacemake_project/data/1_spacemake_mouse_new/projects/fc_sts_76/processed_data/fc_sts_76_3/illumina/complete_data/dge/dge.all.polyA_adapter_trimmed.mm_included.spatial_beads.mesh_7_hexagon_fc_010_L3_tile_2157.obs.csv \ +# --out-html-report ~/Desktop/fc_sts_76_3_saturation_analysis.html \ +# --puck-barcode-file-id fc_010_L3_tile_2157 + +# .deciledmedian has the 'parula' color map + +logger_name = "spacemake.report.saturation_analysis" +logger = logging.getLogger(logger_name) + + +def setup_parser(parser): + """ + Set up command-line arguments for the script. + + :param parser: Argument parser object. + :type parser: argparse.ArgumentParser + :returns: Updated argument parser object. + :rtype: argparse.ArgumentParser + """ + # These allow to get the run_mode_variables.* from the config file + # and pbf_metrics.px_by_um via project_df.puck_barcode_file_metrics + parser.add_argument( + "--project-id", + type=str, + help="the project_id in spacemake project_df.csv", + required=True, + ) + parser.add_argument( + "--sample-id", + type=str, + help="the sample_id in spacemake project_df.csv", + required=True, + ) + parser.add_argument( + "--run-modes", + type=str, + nargs="+", + help="run modes of the sample for which the report will be generated", + required=False, + default=None, + ) + parser.add_argument( + "--downsampled-dge-summary", + type=str, + nargs="+", + help="path to the 'downsampled_dge_summary' file(s) generated by spacemake downsampling analysis.", + required=True, + ) + parser.add_argument( + "--puck-barcode-file-id", + type=str, + help="the puck_barcode_file_id for the current report", + required=True, + ) + # This specifies where the output file will be generated + parser.add_argument( + "--out-html-report", + type=str, + help="where the HTML report will be saved", + required=True, + ) + + return parser + + +def plot_density_metric_faceted(values, metric, log_scale=True, color='#000000', title=''): + fig, axes = plt.subplots(len(PCT_DOWNSAMPLE_TO_PLOT), 1, figsize=(5, 0.5*len(PCT_DOWNSAMPLE_TO_PLOT))) + + i = 0 + for downsample_pct, value_density in values.groupby("_downsample_pct_report"): + if int(downsample_pct) in PCT_DOWNSAMPLE_TO_PLOT: + density_function = gaussian_kde(np.nan_to_num(value_density[metric]), bw_method=0.1) + x = np.linspace(1, max(np.nan_to_num(values[metric])), 100) + + axes[i].plot(x, density_function(x), color='black', linewidth=1) + axes[i].fill_between(x, density_function(x), color=color) + axes[i].set_yticks([]) + + if log_scale: + axes[i].set_xscale("log") + + axes[i].spines[["right", "top", "bottom"]].set_visible(False) + axes[i].text(1.05, 0.5, f'{downsample_pct}%', transform=axes[i].transAxes, va='center') + i += 1 + + axes[-1].spines[["right", "top"]].set_visible(False) + axes[-1].spines[["left", "bottom"]].set_visible(True) + axes[-1].set_xlabel(title) + + for i in range(i-1): + axes[i].set_xticks([]) + + fig.text(0.0, 0.6, 'density', va='center', rotation='vertical') + plt.tight_layout() + + return fig, axes + + +def plot_median_per_run_mode(values, metric, umi_cutoffs, color='#000000', title=''): + fig, axes = plt.subplots(1, 1, figsize=(5, 3)) + + lines = ["-","--","-.",":"] + linecycler = cycle(lines) + handles, labels = [], [] + + for umi_cutoff in umi_cutoffs: + _values = values[values['total_counts'] > umi_cutoff] + median_values = _values[[metric, '_downsample_pct_report']].groupby('_downsample_pct_report').median().reset_index() + + linestyle = next(linecycler) + + line, = axes.plot(median_values['_downsample_pct_report'], median_values[metric], linestyle, color=color, label=umi_cutoff) + axes.scatter(median_values['_downsample_pct_report'], median_values[metric], s=20, color=color, edgecolors='black') + + handles.append(line) + labels.append(umi_cutoff) + + axes.set_xticks(PCT_DOWNSAMPLE_TO_PLOT) + axes.set_xticklabels([f'{pct}%'for pct in PCT_DOWNSAMPLE_TO_PLOT]) + axes.spines[["right", "top"]].set_visible(False) + axes.set_xlabel("downsampling percentage") + axes.set_ylabel(title) + + legend = axes.legend(handles, labels, loc='lower right', title="UMI cutoff") + legend.set_frame_on(False) + + plt.tight_layout() + return fig, axes + + +def generate_deciled_data(values): + # Group by '_downsample_pct_report' and perform the necessary calculations + def calculate_deciles(group): + group['cumsum_reads'] = group['n_reads'].cumsum() + group['decile_limit'] = group['n_reads'].sum() / 10 + group['decile'] = (group['cumsum_reads'] / group['decile_limit']).floordiv(1) + 1 + return group.loc[group['decile'] < 11] + + # Group by '_downsample_pct_report' and apply the calculate_deciles function + decile_dat = values.groupby('_downsample_pct_report').apply(calculate_deciles).reset_index(drop=True) + + # Group by 'percentage' and 'decile' and calculate medians and counts + decile_dat = (decile_dat.groupby(['_downsample_pct_report', 'decile']) + .agg({'n_reads': 'median', 'n_genes_by_counts': 'median', 'reads_per_counts': 'median', 'total_counts': 'median', 'cell_bc': 'count'}) + .reset_index()) + + # Melt the DataFrame to long format + decile_dat = pd.melt(decile_dat, id_vars=['_downsample_pct_report', 'decile'], var_name='observation', value_name='value') + + # Convert 'decile' and '_downsample_pct_report' to appropriate data types + decile_dat['decile'] = decile_dat['decile'].astype('category') + decile_dat['_downsample_pct_report'] = decile_dat['_downsample_pct_report'].astype(int) + + mapping_dict = {'n_reads': 'median_reads', 'n_genes_by_counts': 'median_genes', 'reads_per_counts': 'median_pcr', 'total_counts': 'median_umis', 'cell_bc': 'n_beads'} + decile_dat['observation'] = decile_dat['observation'].replace(mapping_dict) + + return decile_dat + + +def plot_deciled_median(decile_dat): + fig, axes = plt.subplots(3, 2, figsize=(6, 4)) + + # Iterate through each unique 'observation' for facetting + for i, (obs, data) in enumerate(decile_dat.groupby('observation')): + for _obs, _data in data.groupby('decile'): + axes.flatten()[i].plot(_data['_downsample_pct_report'], _data['value'], label=_obs, linewidth=0.6, color=parula_dict[_obs]) + axes.flatten()[i].scatter(_data['_downsample_pct_report'], _data['value'], s=20, edgecolors='black', color=parula_dict[_obs]) + + axes.flatten()[i].set_xticks([0, 20, 40, 60, 80, 100]) + axes.flatten()[i].set_xticklabels(['0', '20', '40', '60', '80', '100']) + axes.flatten()[i].set_title(obs) + axes.flatten()[i].spines[['top', 'right']].set_visible(False) + + axes.flatten()[i].set_xlabel("downsampling percentage") + if i % 2 == 0: + axes.flatten()[-1].axis("off") + + # Create a single legend at the bottom + handles, labels = [], [] + for obs in parula_dict: + handles.append(plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=parula_dict[obs], markersize=8)) + labels.append(str(obs)) + + fig.legend(handles, labels, title='Decile', loc='lower right', ncol=3, bbox_to_anchor=(0.95, 0.02)) + + plt.tight_layout() + + return fig, axes + + +def plot_to_base64(fig): + my_stringIObytes = io.BytesIO() + fig.savefig(my_stringIObytes, format="jpg", dpi=300) + plt.close() + my_stringIObytes.seek(0) + return base64.b64encode(my_stringIObytes.read()).decode() + + +def load_dge_summary_downsampling(downsampled_dge_summary, run_mode, puck_barcode_file_id): + obs_df = pd.DataFrame() + for downsample_pct in DOWNSAMPLE_PCTS: + _obs_df = pd.read_csv(downsampled_dge_summary[f'downsampled_dge_summary.{run_mode}.{downsample_pct}.{puck_barcode_file_id}']) + _obs_df['_downsample_pct_report'] = downsample_pct + obs_df = pd.concat([obs_df, _obs_df]) + + return obs_df + + +def generate_saturation_analysis_metadata( + project_id: str, + sample_id: str, + run_modes: Union[list, str], + downsampled_dge_summary: Union[dict, list, str], + puck_barcode_file_id: str, +): + if isinstance(downsampled_dge_summary, str): + downsampled_dge_summary = [downsampled_dge_summary] + if isinstance(run_modes, str): + run_modes = [run_modes] + if ( + isinstance(run_modes, list) + and isinstance(downsampled_dge_summary, list) + and len(run_modes) == len(downsampled_dge_summary) + ): + downsampled_dge_summary = { + f'downsampled_dge_summary.{run_mode}.{d_pct}.{puck_barcode_file_id}': d for run_mode, d, d_pct in zip(run_modes, downsampled_dge_summary, DOWNSAMPLE_PCTS) + } + elif ( + isinstance(run_modes, list) + and isinstance(downsampled_dge_summary, list) + and len(run_modes) != len(downsampled_dge_summary) + ): + raise ValueError("'run_modes' and 'downsampled_dge_summary' must have the same length") + + report = { + "type": "saturation_analysis", + "runinformation": [], + "date": None, + "plots": [], + } + main_plots = { + "histostats": [], + "medianstats": [], + "deciledmedian": [] + } + report["plots"] = main_plots + + # Loading project_df for metadata + config = ConfigFile.from_yaml("config.yaml") + project_df = ProjectDF("project_df.csv", config=config) + + # Table: sample info + sample_info = project_df.get_sample_info(project_id, sample_id) + report["sampleinfo"] = {} + report["sampleinfo"]["project_id"] = project_id + report["sampleinfo"]["sample_id"] = sample_id + report["sampleinfo"].update({svar: sample_info[svar] for svar in SAMPLEINFO_VARS}) + + # Loading all dge data summaries + # Table: summarised metrics over beads + dge_summaries = {} + for run_mode in run_modes: + dge_summaries[run_mode] = {} + dge_summaries[run_mode] = load_dge_summary_downsampling(downsampled_dge_summary, run_mode, puck_barcode_file_id) + + # Plots + # Histograms per run mode + for run_mode, dge_summary in dge_summaries.items(): + umicutoff = {"name": run_mode, "umiplot": None, "readsplot": None, "readsumiplot": None} + fig, _ = plot_density_metric_faceted(dge_summary, "total_counts", log_scale=True, color=clrs['umis'], title='# of UMIs per spatial unit') + umicutoff["umiplot"] = plot_to_base64(fig) + + fig, _ = plot_density_metric_faceted(dge_summary, "n_reads", log_scale=True, color=clrs['reads'], title='# of reads per spatial unit') + umicutoff["readsplot"] = plot_to_base64(fig) + + fig, _ = plot_density_metric_faceted(dge_summary, "reads_per_counts", log_scale=False, color=clrs['pcr'], title='reads/UMI per spatial unit') + umicutoff["readsumiplot"] = plot_to_base64(fig) + + report["plots"]["histostats"].append(umicutoff) + + # Median plots per run mode + for run_mode, dge_summary in dge_summaries.items(): + umi_cutoffs_values = project_df.config.get_run_mode(run_mode).variables['umi_cutoff'] + umi_cutoffs_values = list(sorted(list(set([int(u) for u in umi_cutoffs_values] + [1])))) + medianstats = {"name": run_mode, "umiplot": None, "readsplot": None, "readsumiplot": None} + fig, _ = plot_median_per_run_mode(dge_summary, "total_counts", umi_cutoffs_values, color=clrs['umis'], title="median reads\nper spatial unit") + medianstats["umiplot"] = plot_to_base64(fig) + + fig, _ = plot_median_per_run_mode(dge_summary, "n_reads", umi_cutoffs_values, color=clrs['reads'], title="median UMIs\nper spatial unit") + medianstats["readsplot"] = plot_to_base64(fig) + + fig, _ = plot_median_per_run_mode(dge_summary, "reads_per_counts", umi_cutoffs_values, color=clrs['pcr'], title="median reads/UMI\nper spatial unit") + medianstats["readsumiplot"] = plot_to_base64(fig) + + report["plots"]["medianstats"].append(medianstats) + + + # Deciled plots + for run_mode, dge_summary in dge_summaries.items(): + deciledmedian = {"name": run_mode, "plot": None} + decile_dat = generate_deciled_data(dge_summary) + fig, _ = plot_deciled_median(decile_dat) + deciledmedian["plot"] = plot_to_base64(fig) + report["plots"]["deciledmedian"].append(deciledmedian) + + return report + + +def generate_html_report(data, template_file): + with open(template_file, "r") as template_data: + template_content = template_data.read() + template = Template(template_content) + + html_report = template.render(report=data) + + return html_report + + +@message_aggregation(logger_name) +def cmdline(): + """cmdline.""" + import argparse + + parser = argparse.ArgumentParser( + allow_abbrev=False, + description="generate spacemake's 'saturation analysis' with python", + ) + parser = setup_parser(parser) + + args = parser.parse_args() + template_file = os.path.join(absolute_path, "templates/saturation_analysis.html") + + report_metadata = generate_saturation_analysis_metadata( + args.project_id, + args.sample_id, + args.run_modes, + args.downsampled_dge_summary, + args.puck_barcode_file_id + ) + + html_report = generate_html_report(report_metadata, template_file) + + with open(args.out_html_report, "w") as output: + output.write(html_report) + + +if __name__ == "__main__": + cmdline() \ No newline at end of file diff --git a/spacemake/report/templates/saturation_analysis.html b/spacemake/report/templates/saturation_analysis.html new file mode 100644 index 00000000..8d666119 --- /dev/null +++ b/spacemake/report/templates/saturation_analysis.html @@ -0,0 +1,1811 @@ + + + + + + + + + + + + + + spacemake {{ report.type }} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+

spacemake report

+
+
+
+
+
+ +

Saturation analysis

+
+

Overview

+
+

Run information

+ + + {% for runinfo_var in report.runinformation %} + + + + + {% endfor %} + +
+ {{ runinfo_var }} + + {{ report.runinformation[runinfo_var] }} +
+

saturation analysis v.0.1.2, generated on {{ report.date }}

+

contact: , ,

+
+
+

Downstream stats

+

In order to know whether we would gain more from sequencing deeper, we downsampled the data (the final.bam file) to contain 10%, 20%… 90% reads, and then we created the DigitalExpression matrix (as in the normal dropseq pipeline).

+

This can give us insight, whether we have reached the saturation point (in terms of median umi per cell and median genes per cell) or whether we should sequence deeper.

+

Results of this are plotted below.

+ +
+

Histograms per run mode

+ {% for histostats in report.plots.histostats %} +
+

{{ histostats.name }}

+

+

+

+
+ {% endfor %} +
+ +
+

Median plots per run mode

+ {% for medianstats in report.plots.medianstats %} +
+

{{ medianstats.name }}

+

+

+

+
+ {% endfor %} +
+ +
+

Deciled median plots per run mode

+ {% for deciledmedian in report.plots.deciledmedian %} +
+

{{ deciledmedian.name }}

+

+
+ {% endfor %} +
+ +
+
+
+ + + + + + + + + + + + + \ No newline at end of file diff --git a/spacemake/snakemake/downsample.smk b/spacemake/snakemake/downsample.smk index 128ad52f..f17483c0 100644 --- a/spacemake/snakemake/downsample.smk +++ b/spacemake/snakemake/downsample.smk @@ -74,6 +74,7 @@ def get_saturation_analysis_input(wildcards): return files + rule create_saturation_analysis: input: unpack(get_saturation_analysis_input) @@ -84,5 +85,20 @@ rule create_saturation_analysis: wildcards.project_id, wildcards.sample_id), run_modes = lambda wildcards: get_run_modes_from_sample( wildcards.project_id, wildcards.sample_id) - script: - "scripts/saturation_analysis.Rmd" + run: + from spacemake.report.saturation_analysis import generate_saturation_analysis_metadata + + template_file = os.path.join(spacemake_dir, "report/templates/saturation_analysis.html") + + report_metadata = generate_saturation_analysis_metadata( + wildcards.project_id, + wildcards.sample_id, + params['run_modes'], + input, + wildcards.puck_barcode_file_id_qc + ) + + html_report = generate_html_report(report_metadata, template_file) + + with open(output[0], "w") as output: + output.write(html_report) \ No newline at end of file From 17e7ba9fc7144d0ca645586f26f4448397cb8c35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Le=C3=B3n-Perin=C3=A1n?= Date: Mon, 18 Sep 2023 15:02:40 +0200 Subject: [PATCH 18/20] saturation analysis report: add date and time --- spacemake/report/saturation_analysis.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacemake/report/saturation_analysis.py b/spacemake/report/saturation_analysis.py index bd1ab49a..55425ef4 100644 --- a/spacemake/report/saturation_analysis.py +++ b/spacemake/report/saturation_analysis.py @@ -1,4 +1,5 @@ import base64 +import datetime import io import logging import os @@ -303,7 +304,7 @@ def generate_saturation_analysis_metadata( report = { "type": "saturation_analysis", "runinformation": [], - "date": None, + "date": datetime.date.today().strftime("%Y/%m/%d"), "plots": [], } main_plots = { From 7bb7f6cf4f9f751c4f96016caeb48ddd4d60e500 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Le=C3=B3n-Perin=C3=A1n?= Date: Mon, 18 Sep 2023 15:03:50 +0200 Subject: [PATCH 19/20] saturation analysis, applied black/flake8 --- spacemake/report/saturation_analysis.py | 183 ++++++++++++++---------- 1 file changed, 106 insertions(+), 77 deletions(-) diff --git a/spacemake/report/saturation_analysis.py b/spacemake/report/saturation_analysis.py index 55425ef4..7a9d321e 100644 --- a/spacemake/report/saturation_analysis.py +++ b/spacemake/report/saturation_analysis.py @@ -3,12 +3,12 @@ import io import logging import os +from itertools import cycle from typing import Union import matplotlib.pyplot as plt import numpy as np import pandas as pd -from itertools import cycle from jinja2 import Template from scipy.stats import gaussian_kde from spacemake.config import ConfigFile @@ -43,7 +43,7 @@ "experiment", "barcode_flavor", "sequencing_date", - "puck" + "puck", ] parula_dict = { @@ -56,22 +56,11 @@ 7: "#aacca1", 8: "#bbcc74", 9: "#cbcc49", - 10: "#e0d317" + 10: "#e0d317", } PCT_DOWNSAMPLE_TO_PLOT = [20, 40, 60, 80, 100] -DOWNSAMPLE_PCTS = list(range(10,110,10)) - -# Run example: -# python /home/dleonpe/spacemake_project/repos/spacemake/spacemake/report/saturation_analysis.py \ -# --project-id fc_sts_76 \ -# --sample-id fc_sts_76_3 \ -# --run-modes fc_sts_novaseq_SP_mesh_7um fc_sts_novaseq_SP_mesh_7um fc_sts_novaseq_SP_mesh_7um fc_sts_novaseq_SP_mesh_7um fc_sts_novaseq_SP_mesh_7um fc_sts_novaseq_SP_mesh_7um fc_sts_novaseq_SP_mesh_7um fc_sts_novaseq_SP_mesh_7um fc_sts_novaseq_SP_mesh_7um fc_sts_novaseq_SP_mesh_7um \ -# --downsampled-dge-summary /data/rajewsky/home/dleonpe/projects/openst_paper/data/1_spacemake_mouse_new/projects/fc_sts_76/processed_data/fc_sts_76_3/illumina/downsampled_data/*/dge/dge.all.polyA_adapter_trimmed.mm_included.spatial_beads.mesh_7_hexagon_fc_010_L3_tile_2157.obs.csv /home/dleonpe/spacemake_project/data/1_spacemake_mouse_new/projects/fc_sts_76/processed_data/fc_sts_76_3/illumina/complete_data/dge/dge.all.polyA_adapter_trimmed.mm_included.spatial_beads.mesh_7_hexagon_fc_010_L3_tile_2157.obs.csv \ -# --out-html-report ~/Desktop/fc_sts_76_3_saturation_analysis.html \ -# --puck-barcode-file-id fc_010_L3_tile_2157 - -# .deciledmedian has the 'parula' color map +DOWNSAMPLE_PCTS = list(range(10, 110, 10)) logger_name = "spacemake.report.saturation_analysis" logger = logging.getLogger(logger_name) @@ -132,65 +121,71 @@ def setup_parser(parser): return parser -def plot_density_metric_faceted(values, metric, log_scale=True, color='#000000', title=''): - fig, axes = plt.subplots(len(PCT_DOWNSAMPLE_TO_PLOT), 1, figsize=(5, 0.5*len(PCT_DOWNSAMPLE_TO_PLOT))) +def plot_density_metric_faceted(values, metric, log_scale=True, color="#000000", title=""): + fig, axes = plt.subplots(len(PCT_DOWNSAMPLE_TO_PLOT), 1, figsize=(5, 0.5 * len(PCT_DOWNSAMPLE_TO_PLOT))) i = 0 for downsample_pct, value_density in values.groupby("_downsample_pct_report"): if int(downsample_pct) in PCT_DOWNSAMPLE_TO_PLOT: density_function = gaussian_kde(np.nan_to_num(value_density[metric]), bw_method=0.1) x = np.linspace(1, max(np.nan_to_num(values[metric])), 100) - - axes[i].plot(x, density_function(x), color='black', linewidth=1) + + axes[i].plot(x, density_function(x), color="black", linewidth=1) axes[i].fill_between(x, density_function(x), color=color) axes[i].set_yticks([]) if log_scale: axes[i].set_xscale("log") - + axes[i].spines[["right", "top", "bottom"]].set_visible(False) - axes[i].text(1.05, 0.5, f'{downsample_pct}%', transform=axes[i].transAxes, va='center') + axes[i].text(1.05, 0.5, f"{downsample_pct}%", transform=axes[i].transAxes, va="center") i += 1 axes[-1].spines[["right", "top"]].set_visible(False) axes[-1].spines[["left", "bottom"]].set_visible(True) axes[-1].set_xlabel(title) - for i in range(i-1): + for i in range(i - 1): axes[i].set_xticks([]) - fig.text(0.0, 0.6, 'density', va='center', rotation='vertical') + fig.text(0.0, 0.6, "density", va="center", rotation="vertical") plt.tight_layout() return fig, axes -def plot_median_per_run_mode(values, metric, umi_cutoffs, color='#000000', title=''): +def plot_median_per_run_mode(values, metric, umi_cutoffs, color="#000000", title=""): fig, axes = plt.subplots(1, 1, figsize=(5, 3)) - lines = ["-","--","-.",":"] + lines = ["-", "--", "-.", ":"] linecycler = cycle(lines) handles, labels = [], [] for umi_cutoff in umi_cutoffs: - _values = values[values['total_counts'] > umi_cutoff] - median_values = _values[[metric, '_downsample_pct_report']].groupby('_downsample_pct_report').median().reset_index() + _values = values[values["total_counts"] > umi_cutoff] + median_values = ( + _values[[metric, "_downsample_pct_report"]].groupby("_downsample_pct_report").median().reset_index() + ) linestyle = next(linecycler) - - line, = axes.plot(median_values['_downsample_pct_report'], median_values[metric], linestyle, color=color, label=umi_cutoff) - axes.scatter(median_values['_downsample_pct_report'], median_values[metric], s=20, color=color, edgecolors='black') + + (line,) = axes.plot( + median_values["_downsample_pct_report"], median_values[metric], linestyle, color=color, label=umi_cutoff + ) + axes.scatter( + median_values["_downsample_pct_report"], median_values[metric], s=20, color=color, edgecolors="black" + ) handles.append(line) labels.append(umi_cutoff) axes.set_xticks(PCT_DOWNSAMPLE_TO_PLOT) - axes.set_xticklabels([f'{pct}%'for pct in PCT_DOWNSAMPLE_TO_PLOT]) + axes.set_xticklabels([f"{pct}%" for pct in PCT_DOWNSAMPLE_TO_PLOT]) axes.spines[["right", "top"]].set_visible(False) axes.set_xlabel("downsampling percentage") axes.set_ylabel(title) - legend = axes.legend(handles, labels, loc='lower right', title="UMI cutoff") + legend = axes.legend(handles, labels, loc="lower right", title="UMI cutoff") legend.set_frame_on(False) plt.tight_layout() @@ -200,28 +195,46 @@ def plot_median_per_run_mode(values, metric, umi_cutoffs, color='#000000', title def generate_deciled_data(values): # Group by '_downsample_pct_report' and perform the necessary calculations def calculate_deciles(group): - group['cumsum_reads'] = group['n_reads'].cumsum() - group['decile_limit'] = group['n_reads'].sum() / 10 - group['decile'] = (group['cumsum_reads'] / group['decile_limit']).floordiv(1) + 1 - return group.loc[group['decile'] < 11] + group["cumsum_reads"] = group["n_reads"].cumsum() + group["decile_limit"] = group["n_reads"].sum() / 10 + group["decile"] = (group["cumsum_reads"] / group["decile_limit"]).floordiv(1) + 1 + return group.loc[group["decile"] < 11] # Group by '_downsample_pct_report' and apply the calculate_deciles function - decile_dat = values.groupby('_downsample_pct_report').apply(calculate_deciles).reset_index(drop=True) + decile_dat = values.groupby("_downsample_pct_report").apply(calculate_deciles).reset_index(drop=True) # Group by 'percentage' and 'decile' and calculate medians and counts - decile_dat = (decile_dat.groupby(['_downsample_pct_report', 'decile']) - .agg({'n_reads': 'median', 'n_genes_by_counts': 'median', 'reads_per_counts': 'median', 'total_counts': 'median', 'cell_bc': 'count'}) - .reset_index()) + decile_dat = ( + decile_dat.groupby(["_downsample_pct_report", "decile"]) + .agg( + { + "n_reads": "median", + "n_genes_by_counts": "median", + "reads_per_counts": "median", + "total_counts": "median", + "cell_bc": "count", + } + ) + .reset_index() + ) # Melt the DataFrame to long format - decile_dat = pd.melt(decile_dat, id_vars=['_downsample_pct_report', 'decile'], var_name='observation', value_name='value') + decile_dat = pd.melt( + decile_dat, id_vars=["_downsample_pct_report", "decile"], var_name="observation", value_name="value" + ) # Convert 'decile' and '_downsample_pct_report' to appropriate data types - decile_dat['decile'] = decile_dat['decile'].astype('category') - decile_dat['_downsample_pct_report'] = decile_dat['_downsample_pct_report'].astype(int) - - mapping_dict = {'n_reads': 'median_reads', 'n_genes_by_counts': 'median_genes', 'reads_per_counts': 'median_pcr', 'total_counts': 'median_umis', 'cell_bc': 'n_beads'} - decile_dat['observation'] = decile_dat['observation'].replace(mapping_dict) + decile_dat["decile"] = decile_dat["decile"].astype("category") + decile_dat["_downsample_pct_report"] = decile_dat["_downsample_pct_report"].astype(int) + + mapping_dict = { + "n_reads": "median_reads", + "n_genes_by_counts": "median_genes", + "reads_per_counts": "median_pcr", + "total_counts": "median_umis", + "cell_bc": "n_beads", + } + decile_dat["observation"] = decile_dat["observation"].replace(mapping_dict) return decile_dat @@ -230,15 +243,19 @@ def plot_deciled_median(decile_dat): fig, axes = plt.subplots(3, 2, figsize=(6, 4)) # Iterate through each unique 'observation' for facetting - for i, (obs, data) in enumerate(decile_dat.groupby('observation')): - for _obs, _data in data.groupby('decile'): - axes.flatten()[i].plot(_data['_downsample_pct_report'], _data['value'], label=_obs, linewidth=0.6, color=parula_dict[_obs]) - axes.flatten()[i].scatter(_data['_downsample_pct_report'], _data['value'], s=20, edgecolors='black', color=parula_dict[_obs]) + for i, (obs, data) in enumerate(decile_dat.groupby("observation")): + for _obs, _data in data.groupby("decile"): + axes.flatten()[i].plot( + _data["_downsample_pct_report"], _data["value"], label=_obs, linewidth=0.6, color=parula_dict[_obs] + ) + axes.flatten()[i].scatter( + _data["_downsample_pct_report"], _data["value"], s=20, edgecolors="black", color=parula_dict[_obs] + ) axes.flatten()[i].set_xticks([0, 20, 40, 60, 80, 100]) - axes.flatten()[i].set_xticklabels(['0', '20', '40', '60', '80', '100']) + axes.flatten()[i].set_xticklabels(["0", "20", "40", "60", "80", "100"]) axes.flatten()[i].set_title(obs) - axes.flatten()[i].spines[['top', 'right']].set_visible(False) + axes.flatten()[i].spines[["top", "right"]].set_visible(False) axes.flatten()[i].set_xlabel("downsampling percentage") if i % 2 == 0: @@ -247,10 +264,10 @@ def plot_deciled_median(decile_dat): # Create a single legend at the bottom handles, labels = [], [] for obs in parula_dict: - handles.append(plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=parula_dict[obs], markersize=8)) + handles.append(plt.Line2D([0], [0], marker="o", color="w", markerfacecolor=parula_dict[obs], markersize=8)) labels.append(str(obs)) - fig.legend(handles, labels, title='Decile', loc='lower right', ncol=3, bbox_to_anchor=(0.95, 0.02)) + fig.legend(handles, labels, title="Decile", loc="lower right", ncol=3, bbox_to_anchor=(0.95, 0.02)) plt.tight_layout() @@ -268,8 +285,10 @@ def plot_to_base64(fig): def load_dge_summary_downsampling(downsampled_dge_summary, run_mode, puck_barcode_file_id): obs_df = pd.DataFrame() for downsample_pct in DOWNSAMPLE_PCTS: - _obs_df = pd.read_csv(downsampled_dge_summary[f'downsampled_dge_summary.{run_mode}.{downsample_pct}.{puck_barcode_file_id}']) - _obs_df['_downsample_pct_report'] = downsample_pct + _obs_df = pd.read_csv( + downsampled_dge_summary[f"downsampled_dge_summary.{run_mode}.{downsample_pct}.{puck_barcode_file_id}"] + ) + _obs_df["_downsample_pct_report"] = downsample_pct obs_df = pd.concat([obs_df, _obs_df]) return obs_df @@ -292,7 +311,8 @@ def generate_saturation_analysis_metadata( and len(run_modes) == len(downsampled_dge_summary) ): downsampled_dge_summary = { - f'downsampled_dge_summary.{run_mode}.{d_pct}.{puck_barcode_file_id}': d for run_mode, d, d_pct in zip(run_modes, downsampled_dge_summary, DOWNSAMPLE_PCTS) + f"downsampled_dge_summary.{run_mode}.{d_pct}.{puck_barcode_file_id}": d + for run_mode, d, d_pct in zip(run_modes, downsampled_dge_summary, DOWNSAMPLE_PCTS) } elif ( isinstance(run_modes, list) @@ -307,11 +327,7 @@ def generate_saturation_analysis_metadata( "date": datetime.date.today().strftime("%Y/%m/%d"), "plots": [], } - main_plots = { - "histostats": [], - "medianstats": [], - "deciledmedian": [] - } + main_plots = {"histostats": [], "medianstats": [], "deciledmedian": []} report["plots"] = main_plots # Loading project_df for metadata @@ -330,40 +346,57 @@ def generate_saturation_analysis_metadata( dge_summaries = {} for run_mode in run_modes: dge_summaries[run_mode] = {} - dge_summaries[run_mode] = load_dge_summary_downsampling(downsampled_dge_summary, run_mode, puck_barcode_file_id) + dge_summaries[run_mode] = load_dge_summary_downsampling( + downsampled_dge_summary, run_mode, puck_barcode_file_id + ) # Plots # Histograms per run mode for run_mode, dge_summary in dge_summaries.items(): umicutoff = {"name": run_mode, "umiplot": None, "readsplot": None, "readsumiplot": None} - fig, _ = plot_density_metric_faceted(dge_summary, "total_counts", log_scale=True, color=clrs['umis'], title='# of UMIs per spatial unit') + fig, _ = plot_density_metric_faceted( + dge_summary, "total_counts", log_scale=True, color=clrs["umis"], title="# of UMIs per spatial unit" + ) umicutoff["umiplot"] = plot_to_base64(fig) - fig, _ = plot_density_metric_faceted(dge_summary, "n_reads", log_scale=True, color=clrs['reads'], title='# of reads per spatial unit') + fig, _ = plot_density_metric_faceted( + dge_summary, "n_reads", log_scale=True, color=clrs["reads"], title="# of reads per spatial unit" + ) umicutoff["readsplot"] = plot_to_base64(fig) - fig, _ = plot_density_metric_faceted(dge_summary, "reads_per_counts", log_scale=False, color=clrs['pcr'], title='reads/UMI per spatial unit') + fig, _ = plot_density_metric_faceted( + dge_summary, "reads_per_counts", log_scale=False, color=clrs["pcr"], title="reads/UMI per spatial unit" + ) umicutoff["readsumiplot"] = plot_to_base64(fig) report["plots"]["histostats"].append(umicutoff) # Median plots per run mode for run_mode, dge_summary in dge_summaries.items(): - umi_cutoffs_values = project_df.config.get_run_mode(run_mode).variables['umi_cutoff'] + umi_cutoffs_values = project_df.config.get_run_mode(run_mode).variables["umi_cutoff"] umi_cutoffs_values = list(sorted(list(set([int(u) for u in umi_cutoffs_values] + [1])))) medianstats = {"name": run_mode, "umiplot": None, "readsplot": None, "readsumiplot": None} - fig, _ = plot_median_per_run_mode(dge_summary, "total_counts", umi_cutoffs_values, color=clrs['umis'], title="median reads\nper spatial unit") + fig, _ = plot_median_per_run_mode( + dge_summary, "total_counts", umi_cutoffs_values, color=clrs["umis"], title="median reads\nper spatial unit" + ) medianstats["umiplot"] = plot_to_base64(fig) - fig, _ = plot_median_per_run_mode(dge_summary, "n_reads", umi_cutoffs_values, color=clrs['reads'], title="median UMIs\nper spatial unit") + fig, _ = plot_median_per_run_mode( + dge_summary, "n_reads", umi_cutoffs_values, color=clrs["reads"], title="median UMIs\nper spatial unit" + ) medianstats["readsplot"] = plot_to_base64(fig) - fig, _ = plot_median_per_run_mode(dge_summary, "reads_per_counts", umi_cutoffs_values, color=clrs['pcr'], title="median reads/UMI\nper spatial unit") + fig, _ = plot_median_per_run_mode( + dge_summary, + "reads_per_counts", + umi_cutoffs_values, + color=clrs["pcr"], + title="median reads/UMI\nper spatial unit", + ) medianstats["readsumiplot"] = plot_to_base64(fig) report["plots"]["medianstats"].append(medianstats) - # Deciled plots for run_mode, dge_summary in dge_summaries.items(): deciledmedian = {"name": run_mode, "plot": None} @@ -400,11 +433,7 @@ def cmdline(): template_file = os.path.join(absolute_path, "templates/saturation_analysis.html") report_metadata = generate_saturation_analysis_metadata( - args.project_id, - args.sample_id, - args.run_modes, - args.downsampled_dge_summary, - args.puck_barcode_file_id + args.project_id, args.sample_id, args.run_modes, args.downsampled_dge_summary, args.puck_barcode_file_id ) html_report = generate_html_report(report_metadata, template_file) @@ -414,4 +443,4 @@ def cmdline(): if __name__ == "__main__": - cmdline() \ No newline at end of file + cmdline() From 0b437857a8866933a9620a42b6ab61778342e2e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Le=C3=B3n-Perin=C3=A1n?= Date: Mon, 18 Sep 2023 15:05:38 +0200 Subject: [PATCH 20/20] removed R dependencies, added jinja2 --- environment.yaml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/environment.yaml b/environment.yaml index 44530b3b..82410103 100644 --- a/environment.yaml +++ b/environment.yaml @@ -14,14 +14,7 @@ dependencies: - bcl2fastq2>=2.19 - fastqc>=0.11.9 - pip>=21.1 - - r-base>=4.0.3 - - r-rmarkdown>=2.7 - - r-tidyverse>=1.3.1 - - r-kableextra>=1.3.4 - - r-cowplot>=1.1.1 - - r-pals>=1.7 - - r-hexbin - - r-scales + - jinja2>=3.1.2 - pysam>=0.16.0.1 - pot - openjdk==11.0.15