diff --git a/bin/Build_Upset.py b/bin/Build_Upset.py
new file mode 100755
index 0000000..2d7ec34
--- /dev/null
+++ b/bin/Build_Upset.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python
+
+import argparse
+import pandas as pd
+from matplotlib import cm
+from matplotlib import pyplot as plt
+from upsetplot import UpSet, from_memberships
+import warnings
+
+# Suppress matplotlib's tight_layout UserWarning and noisy pandas FutureWarnings
+warnings.simplefilter(action='ignore', category=UserWarning)
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+def load_data(samplesheet_file, feature_profile_file):
+ """
+    Load the samplesheet (CSV) and feature profile (TSV) data.
+
+ Parameters:
+ samplesheet_file (str): Path to the samplesheet file.
+ feature_profile_file (str): Path to the feature profile file.
+
+ Returns:
+ pd.DataFrame: Samplesheet dataframe.
+ pd.DataFrame: Feature profile dataframe.
+ """
+ try:
+ samplesheet_df = pd.read_csv(samplesheet_file)
+ feature_df = pd.read_csv(feature_profile_file, sep='\t')
+ except FileNotFoundError as e:
+ print(f"Error: {e}")
+ raise
+
+ # Convert all columns in samplesheet_df to strings
+ samplesheet_df = samplesheet_df.astype(str)
+
+ return samplesheet_df, feature_df
+
+def process_data(samplesheet_df, feature_df, column_name):
+ """
+    Merge the samplesheet and feature dataframes and build a per-group presence/absence matrix.
+
+ Parameters:
+ samplesheet_df (pd.DataFrame): Samplesheet dataframe.
+ feature_df (pd.DataFrame): Feature profile dataframe.
+    column_name (str): Samplesheet column used to group genomes.
+
+ Returns:
+    pd.DataFrame or None: One row per feature with 'feature', 'category' and comma-separated 'groups' columns; None if the column is missing or has a single value.
+ """
+ merged_df = feature_df.merge(samplesheet_df, left_on='genome_id', right_on='sample', how='left')
+
+ if column_name not in merged_df.columns:
+ print(f"Warning: Column '{column_name}' not found in samplesheet. Skipping.")
+ return None
+
+ if merged_df[column_name].nunique() <= 1:
+ print(f"Warning: Column '{column_name}' contains only a single unique value. Skipping.")
+ return None
+
+ selected_columns = ['genome_id', column_name] + list(feature_df.columns[1:])
+ feature_df = merged_df[selected_columns]
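+    # Collapse to a per-group presence/absence matrix: a feature is 1 for a group if any genome in that group carries it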
+ presence = feature_df.groupby(column_name).any().astype(int)
+ presence.drop('genome_id', axis=1, inplace=True, errors='ignore')
+ presence_transposed = presence.transpose()
+ presence_transposed.reset_index(inplace=True)
+ presence_transposed.rename(columns={'index': 'feature'}, inplace=True)
+ presence_transposed['category'] = presence_transposed['feature'].str.split('_', expand=True)[0]
+
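+    # For each feature, record which groups carry it (every column between 'feature' and 'category')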
+    presence_transposed['groups'] = presence_transposed.apply(
+        lambda row: ','.join([col for col in presence_transposed.columns[1:-1] if row[col] == 1]), axis=1)
+
+ presence_transposed = presence_transposed.loc[:, ['feature', 'category', 'groups']]
+
+ # print("Presence Transposed DataFrame with 'groups' column:\n", presence_transposed.head()) # Debug print to check the dataframe state
+
+ return presence_transposed
+
+def create_upset_plot(samplesheet_file, feature_profile_file, column_names, output_file_prefix):
+ """
+    Create one UpSet plot per requested samplesheet column from the samplesheet and feature profile files.
+
+ Parameters:
+ samplesheet_file (str): Path to the samplesheet file.
+ feature_profile_file (str): Path to the feature profile file.
+    column_names (list of str): Samplesheet column names to build plots for.
+ output_file_prefix (str): Prefix for the output Upset plot files.
+ """
+ samplesheet_df, feature_df = load_data(samplesheet_file, feature_profile_file)
+
+ for column_name in column_names:
+ presence_transposed = process_data(samplesheet_df, feature_df, column_name)
+
+ if presence_transposed is None:
+ continue
+
+ if 'groups' not in presence_transposed.columns:
+ print(f"Error: 'groups' column was not created for {column_name}. Skipping.")
+ continue
+
+        # Convert the comma-separated 'groups' strings into frozenset memberships
+ memberships = presence_transposed['groups'].str.split(",").apply(lambda x: frozenset(x))
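+        # from_memberships builds the boolean MultiIndex that UpSet expects, with one index level per group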
+ rows_by_group = from_memberships(memberships, data=presence_transposed.set_index('feature'))
+
+ # print("Rows by group:\n", rows_by_group.head()) # Debug print to check rows_by_group structure
+
+ # Create UpSet plot
+ fig, ax = plt.subplots(figsize=(24, 12)) # Increased width for better layout
+ upset = UpSet(rows_by_group, show_counts=True, intersection_plot_elements=0, min_subset_size=1)
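+        # intersection_plot_elements=0 hides the default intersection-size bars so the stacked bars (coloured by category) become the main panel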
+ upset.add_stacked_bars(by="category", colors=cm.Pastel1, title="category", elements=10)
+ upset.plot(fig=fig)
+
+        # Hide the placeholder axes' frame (all four spines); ticks and labels are removed below
+ ax.spines[['top','right','left','bottom']].set_visible(False)
+ ax.tick_params(left=False)
+ ax.tick_params(bottom=False)
+ ax.tick_params(labelleft=False)
+ ax.tick_params(labelbottom=False)
+
+        # Reserve space on the right of the figure for the legend
+        plt.tight_layout(rect=[0.1, 0.1, 0.75, 1])
+ plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
+
+ output_file = f"{output_file_prefix}_{column_name}.png"
+ # Overwrite the existing file if it exists
+ plt.savefig(output_file)
+ plt.close(fig)
+
+if __name__ == '__main__':
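+    # Example invocation (file names are illustrative):
+    #   Build_Upset.py -s samplesheet.csv -f feature_profile.tsv -c group_one,group_two -o upset_plot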
+    parser = argparse.ArgumentParser(description='Create UpSet plots from samplesheet and feature profile files.')
+    parser.add_argument('-s', '--samplesheet_file', type=str, required=True, help='Path to the samplesheet file in CSV format')
+    parser.add_argument('-f', '--feature_profile_file', type=str, required=True, help='Path to the feature profile file in TSV format')
+    parser.add_argument('-c', '--columns', type=str, required=True, help='Comma-separated list of samplesheet column names to plot')
+    parser.add_argument('-o', '--output_file_prefix', type=str, required=True, help='Prefix for the output UpSet plot files (e.g., upset_plot)')
+ args = parser.parse_args()
+
+ column_names = args.columns.split(',')
+
+ create_upset_plot(args.samplesheet_file, args.feature_profile_file, column_names, args.output_file_prefix)
diff --git a/conf/modules.config b/conf/modules.config
index cdadf67..0b7741d 100755
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -466,6 +466,13 @@ process {
]
}
+ withName: BUILD_UPSET {
+ publishDir = [
+ path: { "${params.outdir}/annotation/upsets" },
+ mode: params.publish_dir_mode,
+ ]
+ }
+
// Recombination
withName: VERTICALL_PAIRWISE {
ext.prefix = { "cluster_${cluster}" }
diff --git a/conf/test_full.config b/conf/test_full.config
index bbaf9df..2e5301c 100644
--- a/conf/test_full.config
+++ b/conf/test_full.config
@@ -20,6 +20,7 @@ params {
use_prokka = true
use_ppanggolin = true
skip_poppunk = true
+ upset_plot_columns = "group_one,group_two"
}
process {
diff --git a/docs/params.md b/docs/params.md
index 928da88..a3a8bdb 100644
--- a/docs/params.md
+++ b/docs/params.md
@@ -48,6 +48,7 @@ Parameters for the annotation subworkflow
| `min_qcover` | Minimum coverage of each match for filtering | `number` | 0.6 |
| `skip_profile_creation` | Skip annotation feature profile creation | `boolean` | |
| `feature_profile_columns` | Columns to include in the feature profile | `string` | mobsuite,rgi,cazy,vfdb,iceberg,bacmet |
+| `upset_plot_columns` | Columns to use for making UpSet plots of genome features | `string` | |
## Phylogenomics
@@ -71,8 +72,8 @@ Parameters for the lineage subworkflow
| `poppunk_model` | Which PopPunk model to use (bgmm, dbscan, refine, threshold or lineage) | `string` | |
| `run_poppunk_qc` | Whether to run the QC step for PopPunk | `boolean` | |
| `enable_subsetting` | Enable subsetting workflow based on genome similarity | `boolean` | |
-| `core_similarity` | Similarity threshold for core genomes | `number` | 99.99 |
-| `accessory_similarity` | Similarity threshold for accessory genes | `number` | 99 |
+| `core_similarity` | Similarity threshold for core genomes | `number` | 99.9 |
+| `accessory_similarity` | Similarity threshold for accessory genes | `number` | 99.0 |
## Gene Order
@@ -140,9 +141,9 @@ Set the top limit for requested resources for any single job.
| Parameter | Description | Type | Default |
|-----------|-----------|-----------|-----------|
-| `max_cpus` | Maximum number of CPUs that can be requested for any single job. Help Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1` | `integer` | 16 |
-| `max_memory` | Maximum amount of memory that can be requested for any single job. Help Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'` | `string` | 128.GB |
-| `max_time` | Maximum amount of time that can be requested for any single job. Help Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'` | `string` | 240.h |
+| `max_cpus` | Maximum number of CPUs that can be requested for any single job. Help Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1` | `integer` | 72 |
+| `max_memory` | Maximum amount of memory that can be requested for any single job. Help Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'` | `string` | 125.GB |
+| `max_time` | Maximum amount of time that can be requested for any single job. Help Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'` | `string` | 168.h |
## Generic options
diff --git a/modules/local/build_upset/main.nf b/modules/local/build_upset/main.nf
new file mode 100644
index 0000000..8f923ae
--- /dev/null
+++ b/modules/local/build_upset/main.nf
@@ -0,0 +1,33 @@
+process BUILD_UPSET {
+ label 'process_single'
+
+ conda "conda-forge::upsetplot=0.9.0"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'docker://docker.io/jvfe/arete_upset:v0.1.0':
+ 'docker.io/jvfe/arete_upset:v0.1.0' }"
+
+ input:
+ path feature_profile
+ path samplesheet
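+    // Comma-separated samplesheet column names, forwarded to Build_Upset.py via --columns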
+ val samplesheet_columns
+
+ output:
+ path "BuildUpset*png", emit: pngs
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ """
+ Build_Upset.py \\
+ --output_file_prefix BuildUpset \\
+ --feature_profile_file $feature_profile \\
+ --samplesheet_file $samplesheet \\
+ --columns $samplesheet_columns
+ """
+ stub:
+ """
+ touch BuildUpset_dummy.png
+ """
+}
diff --git a/nextflow.config b/nextflow.config
index fcabd78..821661c 100755
--- a/nextflow.config
+++ b/nextflow.config
@@ -37,6 +37,7 @@ params {
annotation_tools = 'mobsuite,rgi,cazy,vfdb,iceberg,bacmet,islandpath,phispy,report'
feature_profile_columns = 'mobsuite,rgi,cazy,vfdb,iceberg,bacmet'
feature_dispersion_columns = null
+ upset_plot_columns = null
min_pident = 60
min_qcover = 0.6
skip_profile_creation = false
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 73d9df3..41177ce 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -148,6 +148,11 @@
"fa_icon": "fas fa-columns",
"description": "Columns to include in the feature profile",
"pattern": "^((rgi|mobsuite|vfdb|cazy|bacmet|iceberg)?,?)*(?