diff --git a/bin/Feature_Dispersion.py b/bin/Feature_Dispersion.py
new file mode 100755
index 0000000..422f42d
--- /dev/null
+++ b/bin/Feature_Dispersion.py
@@ -0,0 +1,221 @@
+#!/usr/bin/env python
+
+import argparse
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+from ete3 import Tree
+
+def round_to_sig_figs(value, sig_figs):
+    if value == 0:
+        return 0
+    return round(value, sig_figs - int(np.floor(np.log10(abs(value)))) - 1)
+
+def calculate_phylogenetic_diversity(tree):
+    sum_branch_lengths = 0.0
+    for node in tree.traverse():
+        sum_branch_lengths += node.dist
+    return sum_branch_lengths
+
+def calculate_feature_counts(feature_file):
+    feature_df = pd.read_csv(feature_file, sep='\t', index_col=0)
+    feature_df.reset_index(inplace=True)
+    feature_df.rename(columns={'index': 'genome_id'}, inplace=True)
+
+    # Initialize lists to store data for the new dataframe
+    features = []
+    total_counts = []
+    genomes_list = []
+
+    # Iterate over each feature column
+    for feature_col in feature_df.columns[1:]:  # Exclude 'genome_id' column
+        # Collect the genomes where this feature is present into a comma-separated list
+        genomes_with_feature = feature_df[feature_df[feature_col] == 1]['genome_id'].tolist()
+        genomes_str = ','.join(genomes_with_feature)
+
+        # Calculate the total count of this feature across all genomes
+        total_count_feature = feature_df[feature_col].sum()
+
+        # Append data to lists
+        features.append(feature_col)
+        total_counts.append(total_count_feature)
+        genomes_list.append(genomes_str)
+
+    # Create a new dataframe from the collected data
+    features_df = pd.DataFrame({
+        'feature': features,
+        'total_count': total_counts,
+        'genomes_list': genomes_list
+    })
+
+    return features_df
+
+def verify_genome_ids(tree, feature_file, samplesheet_file=None):
+    feature_df = pd.read_csv(feature_file, sep='\t', index_col=0)
+    feature_genomes = set(feature_df.index)
+
+    if samplesheet_file:
+        samplesheet_df = pd.read_csv(samplesheet_file, usecols=[0], header=0, names=['genome_id'], skiprows=1)
+        samplesheet_genomes = set(samplesheet_df['genome_id'])
+    else:
+        samplesheet_genomes = set()
+
+    tree_genomes = set(tree.get_leaf_names())
+
+    missing_in_tree = feature_genomes.union(samplesheet_genomes) - tree_genomes
+
+    if missing_in_tree:
+        print(f"Error: The following genome IDs are missing in the phylogenetic tree: {', '.join(missing_in_tree)}")
+        exit(1)
+
+def generate_heatmap(output_df, output_heatmap):
+    bins = np.arange(0, 1.1, 0.1)
+    max_genome_count = output_df['Genome Count'].max()
+    genome_bins = np.linspace(0, max_genome_count, 11)  # Generate 11 edges to create 10 bins
+    genome_bins_labels = [f'{int(genome_bins[i]) + 1}-{int(genome_bins[i + 1])}' for i in range(len(genome_bins) - 1)]
+
+    heatmap_data = pd.DataFrame(0, index=genome_bins_labels, columns=bins)
+
+    for index, row in output_df.iterrows():
+        pd_ratio = row['PD Ratio']
+        genome_count = row['Genome Count']
+        bin_idx = np.digitize(pd_ratio, bins) - 1
+        genome_bin_idx = np.digitize(genome_count, genome_bins) - 1
+
+        # Ensure the indices are within the valid range
+        bin_idx = min(bin_idx, len(bins) - 1)
+        genome_bin_idx = min(genome_bin_idx, len(genome_bins_labels) - 1)
+
+        heatmap_data.iloc[genome_bin_idx, bin_idx] += 1
+
+    plt.figure(figsize=(12, 8))
+    sns.heatmap(np.log1p(heatmap_data), cmap="Reds", cbar_kws={'label': 'Number of Features (log scale)'}, annot=heatmap_data, fmt='g', linewidths=.5)
+    plt.xlabel('PD Ratio Bins')
+    plt.ylabel('Genome Count Bins')
+    plt.title('Heatmap of Features by PD Ratio and Genome Count')
+    plt.xticks(ticks=np.arange(0.5, len(bins), 1), labels=np.round(bins, 1))
+    plt.yticks(ticks=np.arange(0.5, len(genome_bins_labels), 1), labels=genome_bins_labels)
+    plt.gca().invert_yaxis()
+    plt.savefig(output_heatmap)
+    plt.close()
+
+
+
+
+
+def main(tree_file, feature_file, output_base, samplesheet_file=None, samplesheet_columns=None):
+    ref_tree = Tree(tree_file)
+
+    # Verify genome IDs
+    verify_genome_ids(ref_tree, feature_file, samplesheet_file)
+
+    # Calculate phylogenetic diversity
+    total_diversity = calculate_phylogenetic_diversity(ref_tree)
+
+    # Calculate feature counts
+    feature_distr = calculate_feature_counts(feature_file)
+
+    # Read samplesheet if provided
+    if samplesheet_file:
+        samplesheet_df = pd.read_csv(samplesheet_file, header=0)
+        available_columns = set(samplesheet_df.columns)
+        if 'genome_id' not in available_columns:
+            print("Error: 'genome_id' column is missing in the samplesheet.")
+            exit(1)
+
+        samplesheet_data = {}
+        for column in samplesheet_columns:
+            if column in available_columns:
+                samplesheet_data[column] = samplesheet_df[['genome_id', column]].set_index('genome_id')[column].to_dict()
+            else:
+                print(f"Warning: Column '{column}' not found in the samplesheet. Skipping.")
+    else:
+        samplesheet_data = {}
+
+    # Create an empty DataFrame to store the output
+    output_columns = [
+        'Feature Name',
+        'Total PD', 'Projected PD', 'PD Ratio',
+        'Genome Count','PD Ratio / Genome Count'
+    ]
+
+    # Add columns for each requested samplesheet column
+    for column in samplesheet_columns:
+        output_columns.append(f'{column} Distinct Values')
+        output_columns.append(f'PD Ratio / {column} Values')
+
+    output_df = pd.DataFrame(columns=output_columns)
+
+    # Iterate over each feature in feature_distr
+    for index, row in feature_distr.iterrows():
+        feature_name = row['feature']
+        genomes_list = row['genomes_list'].split(',')
+        genome_count = row['total_count']
+
+        # Only proceed if the feature is present in more than one genome
+        if len(genomes_list) > 1:
+            # Generate a list of genomes to keep (those that have the feature)
+            genomes_to_keep = [genome for genome in genomes_list if genome in ref_tree]
+
+            # Create a copy of the original tree with only the relevant genomes
+            projected_tree = ref_tree.copy()
+            projected_tree.prune(genomes_to_keep)
+
+            # Calculate phylogenetic diversity of the projected tree
+            projected_diversity = calculate_phylogenetic_diversity(projected_tree)
+
+            # Calculate the ratio of projected diversity to total diversity
+            ratio_diversity = projected_diversity / total_diversity
+
+            # Calculate the ratio of projected phylogenetic diversity to total count of genomes
+            genome_ratio_phylogenetic_diversity = ratio_diversity / genome_count
+
+            # Prepare the row for output
+            output_row = [
+                feature_name,
+                round_to_sig_figs(total_diversity, 4),
+                round_to_sig_figs(projected_diversity, 4),
+                round_to_sig_figs(ratio_diversity, 4),
+                round_to_sig_figs(genome_count, 4),
+                round_to_sig_figs(genome_ratio_phylogenetic_diversity, 4)
+            ]
+
+            # Add values for each requested samplesheet column
+            for column in samplesheet_columns:
+                if column in samplesheet_data:
+                    # Identify distinct values for the column
+                    distinct_values = set(samplesheet_data[column].get(genome, None) for genome in genomes_list if genome in samplesheet_data[column])
+                    distinct_values.discard(None)
+                    V = len(distinct_values)
+                    PD_ratio_per_V = ratio_diversity / V if V > 0 else 0
+                    output_row.extend([V, round_to_sig_figs(PD_ratio_per_V, 4)])
+                else:
+                    output_row.extend([None, None])
+
+            # Add the row to the output DataFrame
+            output_df.loc[index] = output_row
+
+    output_df_sorted = output_df.sort_values(by='PD Ratio / Genome Count')
+
+    # Save the output dataframe to a TSV file
+    output_df_sorted.to_csv(f"{output_base}.tsv", sep='\t', index=False)
+
+    # Generate the heatmap
+    generate_heatmap(output_df_sorted, f"{output_base}.png")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Calculate feature statistics based on phylogenetic tree and genus information.')
+    parser.add_argument('--tree_file', type=str, required=True, help='Path to the Newick tree file')
+    parser.add_argument('--feature_file', type=str, required=True, help='Path to the feature presence/absence file')
+    parser.add_argument('--output_base', type=str, required=True, help='Base name for the output files (without extension)')
+    parser.add_argument('--samplesheet_file', type=str, help='Path to the file mapping genome IDs to other properties')
+    parser.add_argument('--samplesheet_columns', type=str, help='Comma-separated list of columns to process from the samplesheet')
+    args = parser.parse_args()
+
+    if args.samplesheet_columns:
+        samplesheet_columns = args.samplesheet_columns.split(',')
+    else:
+        samplesheet_columns = []
+
+    main(args.tree_file, args.feature_file, args.output_base, args.samplesheet_file, samplesheet_columns)
diff --git a/conf/modules.config b/conf/modules.config
index 870b5ae..cdadf67 100755
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -458,6 +458,14 @@ process {
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]
     }
+
+    withName: FEATURE_DISPERSION {
+        publishDir = [
+            path: { "${params.outdir}/phylogenomics/feature_dispersion" },
+            mode: params.publish_dir_mode,
+        ]
+    }
+
     // Recombination
     withName: VERTICALL_PAIRWISE {
         ext.prefix = { "cluster_${cluster}" }
diff --git a/docs/params.md b/docs/params.md
index 8460d97..928da88 100644
--- a/docs/params.md
+++ b/docs/params.md
@@ -6,155 +6,161 @@ AMR/VF LGT-focused bacterial genomics workflow
 
 Define where the pipeline should find input data and save output data.
 
-| Parameter            | Description                                                                                                                                                                                                                                                                                                                                                                                  | Type     | Default   | Required | Hidden |
-| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | --------- | -------- | ------ |
-| `input_sample_table` | Path to comma-separated file containing information about the samples in the experiment. <details><summary>Help</summary><small>You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row.</small></details> | `string` |           |          |        |
-| `outdir`             | Path to the output directory where the results will be saved.                                                                                                                                                                                                                                                                                                                                | `string` | ./results |          |        |
-| `db_cache`           | Directory where the databases are located                                                                                                                                                                                                                                                                                                                                                    | `string` |           |          |        |
-| `email`              | Email address for completion summary. <details><summary>Help</summary><small>Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.</small></details>                                  | `string` |           |          |        |
-| `multiqc_title`      | MultiQC report title. Printed as page header, used for filename if not otherwise specified.                                                                                                                                                                                                                                                                                                  | `string` |           |          |        |
+| Parameter | Description | Type | Default |
+|-----------|-----------|-----------|-----------|
+| `input_sample_table` | Path to comma-separated file containing information about the samples in the experiment. <details><summary>Help</summary><small>You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row.</small></details>| `string` |  |
+| `outdir` | Path to the output directory where the results will be saved. | `string` | ./results |
+| `db_cache` | Directory where the databases are located | `string` |  |
+| `email` | Email address for completion summary. <details><summary>Help</summary><small>Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.</small></details>| `string` |  |
+| `multiqc_title` | MultiQC report title. Printed as page header, used for filename if not otherwise specified. | `string` |  |
 
 ## Reference genome options
 
 Reference and outgroup genome fasta files required for the workflow.
 
-| Parameter          | Description                          | Type     | Default | Required | Hidden |
-| ------------------ | ------------------------------------ | -------- | ------- | -------- | ------ |
-| `reference_genome` | Path to FASTA reference genome file. | `string` |         |          |        |
+| Parameter | Description | Type | Default |
+|-----------|-----------|-----------|-----------|
+| `reference_genome` | Path to FASTA reference genome file. | `string` |  |
 
 ## QC
 
-| Parameter             | Description                                | Type      | Default | Required | Hidden |
-| --------------------- | ------------------------------------------ | --------- | ------- | -------- | ------ |
-| `run_checkm`          | Run CheckM QC software                     | `boolean` |         |          |        |
-| `apply_filtering`     | Filter assemblies on QC results            | `boolean` |         |          |        |
-| `skip_kraken`         | Don't run Kraken2 taxonomic classification | `boolean` |         |          |        |
-| `min_n50`             | Minimum N50 for filtering                  | `integer` | 10000   |          |        |
-| `min_contigs_1000_bp` | Minimum number of contigs with >1000bp     | `integer` | 1       |          |        |
-| `min_contig_length`   | Minimum average contig length              | `integer` | 1       |          |        |
+
+
+| Parameter | Description | Type | Default |
+|-----------|-----------|-----------|-----------|
+| `run_checkm` | Run CheckM QC software | `boolean` |  |
+| `apply_filtering` | Filter assemblies on QC results | `boolean` |  |
+| `skip_kraken` | Don't run Kraken2 taxonomic classification | `boolean` |  |
+| `min_n50` | Minimum N50 for filtering | `integer` | 10000 |
+| `min_contigs_1000_bp` | Minimum number of contigs with >1000bp | `integer` | 1 |
+| `min_contig_length` | Minimum average contig length | `integer` | 1 |
 
 ## Annotation
 
 Parameters for the annotation subworkflow
 
-| Parameter                 | Description                                      | Type      | Default                                                        | Required | Hidden |
-| ------------------------- | ------------------------------------------------ | --------- | -------------------------------------------------------------- | -------- | ------ |
-| `annotation_tools`        | Comma-separated list of annotation tools to run  | `string`  | mobsuite,rgi,cazy,vfdb,iceberg,bacmet,islandpath,phispy,report |          |        |
-| `bakta_db`                | Path to the BAKTA database                       | `string`  |                                                                |          |        |
-| `use_prokka`              | Use Prokka (not Bakta) for annotating assemblies | `boolean` |                                                                |          |        |
-| `min_pident`              | Minimum match identity percentage for filtering  | `integer` | 60                                                             |          |        |
-| `min_qcover`              | Minimum coverage of each match for filtering     | `number`  | 0.6                                                            |          |        |
-| `skip_profile_creation`   | Skip annotation feature profile creation         | `boolean` |                                                                |          |        |
-| `feature_profile_columns` | Columns to include in the feature profile        | `string`  | mobsuite,rgi,cazy,vfdb,iceberg,bacmet                          |          |        |
+| Parameter | Description | Type | Default |
+|-----------|-----------|-----------|-----------|
+| `annotation_tools` | Comma-separated list of annotation tools to run | `string` | mobsuite,rgi,cazy,vfdb,iceberg,bacmet,islandpath,phispy,report |
+| `bakta_db` | Path to the BAKTA database | `string` |  |
+| `use_prokka` | Use Prokka (not Bakta) for annotating assemblies | `boolean` |  |
+| `min_pident` | Minimum match identity percentage for filtering | `integer` | 60 |
+| `min_qcover` | Minimum coverage of each match for filtering | `number` | 0.6 |
+| `skip_profile_creation` | Skip annotation feature profile creation | `boolean` |  |
+| `feature_profile_columns` | Columns to include in the feature profile | `string` | mobsuite,rgi,cazy,vfdb,iceberg,bacmet |
 
 ## Phylogenomics
 
 Parameters for the phylogenomics subworkflow
 
-| Parameter            | Description                                    | Type      | Default | Required | Hidden |
-| -------------------- | ---------------------------------------------- | --------- | ------- | -------- | ------ |
-| `skip_phylo`         | Skip Pangenomics and Phylogenomics subworkflow | `boolean` |         |          |        |
-| `use_ppanggolin`     | Use ppanggolin for calculating the pangenome   | `boolean` |         |          |        |
-| `use_full_alignment` | Use full alignment                             | `boolean` |         |          |        |
-| `use_fasttree`       | Use FastTree                                   | `boolean` | True    |          |        |
+| Parameter | Description | Type | Default |
+|-----------|-----------|-----------|-----------|
+| `skip_phylo` | Skip Pangenomics and Phylogenomics subworkflow | `boolean` |  |
+| `use_ppanggolin` | Use ppanggolin for calculating the pangenome | `boolean` |  |
+| `use_full_alignment` | Use full alignment | `boolean` |  |
+| `use_fasttree` | Use FastTree | `boolean` | True |
+| `feature_dispersion_columns` | Comma-separated list of columns from the input samplesheet to use in the feature dispersion module | `string` |  |
 
 ## PopPUNK
 
 Parameters for the lineage subworkflow
 
-| Parameter              | Description                                                             | Type      | Default | Required | Hidden |
-| ---------------------- | ----------------------------------------------------------------------- | --------- | ------- | -------- | ------ |
-| `skip_poppunk`         | Skip PopPunk                                                            | `boolean` |         |          |        |
-| `poppunk_model`        | Which PopPunk model to use (bgmm, dbscan, refine, threshold or lineage) | `string`  |         |          |        |
-| `run_poppunk_qc`       | Whether to run the QC step for PopPunk                                  | `boolean` |         |          |        |
-| `enable_subsetting`    | Enable subsetting workflow based on genome similarity                   | `boolean` |         |          |        |
-| `core_similarity`      | Similarity threshold for core genomes                                   | `number`  | 99.99   |          |        |
-| `accessory_similarity` | Similarity threshold for accessory genes                                | `number`  | 99.0    |          |        |
+| Parameter | Description | Type | Default |
+|-----------|-----------|-----------|-----------|
+| `skip_poppunk` | Skip PopPunk | `boolean` |  |
+| `poppunk_model` | Which PopPunk model to use (bgmm, dbscan, refine, threshold or lineage) | `string` |  |
+| `run_poppunk_qc` | Whether to run the QC step for PopPunk | `boolean` |  |
+| `enable_subsetting` | Enable subsetting workflow based on genome similarity | `boolean` |  |
+| `core_similarity` | Similarity threshold for core genomes | `number` | 99.99 |
+| `accessory_similarity` | Similarity threshold for accessory genes | `number` | 99 |
 
 ## Gene Order
 
 Parameters for the Gene Order Subworkflow
 
-| Parameter                   | Description                                                                                                                                                                                                                      | Type      | Default | Required | Hidden |
-| --------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- | ------- | -------- | ------ |
-| `run_gene_order`            | Whether to run the Gene Order subworkflow                                                                                                                                                                                        | `boolean` |         |          |        |
-| `gene_order_percent_cutoff` | Cutoff percentage of genomes a gene should be present within to be included in extraction and subsequent analysis. Should a float between 0 and 1 (e.g., 0.25 means only genes present in a minimum of 25% of genomes are kept). | `number`  | 0.25    |          |        |
-| `gene_order_label_cols`     | If using annotation files predicting features, list of space separated column names to be added to the gene names                                                                                                                | `string`  | None    |          |        |
-| `num_neighbors`             | Neighborhood size to extract. Should be an even number N, such that N/2 neighbors upstream and N/2 neighbors downstream will be analyzed.                                                                                        | `integer` | 10      |          |        |
-| `inflation`                 | Inflation hyperparameter value for Markov Clustering Algorithm.                                                                                                                                                                  | `integer` | 2       |          |        |
-| `epsilon`                   | Epsilon hyperparameter value for DBSCAN clustering.                                                                                                                                                                              | `number`  | 0.5     |          |        |
-| `minpts`                    | Minpts hyperparameter value for DBSCAN clustering.                                                                                                                                                                               | `integer` | 5       |          |        |
-| `plot_clustering`           | Create Clustering HTML Plots                                                                                                                                                                                                     | `boolean` |         |          |        |
+| Parameter | Description | Type | Default |
+|-----------|-----------|-----------|-----------|
+| `run_gene_order` | Whether to run the Gene Order subworkflow | `boolean` |  |
+| `input_file_path` |  | `string` | /home/jvfe/dev/dalhousie/arete/test/gene-order/rgi_input.txt |
+| `gene_order_percent_cutoff` | Cutoff percentage of genomes a gene should be present within to be included in extraction and subsequent analysis. Should be a float between 0 and 1 (e.g., 0.25 means only genes present in a minimum of 25% of genomes are kept). | `number` | 0.25 |
+| `gene_order_label_cols` | If using annotation files predicting features, list of space separated column names to be added to the gene names | `string` | None |
+| `num_neighbors` | Neighborhood size to extract. Should be an even number N, such that N/2 neighbors upstream and N/2 neighbors downstream will be analyzed. | `integer` | 10 |
+| `inflation` | Inflation hyperparameter value for Markov Clustering Algorithm. | `integer` | 2 |
+| `epsilon` | Epsilon hyperparameter value for DBSCAN clustering. | `number` | 0.5 |
+| `minpts` | Minpts hyperparameter value for DBSCAN clustering. | `integer` | 5 |
+| `plot_clustering` | Create Clustering HTML Plots | `boolean` |  |
 
 ## Recombination
 
 Parameters for the recombination subworkflow
 
-| Parameter           | Description                      | Type      | Default | Required | Hidden |
-| ------------------- | -------------------------------- | --------- | ------- | -------- | ------ |
-| `run_recombination` | Run Recombination                | `boolean` |         |          |        |
-| `run_verticall`     | Run Verticall recombination tool | `boolean` | True    |          |        |
-| `run_gubbins`       | Run Gubbins recombination tool   | `boolean` |         |          |        |
+| Parameter | Description | Type | Default |
+|-----------|-----------|-----------|-----------|
+| `run_recombination` | Run Recombination | `boolean` |  |
+| `run_verticall` | Run Verticall recombination tool | `boolean` | True |
+| `run_gubbins` | Run Gubbins recombination tool | `boolean` |  |
 
 ## Dynamics
 
-| Parameter                 | Description                                                                                       | Type      | Default | Required | Hidden |
-| ------------------------- | ------------------------------------------------------------------------------------------------- | --------- | ------- | -------- | ------ |
-| `run_evolccm`             | Run the community coevolution model                                                               | `boolean` |         |          |        |
-| `run_rspr`                | Run rSPR                                                                                          | `boolean` |         |          |        |
-| `min_rspr_distance`       | Minimum rSPR distance used to define processing groups                                            | `integer` | 10      |          |        |
-| `min_branch_length`       | Minimum rSPR branch length                                                                        | `integer` | 0       |          |        |
-| `max_support_threshold`   | Maximum rSPR support threshold                                                                    | `number`  | 0.7     |          |        |
-| `max_approx_rspr`         | Maximum approximate rSPR distance for filtering                                                   | `integer` | -1      |          |        |
-| `min_heatmap_approx_rspr` | Minimum approximate rSPR distance used to generate heatmap                                        | `integer` | 0       |          |        |
-| `max_heatmap_approx_rspr` | Maximum approximate rSPR distance used to generate heatmap                                        | `integer` | -1      |          |        |
-| `min_heatmap_exact_rspr`  | Minimum exact rSPR distance used to generate heatmap                                              | `integer` | 0       |          |        |
-| `max_heatmap_exact_rspr`  | Maximum exact rSPR distance used to generate heatmap                                              | `integer` | -1      |          |        |
-| `core_gene_tree`          | Core (or reference) genome tree. Used in the rSPR and evolCCM entries.                            | `string`  |         |          |        |
-| `concatenated_annotation` | TSV table of annotations for all genomes. Such as the ones generated by Bakta or Prokka in ARETE. | `string`  |         |          |        |
-| `feature_profile`         | Feature profile TSV (A presence-absence matrix). Used in the evolCCM entry.                       | `string`  |         |          |        |
+
+
+| Parameter | Description | Type | Default |
+|-----------|-----------|-----------|-----------|
+| `run_evolccm` | Run the community coevolution model | `boolean` |  |
+| `run_rspr` | Run rSPR | `boolean` |  |
+| `min_rspr_distance` | Minimum rSPR distance used to define processing groups | `integer` | 10 |
+| `min_branch_length` | Minimum rSPR branch length | `integer` | 0 |
+| `max_support_threshold` | Maximum rSPR support threshold | `number` | 0.7 |
+| `max_approx_rspr` | Maximum approximate rSPR distance for filtering | `integer` | -1 |
+| `min_heatmap_approx_rspr` | Minimum approximate rSPR distance used to generate heatmap | `integer` | 0 |
+| `max_heatmap_approx_rspr` | Maximum approximate rSPR distance used to generate heatmap | `integer` | -1 |
+| `min_heatmap_exact_rspr` | Minimum exact rSPR distance used to generate heatmap | `integer` | 0 |
+| `max_heatmap_exact_rspr` | Maximum exact rSPR distance used to generate heatmap | `integer` | -1 |
+| `core_gene_tree` | Core (or reference) genome tree. Used in the rSPR and evolCCM entries. | `string` |  |
+| `concatenated_annotation` | TSV table of annotations for all genomes. Such as the ones generated by Bakta or Prokka in ARETE. | `string` |  |
+| `feature_profile` | Feature profile TSV (A presence-absence matrix). Used in the evolCCM entry. | `string` |  |
 
 ## Institutional config options
 
 Parameters used to describe centralised config profiles. These should not be edited.
 
-| Parameter                    | Description                                                                                                                                                                                                                                                                                                                                                                                       | Type     | Default                                                  | Required | Hidden |
-| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | -------------------------------------------------------- | -------- | ------ |
-| `custom_config_version`      | Git commit id for Institutional configs.                                                                                                                                                                                                                                                                                                                                                          | `string` | master                                                   |          | True   |
-| `custom_config_base`         | Base directory for Institutional configs. <details><summary>Help</summary><small>If you're running offline, Nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell Nextflow where to find them with this parameter.</small></details> | `string` | https://raw.githubusercontent.com/nf-core/configs/master |          | True   |
-| `hostnames`                  | Institutional configs hostname.                                                                                                                                                                                                                                                                                                                                                                   | `string` |                                                          |          | True   |
-| `config_profile_name`        | Institutional config name.                                                                                                                                                                                                                                                                                                                                                                        | `string` |                                                          |          | True   |
-| `config_profile_description` | Institutional config description.                                                                                                                                                                                                                                                                                                                                                                 | `string` |                                                          |          | True   |
-| `config_profile_contact`     | Institutional config contact information.                                                                                                                                                                                                                                                                                                                                                         | `string` |                                                          |          | True   |
-| `config_profile_url`         | Institutional config URL link.                                                                                                                                                                                                                                                                                                                                                                    | `string` |                                                          |          | True   |
+| Parameter | Description | Type | Default |
+|-----------|-----------|-----------|-----------|
+| `custom_config_version` | Git commit id for Institutional configs. | `string` | master |
+| `custom_config_base` | Base directory for Institutional configs. <details><summary>Help</summary><small>If you're running offline, Nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell Nextflow where to find them with this parameter.</small></details>| `string` | https://raw.githubusercontent.com/nf-core/configs/master |
+| `hostnames` | Institutional configs hostname. | `string` |  |
+| `config_profile_name` | Institutional config name. | `string` |  |
+| `config_profile_description` | Institutional config description. | `string` |  |
+| `config_profile_contact` | Institutional config contact information. | `string` |  |
+| `config_profile_url` | Institutional config URL link. | `string` |  |
 
 ## Max job request options
 
 Set the top limit for requested resources for any single job.
 
-| Parameter    | Description                                                                                                                                                                                                                                                                 | Type      | Default | Required | Hidden |
-| ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- | ------- | -------- | ------ |
-| `max_cpus`   | Maximum number of CPUs that can be requested for any single job. <details><summary>Help</summary><small>Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`</small></details>                                      | `integer` | 16      |          | True   |
-| `max_memory` | Maximum amount of memory that can be requested for any single job. <details><summary>Help</summary><small>Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`</small></details> | `string`  | 128.GB  |          | True   |
-| `max_time`   | Maximum amount of time that can be requested for any single job. <details><summary>Help</summary><small>Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`</small></details>        | `string`  | 240.h   |          | True   |
+| Parameter | Description | Type | Default |
+|-----------|-----------|-----------|-----------|
+| `max_cpus` | Maximum number of CPUs that can be requested for any single job. <details><summary>Help</summary><small>Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`</small></details>| `integer` | 16 |
+| `max_memory` | Maximum amount of memory that can be requested for any single job. <details><summary>Help</summary><small>Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`</small></details>| `string` | 128.GB |
+| `max_time` | Maximum amount of time that can be requested for any single job. <details><summary>Help</summary><small>Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`</small></details>| `string` | 240.h |
 
 ## Generic options
 
 Less common options for the pipeline, typically set in a config file.
 
-| Parameter                           | Description                                                                                                                                                                                                                                                                                                                                                                                                  | Type      | Default                        | Required | Hidden |
-| ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------- | ------------------------------ | -------- | ------ |
-| `help`                              | Display help text.                                                                                                                                                                                                                                                                                                                                                                                           | `boolean` |                                |          | True   |
-| `publish_dir_mode`                  | Method used to save pipeline results to output directory. <details><summary>Help</summary><small>The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.</small></details> | `string`  | copy                           |          | True   |
-| `email_on_fail`                     | Email address for completion summary, only when pipeline fails. <details><summary>Help</summary><small>An email address to send a summary email to when the pipeline is completed - ONLY sent if the pipeline does not exit successfully.</small></details>                                                                                                                                                  | `string`  |                                |          | True   |
-| `plaintext_email`                   | Send plain-text email instead of HTML.                                                                                                                                                                                                                                                                                                                                                                       | `boolean` |                                |          | True   |
-| `max_multiqc_email_size`            | File size limit when attaching MultiQC reports to summary emails.                                                                                                                                                                                                                                                                                                                                            | `string`  | 25.MB                          |          | True   |
-| `monochrome_logs`                   | Do not use coloured log outputs.                                                                                                                                                                                                                                                                                                                                                                             | `boolean` |                                |          | True   |
-| `multiqc_config`                    | Custom config file to supply to MultiQC.                                                                                                                                                                                                                                                                                                                                                                     | `string`  |                                |          | True   |
-| `tracedir`                          | Directory to keep pipeline Nextflow logs and reports.                                                                                                                                                                                                                                                                                                                                                        | `string`  | ${params.outdir}/pipeline_info |          | True   |
-| `validate_params`                   | Boolean whether to validate parameters against the schema at runtime                                                                                                                                                                                                                                                                                                                                         | `boolean` | True                           |          | True   |
-| `show_hidden_params`                | Show all params when using `--help` <details><summary>Help</summary><small>By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters.</small></details>                                                                                                                    | `boolean` |                                |          | True   |
-| `enable_conda`                      | Run this workflow with Conda. You can also use '-profile conda' instead of providing this parameter.                                                                                                                                                                                                                                                                                                         | `boolean` |                                |          | True   |
-| `singularity_pull_docker_container` | Instead of directly downloading Singularity images for use with Singularity, force the workflow to pull and convert Docker containers instead. <details><summary>Help</summary><small>This may be useful for example if you are unable to directly pull Singularity containers to run the pipeline due to http/https proxy issues.</small></details>                                                         | `boolean` |                                |          | True   |
-| `schema_ignore_params`              |                                                                                                                                                                                                                                                                                                                                                                                                              | `string`  | genomes,modules                |          |        |
-| `multiqc_logo`                      |                                                                                                                                                                                                                                                                                                                                                                                                              | `string`  |                                |          | True   |
+| Parameter | Description | Type | Default |
+|-----------|-----------|-----------|-----------|
+| `help` | Display help text. | `boolean` |  |
+| `publish_dir_mode` | Method used to save pipeline results to output directory. <details><summary>Help</summary><small>The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.</small></details>| `string` | copy |
+| `email_on_fail` | Email address for completion summary, only when pipeline fails. <details><summary>Help</summary><small>An email address to send a summary email to when the pipeline is completed - ONLY sent if the pipeline does not exit successfully.</small></details>| `string` |  |
+| `plaintext_email` | Send plain-text email instead of HTML. | `boolean` |  |
+| `max_multiqc_email_size` | File size limit when attaching MultiQC reports to summary emails. | `string` | 25.MB |
+| `monochrome_logs` | Do not use coloured log outputs. | `boolean` |  |
+| `multiqc_config` | Custom config file to supply to MultiQC. | `string` |  |
+| `tracedir` | Directory to keep pipeline Nextflow logs and reports. | `string` | ${params.outdir}/pipeline_info |
+| `validate_params` | Whether to validate parameters against the schema at runtime. | `boolean` | True |
+| `show_hidden_params` | Show all params when using `--help` <details><summary>Help</summary><small>By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters.</small></details>| `boolean` |  |
+| `enable_conda` | Run this workflow with Conda. You can also use '-profile conda' instead of providing this parameter. | `boolean` |  |
+| `singularity_pull_docker_container` | Instead of directly downloading Singularity images for use with Singularity, force the workflow to pull and convert Docker containers instead. <details><summary>Help</summary><small>This may be useful for example if you are unable to directly pull Singularity containers to run the pipeline due to http/https proxy issues.</small></details>| `boolean` |  |
+| `schema_ignore_params` |  | `string` | genomes,modules |
+| `multiqc_logo` |  | `string` |  |
diff --git a/modules/local/featuredispersion/main.nf b/modules/local/featuredispersion/main.nf
new file mode 100644
index 0000000..f19dbe5
--- /dev/null
+++ b/modules/local/featuredispersion/main.nf
@@ -0,0 +1,47 @@
+// Runs bin/Feature_Dispersion.py on a tree and a feature profile, optionally
+// together with the samplesheet and a selection of its columns.
+process FEATURE_DISPERSION {
+    label 'process_single'
+
+    // Feature_Dispersion.py imports pandas, numpy, matplotlib and seaborn in
+    // addition to ete3, so the conda environment has to provide all of them.
+    // NOTE(review): the extra packages are left unpinned so the solver can pick
+    // builds compatible with ete3=3.1.1 -- pin them once versions are agreed.
+    conda "bioconda::ete3=3.1.1 conda-forge::pandas conda-forge::numpy conda-forge::matplotlib conda-forge::seaborn"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'docker://docker.io/jvfe/rspr:v1.3.7':
+        'docker.io/jvfe/rspr:v1.3.7' }"
+
+    input:
+    path core_tree           // tree passed to --tree_file
+    path feature_profile     // feature table passed to --feature_file
+    path samplesheet         // optional: pass [] to omit --samplesheet_file
+    val samplesheet_columns  // optional: pass [] to omit --samplesheet_columns
+
+    output:
+    path "FeatureDispersion.tsv", emit: tsv
+    path "FeatureDispersion.png", emit: png
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // Empty/falsy optional inputs expand to nothing, so the flag is dropped.
+    def sheet = samplesheet ? "--samplesheet_file $samplesheet": ''
+    def columns = samplesheet_columns ? "--samplesheet_columns $samplesheet_columns": ''
+    """
+    Feature_Dispersion.py \\
+        --output_base FeatureDispersion \\
+        --tree_file $core_tree \\
+        --feature_file $feature_profile \\
+        $sheet \\
+        $columns
+    """
+
+    stub:
+    // Creates empty placeholders for both declared outputs.
+    """
+    touch FeatureDispersion.tsv
+    touch FeatureDispersion.png
+    """
+}
diff --git a/nextflow.config b/nextflow.config
index a20036b..fcabd78 100755
--- a/nextflow.config
+++ b/nextflow.config
@@ -36,6 +36,7 @@ params {
     bakta_db                   = null
     annotation_tools           = 'mobsuite,rgi,cazy,vfdb,iceberg,bacmet,islandpath,phispy,report'
     feature_profile_columns    = 'mobsuite,rgi,cazy,vfdb,iceberg,bacmet'
+    feature_dispersion_columns = null
     min_pident                 = 60
     min_qcover                 = 0.6
     skip_profile_creation      = false
diff --git a/nextflow_schema.json b/nextflow_schema.json
index fae9918..73d9df3 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -10,7 +10,6 @@
             "type": "object",
             "fa_icon": "fas fa-terminal",
             "description": "Define where the pipeline should find input data and save output data.",
-            "required": [],
             "properties": {
                 "input_sample_table": {
                     "type": "string",
@@ -179,6 +178,11 @@
                     "description": "Use FastTree",
                     "fa_icon": "fas fa-tree",
                     "default": true
+                },
+                "feature_dispersion_columns": {
+                    "type": "string",
+                    "fa_icon": "fas fa-columns",
+                    "description": "Columns from the input samplesheet to use in the feature dispersion module"
                 }
             }
         },
@@ -217,7 +221,7 @@
                 },
                 "accessory_similarity": {
                     "type": "number",
-                    "default": 99.0,
+                    "default": 99,
                     "fa_icon": "far fa-clone",
                     "description": "Similarity threshold for accessory genes"
                 }
@@ -475,7 +479,14 @@
                     "description": "Method used to save pipeline results to output directory.",
                     "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.",
                     "fa_icon": "fas fa-copy",
-                    "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"],
+                    "enum": [
+                        "symlink",
+                        "rellink",
+                        "link",
+                        "copy",
+                        "copyNoFollow",
+                        "move"
+                    ],
                     "hidden": true
                 },
                 "email_on_fail": {
@@ -596,4 +607,4 @@
             "$ref": "#/definitions/generic_options"
         }
     ]
-}
+}
\ No newline at end of file
diff --git a/subworkflows/local/annotation.nf b/subworkflows/local/annotation.nf
index 60777ef..c0d9d69 100755
--- a/subworkflows/local/annotation.nf
+++ b/subworkflows/local/annotation.nf
@@ -326,7 +326,7 @@ workflow ANNOTATE_ASSEMBLIES {
             ch_multiqc_files = ch_multiqc_files.mix(DIAMOND_BLAST_ICEBERG.out.log.collect{it[1]}.ifEmpty([]))
         }
 
-        profile = []
+        profile = Channel.empty()
         if (tools_to_run.contains('report')) {
             needed_for_report = ['vfdb', 'rgi', 'mobsuite']
             if (!params.use_prokka && needed_for_report.every { it in tools_to_run }) {
@@ -353,7 +353,7 @@ workflow ANNOTATE_ASSEMBLIES {
                 )
             }
 
-            profile = (params.skip_profile_creation) ? [] : CREATE_REPORT.out.profile
+            profile = (params.skip_profile_creation) ? profile : CREATE_REPORT.out.profile
         }
 
     emit:
diff --git a/subworkflows/local/phylo.nf b/subworkflows/local/phylo.nf
index 06caa9c..bc80713 100755
--- a/subworkflows/local/phylo.nf
+++ b/subworkflows/local/phylo.nf
@@ -15,6 +15,7 @@ include { PPANGGOLIN_MSA } from '../../modules/local/ppanggolin/msa/main'
 include { GML2GV } from '../../modules/local/graphviz/gml2gv/main'
 include { GET_SOFTWARE_VERSIONS } from '../../modules/local/get_software_versions'
 include { CONCAT_ALIGNMENT } from '../../modules/local/concat_alignment'
+include { FEATURE_DISPERSION } from '../../modules/local/featuredispersion/main'
 
 
 workflow PHYLOGENOMICS{
@@ -22,8 +23,10 @@ workflow PHYLOGENOMICS{
         gffs
         use_full_alignment
         use_fasttree
+        feature_profile
     main:
         ch_software_versions = Channel.empty()
+        ch_feature_profile = feature_profile.ifEmpty(false)
 
         gffs
             .map { meta, path -> [meta.id, path.getName()] }
@@ -117,6 +120,19 @@ workflow PHYLOGENOMICS{
             ch_software_versions = ch_software_versions.mix(IQTREE.out.versions.ifEmpty(null))
         }
 
+    if (ch_feature_profile != false) {
+        if (params.feature_dispersion_columns) {
+            FEATURE_DISPERSION(
+                core_tree,
+                feature_profile,
+                file(params.input_sample_table),
+                params.feature_dispersion_columns
+            )
+        } else {
+            FEATURE_DISPERSION(core_tree, feature_profile, [], [])
+        }
+    }
+
     emit:
         phylo_software = ch_software_versions
         all_alignments = ch_all_alignments
diff --git a/tests/subworkflows/local/phylo.nf.test b/tests/subworkflows/local/phylo.nf.test
index ef556f4..22108ef 100644
--- a/tests/subworkflows/local/phylo.nf.test
+++ b/tests/subworkflows/local/phylo.nf.test
@@ -23,6 +23,7 @@ nextflow_workflow {
                 input[1] = false
                 // Use fasttree
                 input[2] = true
+                input[3] = Channel.empty()
                 """
             }
         }
diff --git a/workflows/arete.nf b/workflows/arete.nf
index 71c4ed4..9caa94d 100755
--- a/workflows/arete.nf
+++ b/workflows/arete.nf
@@ -222,7 +222,12 @@ workflow ARETE {
 
     ////////////////////////// PANGENOME /////////////////////////////////////
     if (!params.skip_phylo) {
-        PHYLOGENOMICS(gffs, use_full_alignment, use_fasttree)
+        PHYLOGENOMICS(
+                gffs,
+                use_full_alignment,
+                use_fasttree,
+                ANNOTATE_ASSEMBLIES.out.feature_profile
+        )
         ch_software_versions = ch_software_versions.mix(PHYLOGENOMICS.out.phylo_software)
 
         if (params.run_evolccm) {
@@ -476,7 +481,12 @@ workflow ANNOTATION {
 
     ////////////////////////// PANGENOME /////////////////////////////////////
     if (!params.skip_phylo) {
-        PHYLOGENOMICS(gffs, use_full_alignment, use_fasttree)
+        PHYLOGENOMICS(
+                gffs,
+                use_full_alignment,
+                use_fasttree,
+                ANNOTATE_ASSEMBLIES.out.feature_profile
+        )
         ch_software_versions = ch_software_versions.mix(PHYLOGENOMICS.out.phylo_software)
 
         if (params.run_evolccm) {
@@ -603,7 +613,7 @@ workflow PHYLO {
     PHYLO_INPUT_CHECK.out.genomes.set { gffs }
 
     ////////////////////////// PANGENOME /////////////////////////////////////
-    PHYLOGENOMICS(gffs, use_full_alignment, use_fasttree)
+    PHYLOGENOMICS(gffs, use_full_alignment, use_fasttree, Channel.empty())
     ch_software_versions = ch_software_versions.mix(PHYLOGENOMICS.out.phylo_software)