diff --git a/src/Afanc/autodatabase/runFuncs.py b/src/Afanc/autodatabase/runFuncs.py
index 9924cf9..1a77cc2 100644
--- a/src/Afanc/autodatabase/runFuncs.py
+++ b/src/Afanc/autodatabase/runFuncs.py
@@ -12,7 +12,7 @@ def runAutoDB(args):
     """ Run autodatabase pipeline
     """
     from ..utilities.makeWD import initAutoDBDirStructure
-    from ..utilities.getVersions import get_versions_autodatabase
+    from ..utilities.getVersions import getVersionsAutodatabase
 
     subprocessID = "MAIN"
     vprint(
@@ -30,7 +30,7 @@ def runAutoDB(args):
     fasta_db_path = args.fastaDir
 
     ## capture python package and software versions for the autodatabase module in a JSON file
-    get_versions_autodatabase(args)
+    getVersionsAutodatabase(args)
 
     ## download ncbi taxonomy and preprocess fastas
     fasta_dict, mapping_dict = preprocessing(args, fasta_db_path)
@@ -42,13 +42,13 @@ def runAutoDB(args):
     makeK2db(args)
 
     ## make the variant index from quality controlled assemblies
-    make_variant_index(args)
+    makeVariantIndex(args)
 
     ## make a Krona chart for pleasing visualisation
     makeKronaChart(args)
 
     ## clean the output directory
-    clean_outdir(args)
+    cleanOutdir(args)
 
     vprint(
         "FINISHED",
@@ -205,7 +205,7 @@ def makeK2db(args):
     chdir(args.autoDB_WDir)
 
 
-def make_variant_index(args):
+def makeVariantIndex(args):
    """ Generates a variant index of parent child distances
    """
    from .makeVariantIndex import make_variant_index
@@ -251,7 +251,7 @@ def makeKronaChart(args):
     chdir(args.autoDB_WDir)
 
 
-def clean_outdir(args):
+def cleanOutdir(args):
     """ Cleans the output directory according to provided arguments.
 
     clean : remove the mash working directory.
diff --git a/src/Afanc/screen/runFuncs.py b/src/Afanc/screen/runFuncs.py
index 754244e..8d2422a 100644
--- a/src/Afanc/screen/runFuncs.py
+++ b/src/Afanc/screen/runFuncs.py
@@ -24,7 +24,8 @@ def runScreen(args):
 
     ## if no_map is True then exit Afanc screen
     if args.no_map:
-        vprint("FINISHED", f"no_map mode finished. Metagenomic report can be found in {out_json}\n", "prGreen")
+        final_report = makeFinalReport(args, None, None)
+        vprint("FINISHED", f"no_map mode finished. Metagenomic report can be found in {final_report}\n", "prGreen")
         return 0
 
     ## parse kraken2 report to a json
@@ -44,7 +45,7 @@ def runScreen(args):
     final_report = makeFinalReport(args, variant_profile, reports)
 
     if args.clean or args.superclean:
-        final_report = clean_outdir(args, final_report)
+        final_report = cleanOutdir(args, final_report)
 
     vprint("FINISHED", f"Final report can be found at {final_report}\n", "prGreen")
 
@@ -193,7 +194,7 @@ def makeFinalReport(args, variant_profile, reports):
 
     import json
     from os import listdir
-    from Afanc.utilities.getVersions import get_versions_screen
+    from Afanc.utilities.getVersions import getVersionsScreen
 
     subprocessID = "REPORT"
     vprint(
@@ -228,7 +229,7 @@
 
     ## collect versions
     jsondict["versions"] = {
-        "screen" : get_versions_screen()
+        "screen" : getVersionsScreen()
     }
 
     ## collect k2 json reports
@@ -243,25 +244,45 @@
 
         ## initialise warnings
         event["warnings"] = []
 
-        ## block to deal with most likely variants
-        if "closest_variant" in event:
-            variant_flag = True
-            taxon_id = str(event["closest_variant"]["taxon_id"])
+        ## handle instances where an assembly cannot be found or no_map mode was used
+        if "assembly" in event:
 
-            ## in instances where this hit was subjected to variant profiling, the assembly used for mapping will
-            ## belong to the species rather than the closest variant
-            if "assembly" in event["closest_variant"]:
-                assembly = event["closest_variant"]["assembly"]
+            ## block to deal with most likely variants
+            if "closest_variant" in event:
+                variant_flag = True
+                taxon_id = str(event["closest_variant"]["taxon_id"])
+
+                ## in instances where this hit was subjected to variant profiling, the assembly used for mapping will
+                ## belong to the species rather than the closest variant
+                if "assembly" in event["closest_variant"]:
+                    assembly = event["closest_variant"]["assembly"]
+                else:
+                    assembly = event["assembly"]
+
+            ## no variants
             else:
+                variant_flag = False
                 assembly = event["assembly"]
+                taxon_id = str(event["taxon_id"])
+
+        ## block to deal with instances where there is no assembly
+        ## this is to ensure that a final json is constructed when no_map mode is used
         else:
-            variant_flag = False
-            assembly = event["assembly"]
-            taxon_id = str(event["taxon_id"])
+            ## block to deal with most likely variants
+            if "closest_variant" in event:
+                variant_flag = True
+                assembly = None
+                taxon_id = str(event["closest_variant"]["taxon_id"])
+
+            ## no variants
+            else:
+                variant_flag = False
+                assembly = None
+                taxon_id = str(event["taxon_id"])
 
         ## block dealing with hits which have an accompanying assembly which reads were mapped to
         if not assembly == None:
             assembly_prefix = path.basename(path.splitext(assembly)[0])
             if assembly_prefix.endswith("_genomic"):
                 assembly_prefix = assembly_prefix.strip("_genomic")
@@ -310,7 +331,7 @@
 
     return final_report
 
-def clean_outdir(args, final_report):
+def cleanOutdir(args, final_report):
     """ Cleans the output directory according to provided arguments.
 
     clean : remove the bt2 working directory.
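Reviewer note on the @@ -243 hunk in makeFinalReport above: the nested branching reduces to resolving a (variant_flag, assembly, taxon_id) triple from a detection event that may lack an "assembly" key. A minimal, self-contained sketch of that logic, for reference only; the helper name resolveEvent is hypothetical and not part of this patch:

    def resolveEvent(event):
        """ Resolve (variant_flag, assembly, taxon_id) from a detection event.
        assembly comes back as None when the hit carries no assembly, e.g. in
        no_map mode, so the downstream mapping-stats block is skipped.
        """
        variant = event.get("closest_variant")
        variant_flag = variant is not None

        ## taxon_id comes from the closest variant when one was profiled
        taxon_id = str(variant["taxon_id"]) if variant_flag else str(event["taxon_id"])

        if "assembly" in event:
            ## a variant-profiled hit may have been mapped against the species
            ## assembly rather than one belonging to the closest variant
            assembly = variant.get("assembly", event["assembly"]) if variant_flag else event["assembly"]
        else:
            assembly = None

        return variant_flag, assembly, taxon_id

    ## e.g. a no_map hit with no variant profiling:
    ## resolveEvent({"taxon_id": 573}) returns (False, None, "573")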
diff --git a/src/Afanc/utilities/getVersions.py b/src/Afanc/utilities/getVersions.py
index d7b4c8c..6efd605 100644
--- a/src/Afanc/utilities/getVersions.py
+++ b/src/Afanc/utilities/getVersions.py
@@ -5,7 +5,7 @@ from .runCommands import command
 
 
-def get_versions_autodatabase(args):
+def getVersionsAutodatabase(args):
     """ get python package and software versions for afanc-autodatabase
     """
 
@@ -34,7 +34,7 @@ def get_versions_autodatabase(args):
     json.dump({ "afanc-autodatabase_versions" : version_dict }, fout, indent = 4)
 
 
-def get_versions_screen():
+def getVersionsScreen():
     """ Gets python package and softeware versions for afanc-screen
     """
diff --git a/test/afanc_test.py b/test/afanc_test.py
index abe15e0..e640bd2 100644
--- a/test/afanc_test.py
+++ b/test/afanc_test.py
@@ -24,7 +24,10 @@ import argparse
 from argparse import RawTextHelpFormatter
 
 
-def run_test(args):
+
+def runTest(args):
+
+    nomap = True
 
     ## tsv for results
     tsv_handle = open("./Afanc_test.tsv", 'w')
@@ -61,8 +64,11 @@ def run_test(args):
             print(f"Error at {fq1} {fq2} {truth_val}. FASTQ files must end with either .fq.gz or .fastq.gz. Exiting.")
             sys.exit(1)
 
-        runline = f"afanc screen -o {prefix} -v {args.variants} -n 10 {args.db} {fq1} {fq2} -c > {prefix}.log"
-        # print(runline)
+        if nomap:
+            runline = f"afanc screen -o {prefix} -v {args.variants} -n 10 {args.db} {fq1} {fq2} -a > {prefix}.log"
+
+        else:
+            runline = f"afanc screen -o {prefix} -v {args.variants} -n 10 {args.db} {fq1} {fq2} -c > {prefix}.log"
 
         if not os.path.exists(prefix):
             subprocess.call(runline, shell=True)
@@ -70,7 +76,7 @@
         results_json = f"./{prefix}/{prefix}.json"
 
         if os.path.exists(results_json):
-            results = check_hits(results_json, truth_val)
+            results = checkHits(results_json, truth_val)
 
         else:
             result = "RUNFAIL"
@@ -94,24 +100,24 @@
     tsv_handle.close()
 
 
-def check_hits(results_json, truth_val):
+def checkHits(results_json, truth_val):
 
     with open(results_json, "r") as fin:
         jdata = json.load(fin)
 
     hits = jdata["results"]["Detection_events"]
-    cluster_results, cluster_hits = get_cluster_hits(hits["Clustering_results"], truth_val)
+    cluster_results, cluster_hits = getClusterHits(hits["Clustering_results"], truth_val)
 
     results = { "cluster" : [cluster_results, cluster_hits], "variant" : [None, [["None", "None"]]] }
 
     if "Variant_profile" in hits:
-        variant_results, variant_hits = get_variant_hits(hits["Variant_profile"], truth_val)
+        variant_results, variant_hits = getVariantHits(hits["Variant_profile"], truth_val)
         results["variant"] = [variant_results, variant_hits]
 
     return results
 
 
-def get_variant_hits(hits, truth_val):
+def getVariantHits(hits, truth_val):
 
     variant_box = []
 
@@ -130,7 +136,7 @@ def get_variant_hits(hits, truth_val):
     return result, variant_box
 
 
-def get_cluster_hits(hits, truth_val):
+def getClusterHits(hits, truth_val):
     """ checks hits against a known truth value
     """
@@ -184,6 +190,6 @@ def parse_args(argv):
     parser.add_argument("run_list", action='store', help="A tab separated txt file containing the paired end .fq paths and a truth value to compare the output to.")
     parser.add_argument("variants", action='store', help="A tab separated txt file containing variant info.")
 
-    run_test(parser.parse_args(argv))
+    runTest(parser.parse_args(argv))
 
 parse_args(sys.argv)
diff --git a/test/get_clusters.py b/test/get_clusters.py
new file mode 100644
index 0000000..e886fe1
--- /dev/null
+++ b/test/get_clusters.py
@@ -0,0 +1,161 @@
+import sys, os
+import subprocess
+import pandas as pd
+import matplotlib.pyplot as plt
+import matplotlib.cm as cm
+import numpy as np
+from os import path, walk
+from sklearn.decomposition import PCA
+import itertools
+
+from Afanc.utilities.runCommands import command
+
+
+def makePCA(distance_matrix, species_dict):
+    # Perform PCA
+    pca = PCA(n_components=2)
+    principal_components = pca.fit_transform(distance_matrix.values)
+
+    # Create a DataFrame for visualization
+    pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])
+    pca_df.index = distance_matrix.index  # Assuming index contains sequence IDs
+
+    # Map species to the DataFrame based on the dictionary
+    pca_df['Species'] = pca_df.index.map(species_dict)
+
+    # Create a classifier dictionary
+    unique_species = pca_df['Species'].unique()
+    # classifier_dict = dict(zip(unique_species, plt.cm.tab10(np.arange(len(unique_species)))))
+    markers = list(itertools.product(['o', 's', '^', 'D', 'v'], plt.cm.tab10.colors))
+    classifier_dict = dict(zip(unique_species, markers))
+
+    # Plot the PCA with colored points
+    fig, ax = plt.subplots()
+    for species, (shape, color) in classifier_dict.items():
+        subset_df = pca_df[pca_df['Species'] == species]
+
+        # # Calculate the mode for each cluster
+        # for cluster in subset_df.groupby('Species').groups.values():
+        #     # Filter the DataFrame using boolean indexing
+        #     cluster_data = subset_df.loc[cluster]
+
+        #     # Calculate the mode
+        #     cluster_mode = cluster_data[['PC1', 'PC2']].mode().iloc[0]
+
+        #     # Draw a ring around the mode with a radius of 0.1 times the range
+        #     radius = 0.1 * np.ptp(cluster_data[['PC1', 'PC2']].values)
+        #     circle = plt.Circle((cluster_mode['PC1'], cluster_mode['PC2']), radius, color='black', fill=False)
+        #     ax.add_patch(circle)
+
+        ax.scatter(subset_df['PC1'], subset_df['PC2'], marker=shape, color=color, label=species)
+
+    ax.set_xlabel('Principal Component 1')
+    ax.set_ylabel('Principal Component 2')
+    ax.grid(True)
+    ax.legend()
+
+    # Shrink current axis by 20%
+    box = ax.get_position()
+    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
+
+    # Put a legend to the right of the current axis
+    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
+
+    ## add a scree plot
+    # scree_ax = fig.add_axes([0.69, 0.67, 0.2, 0.2])
+    # explained_variance_ratio = pca.explained_variance_ratio_
+    # scree_ax.plot(range(1, len(explained_variance_ratio) + 1), np.cumsum(explained_variance_ratio), marker='o', linestyle='--')
+    # scree_ax.set_xlabel('N. PCs')
+    # scree_ax.set_ylabel('Cum. Variance')
+
+    # scree_ax.axvline(x=2, color='red', linestyle='--')  ## Draw a vertical line at x=2
+
+    # y_intersect = np.cumsum(explained_variance_ratio)[1]
+    # scree_ax.text(2, y_intersect, f'(2,{y_intersect:.2f})', color='red', verticalalignment='bottom')
+
+    # scree_ax.grid(True)
+
+    plt.show()
+
+
+def readDistOut(dist_file, id_dict):
+
+    # Read the file into a DataFrame
+    df = pd.read_csv(dist_file, sep='\t', header=None, names=["ref_path", "query_path", "mash_dist", "p", "matching_hashes"])
+
+    # Extract file names from paths
+    df['ref_ID'] = df['ref_path'].apply(lambda x: path.basename(x))
+    df['query_ID'] = df['query_path'].apply(lambda x: path.basename(x))
+
+    # Create a pivot table to construct the distance matrix
+    distance_matrix = df.pivot(index='ref_ID', columns='query_ID', values='mash_dist')
+
+    # Mirror values across the diagonal to fill any gaps, then zero the rest.
+    # mash dist ref.msh ref.msh emits every ordered pair, so the pivot is
+    # usually already complete; simply adding the transpose would double
+    # every off-diagonal distance.
+    distance_matrix = distance_matrix.combine_first(distance_matrix.T).fillna(0)
+
+    # Save the result to a CSV file
+    distance_matrix.to_csv("mash_out.dist")
+
+    return distance_matrix
+
+
+def flattenDirectory(directory_path):
+    flat_dir_dict = {}
+
+    for root, dirs, files in os.walk(directory_path):
+        # Extract the subdirectory name from the current path
+        subdirectory = path.relpath(root, directory_path)
+
+        # Add non-directory files to the result dictionary
+        files_in_subdirectory = [path.join(root, f) for f in files if os.path.isfile(os.path.join(root, f))]
+        flat_dir_dict[subdirectory] = files_in_subdirectory
+
+    return flat_dir_dict
+
+
+def mash(prefix, fastas):
+    """ run mash
+    """
+
+    mashdist_out = path.abspath(f"{prefix}_mashdist.txt")
+
+    mash_sketchline = f"mash sketch -o ref {' '.join(fastas)}"
+    mash_distline = f"mash dist ref.msh ref.msh > {mashdist_out}"
+    command(mash_sketchline, "MASH").run_comm(0, None, None)
+    command(mash_distline, "MASH").run_comm(0, None, None)
+
+    return mashdist_out
+
+
+def fastANI(prefix, fastas):
+    """ run fastANI
+    """
+
+    outfile = f"{prefix}_fastANI.txt"
+
+    with open("fastas.txt", 'w') as fout:
+        print('\n'.join(fastas), file=fout)
+
+    argline = f"fastANI --ql fastas.txt --rl fastas.txt -o {outfile} -t 10"
+    print(argline)
+    # subprocess.call(argline, shell=True)
+
+    return outfile
+
+
+if __name__=="__main__":
+    fasta_dir = "/home/amorris/BioInf/afanc_kleb_WD/klebsiella_3.0_getds/"
+    out_dir = "/home/amorris/BioInf/afanc_kleb_WD/"
+
+    taxon_dict = flattenDirectory(fasta_dir)
+    reversed_dict = {path.basename(value): key for key, values in taxon_dict.items() for value in values}
+
+    fasta_list = [fasta for fastas in taxon_dict.values() for fasta in fastas]
+    dist_out = mash("kleb", fasta_list)
+    # dist_out = fastANI("kleb", fasta_list)
+
+    distance_matrix = readDistOut(dist_out, reversed_dict)
+
+    makePCA(distance_matrix, reversed_dict)
\ No newline at end of file
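For get_clusters.py, a quick way to sanity-check the matrix construction in readDistOut without running mash; the two-genome file names and distances below are invented for illustration:

    import pandas as pd

    ## four rows in mash dist column order: ref, query, distance, p-value, shared hashes
    rows = [
        ("a.fna", "a.fna", 0.00, 0.0, "1000/1000"),
        ("a.fna", "b.fna", 0.10, 0.0, "900/1000"),
        ("b.fna", "a.fna", 0.10, 0.0, "900/1000"),
        ("b.fna", "b.fna", 0.00, 0.0, "1000/1000"),
    ]
    df = pd.DataFrame(rows, columns=["ref_ID", "query_ID", "mash_dist", "p", "matching_hashes"])

    ## the same pivot-and-mirror construction used in readDistOut
    matrix = df.pivot(index="ref_ID", columns="query_ID", values="mash_dist")
    matrix = matrix.combine_first(matrix.T).fillna(0)

    print(matrix)  ## symmetric with a zero diagonal; the a-b distance stays 0.10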