From 7fd1ddcb396b0083c4b02866bcd7161a1b65f362 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Cavalcante?= Date: Mon, 6 May 2024 09:59:57 -0300 Subject: [PATCH 1/3] refactor: Save cluster tree with cluster prob (#190) - Cluster probability instead of branch support Co-authored-by: KARTIK KAKADIYA <20909285+ktkakadiya@users.noreply.github.com> --- bin/rspr_heatmap.py | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/bin/rspr_heatmap.py b/bin/rspr_heatmap.py index c608fd9..0cffc92 100755 --- a/bin/rspr_heatmap.py +++ b/bin/rspr_heatmap.py @@ -250,6 +250,34 @@ def generate_cluster_network(lst_tree_clusters, refer_tree): update_cluster_probability(refer_tree.root, dict_clstr_map, total_trees, leaf_mapping) +##################################################################### +### FUNCTION REPLACE_CLUSTER_PROBABILITY +### Replace branch support with cluster probability +### node: current node +##################################################################### + +def replace_cluster_probability(node): + if not node: + return + node.comment = node.cluster_probability + if not node.is_terminal(): + for child in node.clades: + replace_cluster_probability(child) + + +##################################################################### +### FUNCTION SAVE_CLUSTER_TREE +### Save cluster network tree +### cluster_file_path: path to store cliuster tree +### refer_tree: reference tree +##################################################################### + +def save_cluster_tree(cluster_file_path, refer_tree): + print("Saving cluster tree") + replace_cluster_probability(refer_tree.root) + Phylo.write(refer_tree, cluster_file_path, "newick") + + ##################################################################### ### FUNCTION GENERATE_CLUSTER_HEATMAP ### Generate cluster heatmap @@ -301,8 +329,6 @@ def read_tree(input_path): formatted = re.sub(r";[^:]+:", ":", tree_string) return Phylo.read(io.StringIO(formatted), "newick") -def write_tree(output_path, data): - Phylo.write(data, output_path, "newick") def get_fig_size(refer_tree): max_fig_size = 100 @@ -350,7 +376,6 @@ def main(args=None): refer_tree = read_tree(refer_tree_path) if refer_tree: generate_cluster_network(lst_tree_clusters, refer_tree) - write_tree(cluster_file_path, refer_tree) plt.rcParams['font.size'] = '12' fig_size = get_fig_size(refer_tree) @@ -361,5 +386,7 @@ def main(args=None): plt.title("Cluster network") plt.savefig(cluster_tree_path, format="png") + save_cluster_tree(cluster_file_path, refer_tree) + if __name__ == "__main__": sys.exit(main()) From cb6138f9538fa75e56e5dcd654ce31584107a03b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Cavalcante?= Date: Tue, 7 May 2024 08:59:42 -0300 Subject: [PATCH 2/3] fix: Run rSPR when tree has duplicated nodes (#191) * refactor: Stop analysing trees with duplicate nodes Signed-off-by: jvfe * fix: Update rooted_gene_trees in rspr_exact Signed-off-by: jvfe --------- Signed-off-by: jvfe --- bin/rspr_approx.py | 53 ++++++++++++++++++++++++++++++++++------------ bin/rspr_exact.py | 6 +++--- 2 files changed, 43 insertions(+), 16 deletions(-) diff --git a/bin/rspr_approx.py b/bin/rspr_approx.py index 5de9ad8..05abace 100755 --- a/bin/rspr_approx.py +++ b/bin/rspr_approx.py @@ -85,12 +85,21 @@ def parse_args(args=None): ) return parser.parse_args(args) +def check_formatted_tree(tree_string): + """Check if formatted tree has duplicate nodes""" + + pattern = r'([a-zA-Z]+\w{3,}):.*\1' + match = re.search(pattern, tree_string) + + return bool(match) def read_tree(input_path): with open(input_path, "r") as f: tree_string = f.read() formatted = re.sub(r";[^:]+:", ":", tree_string) - return Tree(formatted) + is_duplicated = check_formatted_tree(formatted) + + return Tree(formatted), is_duplicated ##################################################################### @@ -102,12 +111,27 @@ def read_tree(input_path): ##################################################################### -def root_tree(input_path, output_path): - tre = read_tree(input_path) +def root_tree(input_path, basename, output_path): + tre,is_duplicated = read_tree(input_path) + midpoint = tre.get_midpoint_outgroup() + tre.set_outgroup(midpoint) + if is_duplicated: + outdir = Path(output_path) / "multiple" + Path(outdir).mkdir(exist_ok=True, parents=True) + output_path = outdir / basename + output_path = str(output_path).replace(".tre", ".tre.multiple") + else: + outdir = Path(output_path) / "unique" + Path(outdir).mkdir(exist_ok=True, parents=True) + output_path = outdir / basename + + tre.write(outfile=output_path) + return tre.write(), len(tre.get_leaves()), output_path, is_duplicated + +def root_reference_tree(input_path, output_path): + tre, _ = read_tree(input_path) midpoint = tre.get_midpoint_outgroup() tre.set_outgroup(midpoint) - if not os.path.exists(os.path.dirname(output_path)): - os.makedirs(os.path.dirname(output_path)) tre.write(outfile=output_path) return tre.write(), len(tre.get_leaves()) @@ -135,20 +159,23 @@ def root_trees(core_tree, gene_trees_path, output_dir, results, merge_pair=False rooted_reference_tree = os.path.join( output_dir, "rooted_reference_tree/core_gene_alignment.tre" ) - refer_content, refer_tree_size = root_tree(reference_tree, rooted_reference_tree) + refer_content, refer_tree_size = root_reference_tree(reference_tree, rooted_reference_tree) df_gene_trees = pd.read_csv(gene_trees_path) rooted_gene_trees_path = os.path.join(output_dir, "rooted_gene_trees") for filename in df_gene_trees["path"]: basename = Path(filename).name - rooted_gene_tree_path = os.path.join(rooted_gene_trees_path, basename) - gene_content, gene_tree_size = root_tree(filename, rooted_gene_tree_path) - results.loc[basename, "tree_size"] = gene_tree_size + gene_content, gene_tree_size, gene_tree_path, is_duplicated = root_tree( + filename, + basename, + rooted_gene_trees_path) + if not is_duplicated: + results.loc[basename, "tree_size"] = gene_tree_size if merge_pair: - with open(rooted_gene_tree_path, "w") as f2: + with open(gene_tree_path, "w") as f2: f2.write(refer_content + "\n" + gene_content) #''' - return rooted_gene_trees_path + return os.path.join(rooted_gene_trees_path, "unique") ##################################################################### @@ -212,7 +239,7 @@ def approx_rspr( "-length " + str(min_branch_len), "-support " + str(max_support_threshold), ] - + group_size = 10000 cur_count = 0 lst_filename = [] @@ -498,7 +525,7 @@ def main(args=None): # Generate group heatmap group_fig_path = os.path.join(args.OUTPUT_DIR, "group_output.png") make_group_heatmap( - results, + results, group_fig_path, args.MIN_HEATMAP_RSPR_DISTANCE, args.MAX_HEATMAP_RSPR_DISTANCE diff --git a/bin/rspr_exact.py b/bin/rspr_exact.py index 389da1e..00ca048 100755 --- a/bin/rspr_exact.py +++ b/bin/rspr_exact.py @@ -88,7 +88,7 @@ def fpt_rspr(results_df, min_branch_len=0, max_support_threshold=0.7, gather_clu "-support " + str(max_support_threshold), ] - trees_path = os.path.join("rooted_gene_trees") + trees_path = os.path.join("rooted_gene_trees/unique") cluster_file = None if gather_cluster_info: @@ -123,13 +123,13 @@ def fpt_rspr(results_df, min_branch_len=0, max_support_threshold=0.7, gather_clu continue elif "Clusters end" in line: clustering_start = False - + if clustering_start: updated_line = line.replace('(', '').replace(')', '').replace('\n', '') cluster_nodes = updated_line.split(',') cluster_nodes = [int(node) for node in cluster_nodes if "X" not in node] clusters.append(cluster_nodes) - + output_lines.append(line) cluster_file.write(json.dumps(clusters) + '\n') process.wait() From 7142981dc1f1b41cbdacc28d87a65813c167dae6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Cavalcante?= Date: Thu, 9 May 2024 17:14:05 -0300 Subject: [PATCH 3/3] fix: Compress output file in EvolCCM (#192) * fix: Compress output file from evolccm Signed-off-by: jvfe * refactor: Compress feature_profile, not pvals Signed-off-by: jvfe * Revert "fix: Compress output file from evolccm" This reverts commit 67583e724aaae19968156c4ffd61da1ff965fbdd. * refactor: Close connection before reading lines Signed-off-by: jvfe * fix: Add .gz to evolccm stub Signed-off-by: jvfe --------- Signed-off-by: jvfe --- bin/ParallelEvolCCM.R | 6 ++++-- modules/local/evolccm.nf | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/bin/ParallelEvolCCM.R b/bin/ParallelEvolCCM.R index 61266bf..a3bf3d4 100755 --- a/bin/ParallelEvolCCM.R +++ b/bin/ParallelEvolCCM.R @@ -458,19 +458,21 @@ for (result in results) { temp_files <- list.files(pattern = "output_temp_chunk_[0-9]+\\.txt") outputHeadings <- paste("feature1","feature2","intrisic1", "intrisic2", "gainloss1", "gainloss2", "interaction", "interact_score", "interact_pval",sep="\t") -cat(outputHeadings, file = outputFile, sep = "\n") +outputFilegz = gzfile(outputFile, "w") +cat(outputHeadings, file = outputFilegz, sep = "\n") for (temp_file in temp_files) { # Read the contents of the temporary file temp_contents <- readLines(temp_file) # Write the contents to the main output file - cat(temp_contents, file = outputFile, append = TRUE, sep = "\n") + cat(temp_contents, file = outputFilegz, append = TRUE, sep = "\n") # Remove the temporary file file.remove(temp_file) } # Read the lines from the input file +close(outputFilegz) lines <- readLines(outputFile)[-1] X2file = paste(outputFile,"X2",sep = ".") Pfile = paste(outputFile,"pvals",sep = ".") diff --git a/modules/local/evolccm.nf b/modules/local/evolccm.nf index e95b332..a0c3c27 100644 --- a/modules/local/evolccm.nf +++ b/modules/local/evolccm.nf @@ -34,7 +34,7 @@ process EVOLCCM { stub: """ - touch EvolCCM_test.tsv + touch EvolCCM_test.tsv.gz touch EvolCCM_test.tsv.pvals touch EvolCCM_test.tsv.X2 touch EvolCCM_test.tre