From 7fd1ddcb396b0083c4b02866bcd7161a1b65f362 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Cavalcante?= <jvfecav@gmail.com>
Date: Mon, 6 May 2024 09:59:57 -0300
Subject: [PATCH 1/3] refactor: Save cluster tree with cluster prob (#190)

- Cluster probability instead of branch support

Co-authored-by: KARTIK KAKADIYA <20909285+ktkakadiya@users.noreply.github.com>
---
 bin/rspr_heatmap.py | 33 ++++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/bin/rspr_heatmap.py b/bin/rspr_heatmap.py
index c608fd9..0cffc92 100755
--- a/bin/rspr_heatmap.py
+++ b/bin/rspr_heatmap.py
@@ -250,6 +250,34 @@ def generate_cluster_network(lst_tree_clusters, refer_tree):
     update_cluster_probability(refer_tree.root, dict_clstr_map, total_trees, leaf_mapping)
 
 
+#####################################################################
+### FUNCTION REPLACE_CLUSTER_PROBABILITY
+### Replace branch support with cluster probability
+### node: current node
+#####################################################################
+
+def replace_cluster_probability(node):
+    if not node:
+        return
+    node.comment = node.cluster_probability
+    if not node.is_terminal():
+        for child in node.clades:
+            replace_cluster_probability(child)
+
+
+#####################################################################
+### FUNCTION SAVE_CLUSTER_TREE
+### Save cluster network tree
+### cluster_file_path: path to store cliuster tree
+### refer_tree: reference tree
+#####################################################################
+
+def save_cluster_tree(cluster_file_path, refer_tree):
+    print("Saving cluster tree")
+    replace_cluster_probability(refer_tree.root)
+    Phylo.write(refer_tree, cluster_file_path, "newick")
+
+
 #####################################################################
 ### FUNCTION GENERATE_CLUSTER_HEATMAP
 ### Generate cluster heatmap
@@ -301,8 +329,6 @@ def read_tree(input_path):
         formatted = re.sub(r";[^:]+:", ":", tree_string)
         return Phylo.read(io.StringIO(formatted), "newick")
 
-def write_tree(output_path, data):
-    Phylo.write(data, output_path, "newick")
 
 def get_fig_size(refer_tree):
     max_fig_size = 100
@@ -350,7 +376,6 @@ def main(args=None):
     refer_tree = read_tree(refer_tree_path)
     if refer_tree:
         generate_cluster_network(lst_tree_clusters, refer_tree)
-        write_tree(cluster_file_path, refer_tree)
 
         plt.rcParams['font.size'] = '12'
         fig_size = get_fig_size(refer_tree)
@@ -361,5 +386,7 @@ def main(args=None):
         plt.title("Cluster network")
         plt.savefig(cluster_tree_path, format="png")
 
+        save_cluster_tree(cluster_file_path, refer_tree)
+
 if __name__ == "__main__":
     sys.exit(main())

From cb6138f9538fa75e56e5dcd654ce31584107a03b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Cavalcante?= <jvfecav@gmail.com>
Date: Tue, 7 May 2024 08:59:42 -0300
Subject: [PATCH 2/3] fix: Run rSPR when tree has duplicated nodes (#191)

* refactor: Stop analysing trees with duplicate nodes

Signed-off-by: jvfe <jvfecav@gmail.com>

* fix: Update rooted_gene_trees in rspr_exact

Signed-off-by: jvfe <jvfecav@gmail.com>

---------

Signed-off-by: jvfe <jvfecav@gmail.com>
---
 bin/rspr_approx.py | 53 ++++++++++++++++++++++++++++++++++------------
 bin/rspr_exact.py  |  6 +++---
 2 files changed, 43 insertions(+), 16 deletions(-)

diff --git a/bin/rspr_approx.py b/bin/rspr_approx.py
index 5de9ad8..05abace 100755
--- a/bin/rspr_approx.py
+++ b/bin/rspr_approx.py
@@ -85,12 +85,21 @@ def parse_args(args=None):
     )
     return parser.parse_args(args)
 
+def check_formatted_tree(tree_string):
+    """Check if formatted tree has duplicate nodes"""
+
+    pattern = r'([a-zA-Z]+\w{3,}):.*\1'
+    match = re.search(pattern, tree_string)
+
+    return bool(match)
 
 def read_tree(input_path):
     with open(input_path, "r") as f:
         tree_string = f.read()
         formatted = re.sub(r";[^:]+:", ":", tree_string)
-        return Tree(formatted)
+        is_duplicated = check_formatted_tree(formatted)
+
+        return Tree(formatted), is_duplicated
 
 
 #####################################################################
@@ -102,12 +111,27 @@ def read_tree(input_path):
 #####################################################################
 
 
-def root_tree(input_path, output_path):
-    tre = read_tree(input_path)
+def root_tree(input_path, basename, output_path):
+    tre,is_duplicated = read_tree(input_path)
+    midpoint = tre.get_midpoint_outgroup()
+    tre.set_outgroup(midpoint)
+    if is_duplicated:
+        outdir = Path(output_path) / "multiple"
+        Path(outdir).mkdir(exist_ok=True, parents=True)
+        output_path = outdir / basename
+        output_path = str(output_path).replace(".tre", ".tre.multiple")
+    else:
+        outdir = Path(output_path) / "unique"
+        Path(outdir).mkdir(exist_ok=True, parents=True)
+        output_path = outdir / basename
+
+    tre.write(outfile=output_path)
+    return tre.write(), len(tre.get_leaves()), output_path, is_duplicated
+
+def root_reference_tree(input_path, output_path):
+    tre, _ = read_tree(input_path)
     midpoint = tre.get_midpoint_outgroup()
     tre.set_outgroup(midpoint)
-    if not os.path.exists(os.path.dirname(output_path)):
-        os.makedirs(os.path.dirname(output_path))
     tre.write(outfile=output_path)
     return tre.write(), len(tre.get_leaves())
 
@@ -135,20 +159,23 @@ def root_trees(core_tree, gene_trees_path, output_dir, results, merge_pair=False
     rooted_reference_tree = os.path.join(
         output_dir, "rooted_reference_tree/core_gene_alignment.tre"
     )
-    refer_content, refer_tree_size = root_tree(reference_tree, rooted_reference_tree)
+    refer_content, refer_tree_size = root_reference_tree(reference_tree, rooted_reference_tree)
 
     df_gene_trees = pd.read_csv(gene_trees_path)
     rooted_gene_trees_path = os.path.join(output_dir, "rooted_gene_trees")
     for filename in df_gene_trees["path"]:
         basename = Path(filename).name
-        rooted_gene_tree_path = os.path.join(rooted_gene_trees_path, basename)
-        gene_content, gene_tree_size = root_tree(filename, rooted_gene_tree_path)
-        results.loc[basename, "tree_size"] = gene_tree_size
+        gene_content, gene_tree_size, gene_tree_path, is_duplicated = root_tree(
+                                                                filename,
+                                                                basename,
+                                                                rooted_gene_trees_path)
+        if not is_duplicated:
+            results.loc[basename, "tree_size"] = gene_tree_size
         if merge_pair:
-            with open(rooted_gene_tree_path, "w") as f2:
+            with open(gene_tree_path, "w") as f2:
                 f2.write(refer_content + "\n" + gene_content)
     #'''
-    return rooted_gene_trees_path
+    return os.path.join(rooted_gene_trees_path, "unique")
 
 
 #####################################################################
@@ -212,7 +239,7 @@ def approx_rspr(
         "-length " + str(min_branch_len),
         "-support " + str(max_support_threshold),
     ]
-    
+
     group_size = 10000
     cur_count = 0
     lst_filename = []
@@ -498,7 +525,7 @@ def main(args=None):
     # Generate group heatmap
     group_fig_path = os.path.join(args.OUTPUT_DIR, "group_output.png")
     make_group_heatmap(
-        results, 
+        results,
         group_fig_path,
         args.MIN_HEATMAP_RSPR_DISTANCE,
         args.MAX_HEATMAP_RSPR_DISTANCE
diff --git a/bin/rspr_exact.py b/bin/rspr_exact.py
index 389da1e..00ca048 100755
--- a/bin/rspr_exact.py
+++ b/bin/rspr_exact.py
@@ -88,7 +88,7 @@ def fpt_rspr(results_df, min_branch_len=0, max_support_threshold=0.7, gather_clu
         "-support " + str(max_support_threshold),
     ]
 
-    trees_path = os.path.join("rooted_gene_trees")
+    trees_path = os.path.join("rooted_gene_trees/unique")
 
     cluster_file = None
     if gather_cluster_info:
@@ -123,13 +123,13 @@ def fpt_rspr(results_df, min_branch_len=0, max_support_threshold=0.7, gather_clu
                         continue
                     elif "Clusters end" in line:
                         clustering_start = False
-                    
+
                     if clustering_start:
                         updated_line = line.replace('(', '').replace(')', '').replace('\n', '')
                         cluster_nodes = updated_line.split(',')
                         cluster_nodes = [int(node) for node in cluster_nodes if "X" not in node]
                         clusters.append(cluster_nodes)
-                    
+
                     output_lines.append(line)
                 cluster_file.write(json.dumps(clusters) + '\n')
                 process.wait()

From 7142981dc1f1b41cbdacc28d87a65813c167dae6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Cavalcante?= <jvfecav@gmail.com>
Date: Thu, 9 May 2024 17:14:05 -0300
Subject: [PATCH 3/3] fix: Compress output file in EvolCCM (#192)

* fix: Compress output file from evolccm

Signed-off-by: jvfe <jvfecav@gmail.com>

* refactor: Compress feature_profile, not pvals

Signed-off-by: jvfe <jvfecav@gmail.com>

* Revert "fix: Compress output file from evolccm"

This reverts commit 67583e724aaae19968156c4ffd61da1ff965fbdd.

* refactor: Close connection before reading lines

Signed-off-by: jvfe <jvfecav@gmail.com>

* fix: Add .gz to evolccm stub

Signed-off-by: jvfe <jvfecav@gmail.com>

---------

Signed-off-by: jvfe <jvfecav@gmail.com>
---
 bin/ParallelEvolCCM.R    | 6 ++++--
 modules/local/evolccm.nf | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/bin/ParallelEvolCCM.R b/bin/ParallelEvolCCM.R
index 61266bf..a3bf3d4 100755
--- a/bin/ParallelEvolCCM.R
+++ b/bin/ParallelEvolCCM.R
@@ -458,19 +458,21 @@ for (result in results) {
 temp_files <- list.files(pattern = "output_temp_chunk_[0-9]+\\.txt")
 
 outputHeadings <- paste("feature1","feature2","intrisic1", "intrisic2", "gainloss1", "gainloss2", "interaction", "interact_score", "interact_pval",sep="\t")
-cat(outputHeadings, file = outputFile, sep = "\n")
+outputFilegz = gzfile(outputFile, "w")
+cat(outputHeadings, file = outputFilegz, sep = "\n")
 for (temp_file in temp_files) {
   # Read the contents of the temporary file
   temp_contents <- readLines(temp_file)
 
   # Write the contents to the main output file
-  cat(temp_contents, file = outputFile, append = TRUE, sep = "\n")
+  cat(temp_contents, file = outputFilegz, append = TRUE, sep = "\n")
 
   # Remove the temporary file
   file.remove(temp_file)
 }
 
 # Read the lines from the input file
+close(outputFilegz)
 lines <- readLines(outputFile)[-1]
 X2file = paste(outputFile,"X2",sep = ".")
 Pfile = paste(outputFile,"pvals",sep = ".")
diff --git a/modules/local/evolccm.nf b/modules/local/evolccm.nf
index e95b332..a0c3c27 100644
--- a/modules/local/evolccm.nf
+++ b/modules/local/evolccm.nf
@@ -34,7 +34,7 @@ process EVOLCCM {
 
     stub:
     """
-    touch EvolCCM_test.tsv
+    touch EvolCCM_test.tsv.gz
     touch EvolCCM_test.tsv.pvals
     touch EvolCCM_test.tsv.X2
     touch EvolCCM_test.tre