Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into fix-samplesheets
Browse files Browse the repository at this point in the history
* origin/master:
  fix: Compress output file in EvolCCM (#192)
  fix: Run rSPR when tree has duplicated nodes (#191)
  refactor: Save cluster tree with cluster prob (#190)
  • Loading branch information
jvfe committed May 10, 2024
2 parents f1c93b6 + 7142981 commit 95c5489
Show file tree
Hide file tree
Showing 5 changed files with 78 additions and 22 deletions.
6 changes: 4 additions & 2 deletions bin/ParallelEvolCCM.R
Original file line number Diff line number Diff line change
Expand Up @@ -458,19 +458,21 @@ for (result in results) {
temp_files <- list.files(pattern = "output_temp_chunk_[0-9]+\\.txt")

outputHeadings <- paste("feature1","feature2","intrisic1", "intrisic2", "gainloss1", "gainloss2", "interaction", "interact_score", "interact_pval",sep="\t")
cat(outputHeadings, file = outputFile, sep = "\n")
outputFilegz = gzfile(outputFile, "w")
cat(outputHeadings, file = outputFilegz, sep = "\n")
for (temp_file in temp_files) {
# Read the contents of the temporary file
temp_contents <- readLines(temp_file)

# Write the contents to the main output file
cat(temp_contents, file = outputFile, append = TRUE, sep = "\n")
cat(temp_contents, file = outputFilegz, append = TRUE, sep = "\n")

# Remove the temporary file
file.remove(temp_file)
}

# Read the lines from the input file
close(outputFilegz)
lines <- readLines(outputFile)[-1]
X2file = paste(outputFile,"X2",sep = ".")
Pfile = paste(outputFile,"pvals",sep = ".")
Expand Down
53 changes: 40 additions & 13 deletions bin/rspr_approx.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,21 @@ def parse_args(args=None):
)
return parser.parse_args(args)

def check_formatted_tree(tree_string):
"""Check if formatted tree has duplicate nodes"""

pattern = r'([a-zA-Z]+\w{3,}):.*\1'
match = re.search(pattern, tree_string)

return bool(match)

def read_tree(input_path):
with open(input_path, "r") as f:
tree_string = f.read()
formatted = re.sub(r";[^:]+:", ":", tree_string)
return Tree(formatted)
is_duplicated = check_formatted_tree(formatted)

return Tree(formatted), is_duplicated


#####################################################################
Expand All @@ -102,12 +111,27 @@ def read_tree(input_path):
#####################################################################


def root_tree(input_path, output_path):
tre = read_tree(input_path)
def root_tree(input_path, basename, output_path):
tre,is_duplicated = read_tree(input_path)
midpoint = tre.get_midpoint_outgroup()
tre.set_outgroup(midpoint)
if is_duplicated:
outdir = Path(output_path) / "multiple"
Path(outdir).mkdir(exist_ok=True, parents=True)
output_path = outdir / basename
output_path = str(output_path).replace(".tre", ".tre.multiple")
else:
outdir = Path(output_path) / "unique"
Path(outdir).mkdir(exist_ok=True, parents=True)
output_path = outdir / basename

tre.write(outfile=output_path)
return tre.write(), len(tre.get_leaves()), output_path, is_duplicated

def root_reference_tree(input_path, output_path):
tre, _ = read_tree(input_path)
midpoint = tre.get_midpoint_outgroup()
tre.set_outgroup(midpoint)
if not os.path.exists(os.path.dirname(output_path)):
os.makedirs(os.path.dirname(output_path))
tre.write(outfile=output_path)
return tre.write(), len(tre.get_leaves())

Expand Down Expand Up @@ -135,20 +159,23 @@ def root_trees(core_tree, gene_trees_path, output_dir, results, merge_pair=False
rooted_reference_tree = os.path.join(
output_dir, "rooted_reference_tree/core_gene_alignment.tre"
)
refer_content, refer_tree_size = root_tree(reference_tree, rooted_reference_tree)
refer_content, refer_tree_size = root_reference_tree(reference_tree, rooted_reference_tree)

df_gene_trees = pd.read_csv(gene_trees_path)
rooted_gene_trees_path = os.path.join(output_dir, "rooted_gene_trees")
for filename in df_gene_trees["path"]:
basename = Path(filename).name
rooted_gene_tree_path = os.path.join(rooted_gene_trees_path, basename)
gene_content, gene_tree_size = root_tree(filename, rooted_gene_tree_path)
results.loc[basename, "tree_size"] = gene_tree_size
gene_content, gene_tree_size, gene_tree_path, is_duplicated = root_tree(
filename,
basename,
rooted_gene_trees_path)
if not is_duplicated:
results.loc[basename, "tree_size"] = gene_tree_size
if merge_pair:
with open(rooted_gene_tree_path, "w") as f2:
with open(gene_tree_path, "w") as f2:
f2.write(refer_content + "\n" + gene_content)
#'''
return rooted_gene_trees_path
return os.path.join(rooted_gene_trees_path, "unique")


#####################################################################
Expand Down Expand Up @@ -212,7 +239,7 @@ def approx_rspr(
"-length " + str(min_branch_len),
"-support " + str(max_support_threshold),
]

group_size = 10000
cur_count = 0
lst_filename = []
Expand Down Expand Up @@ -498,7 +525,7 @@ def main(args=None):
# Generate group heatmap
group_fig_path = os.path.join(args.OUTPUT_DIR, "group_output.png")
make_group_heatmap(
results,
results,
group_fig_path,
args.MIN_HEATMAP_RSPR_DISTANCE,
args.MAX_HEATMAP_RSPR_DISTANCE
Expand Down
6 changes: 3 additions & 3 deletions bin/rspr_exact.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def fpt_rspr(results_df, min_branch_len=0, max_support_threshold=0.7, gather_clu
"-support " + str(max_support_threshold),
]

trees_path = os.path.join("rooted_gene_trees")
trees_path = os.path.join("rooted_gene_trees/unique")

cluster_file = None
if gather_cluster_info:
Expand Down Expand Up @@ -123,13 +123,13 @@ def fpt_rspr(results_df, min_branch_len=0, max_support_threshold=0.7, gather_clu
continue
elif "Clusters end" in line:
clustering_start = False

if clustering_start:
updated_line = line.replace('(', '').replace(')', '').replace('\n', '')
cluster_nodes = updated_line.split(',')
cluster_nodes = [int(node) for node in cluster_nodes if "X" not in node]
clusters.append(cluster_nodes)

output_lines.append(line)
cluster_file.write(json.dumps(clusters) + '\n')
process.wait()
Expand Down
33 changes: 30 additions & 3 deletions bin/rspr_heatmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,34 @@ def generate_cluster_network(lst_tree_clusters, refer_tree):
update_cluster_probability(refer_tree.root, dict_clstr_map, total_trees, leaf_mapping)


#####################################################################
### FUNCTION REPLACE_CLUSTER_PROBABILITY
### Replace branch support with cluster probability
### node: current node
#####################################################################

def replace_cluster_probability(node):
if not node:
return
node.comment = node.cluster_probability
if not node.is_terminal():
for child in node.clades:
replace_cluster_probability(child)


#####################################################################
### FUNCTION SAVE_CLUSTER_TREE
### Save cluster network tree
### cluster_file_path: path to store cliuster tree
### refer_tree: reference tree
#####################################################################

def save_cluster_tree(cluster_file_path, refer_tree):
print("Saving cluster tree")
replace_cluster_probability(refer_tree.root)
Phylo.write(refer_tree, cluster_file_path, "newick")


#####################################################################
### FUNCTION GENERATE_CLUSTER_HEATMAP
### Generate cluster heatmap
Expand Down Expand Up @@ -301,8 +329,6 @@ def read_tree(input_path):
formatted = re.sub(r";[^:]+:", ":", tree_string)
return Phylo.read(io.StringIO(formatted), "newick")

def write_tree(output_path, data):
Phylo.write(data, output_path, "newick")

def get_fig_size(refer_tree):
max_fig_size = 100
Expand Down Expand Up @@ -350,7 +376,6 @@ def main(args=None):
refer_tree = read_tree(refer_tree_path)
if refer_tree:
generate_cluster_network(lst_tree_clusters, refer_tree)
write_tree(cluster_file_path, refer_tree)

plt.rcParams['font.size'] = '12'
fig_size = get_fig_size(refer_tree)
Expand All @@ -361,5 +386,7 @@ def main(args=None):
plt.title("Cluster network")
plt.savefig(cluster_tree_path, format="png")

save_cluster_tree(cluster_file_path, refer_tree)

if __name__ == "__main__":
sys.exit(main())
2 changes: 1 addition & 1 deletion modules/local/evolccm.nf
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ process EVOLCCM {

stub:
"""
touch EvolCCM_test.tsv
touch EvolCCM_test.tsv.gz
touch EvolCCM_test.tsv.pvals
touch EvolCCM_test.tsv.X2
touch EvolCCM_test.tre
Expand Down

0 comments on commit 95c5489

Please sign in to comment.