diff --git a/bin/D2_cluster_EXs.R b/bin/D2_cluster_EXs.R index ad37785..476e06d 100755 --- a/bin/D2_cluster_EXs.R +++ b/bin/D2_cluster_EXs.R @@ -1,5 +1,4 @@ #!/usr/bin/env Rscript -#.libPaths(c("/software/mi/Rlib3.6/", .libPaths())) args<-commandArgs(TRUE) if (length(args)<3) {stop("[USAGE] Rscript --vanilla cluster.R ")} @@ -41,7 +40,7 @@ for (my_gene_OG in all_gene_OG) { my_final_clusters$names = rownames(my_final_clusters) single_exon_clusters_tosave = rbind(single_exon_clusters_tosave, my_final_clusters[rownames(my_final_clusters) == exon_id,]) my_final_clusters = my_final_clusters[!(rownames(my_final_clusters) == exon_id),] #remove the single-exon cluster from the final graph. - rownames(my_final_clusters) = my_final_clusters$names; my_final_clusters$names = NULL; + rownames(my_final_clusters) = my_final_clusters$names; my_final_clusters$names = NULL; } } #save unclustered exons to file @@ -69,7 +68,7 @@ for (my_gene_OG in all_gene_OG) { #build species count matrix all_exons_in_species = all_exons_in_cluster[sub("\\|", ";", all_exons_in_cluster) == my_species] #select only the exons_in_cluster belonging to species gene_species_vector = unique(sub("\\|.*\\|", ";", sub(".*;", "", sub("\\|", ";", all_exons_in_species)))) #"ENSG00000103266;Hs2" #select all the genes containing the exons_in_cluster belonging to species - cluster_gene_species_count_df = as.data.frame(table(sub(".*;", "", gene_species_vector))); colnames(cluster_gene_species_count_df) = c("Species", "Freq") + cluster_gene_species_count_df = as.data.frame(table(sub(".*;", "", gene_species_vector))); colnames(cluster_gene_species_count_df) = c("Species", "Freq") cluster_gene_species_count_df$ExonID = rep(my_exon, nrow(cluster_gene_species_count_df)) gene_species_count_matrix = rbind(gene_species_count_matrix, cluster_gene_species_count_df[,c("ExonID", "Freq")]) } @@ -96,7 +95,7 @@ for (my_gene_OG in all_gene_OG) { #Compute the number of exons for each species in each community all_ids = paste0(gsub(".*\\|", "", rownames(my_final_clusters)), "_", my_final_clusters$ClusterID) - all_species_counts_df = as.data.frame(table(gsub(".*\\|", "", all_ids))); colnames(all_species_counts_df) = c("Species_ClusterID", "Freq"); + all_species_counts_df = as.data.frame(table(gsub(".*\\|", "", all_ids))); colnames(all_species_counts_df) = c("Species_ClusterID", "Freq"); all_species_counts_df$Species_ClusterID = as.vector(all_species_counts_df$Species_ClusterID) #transform factor to vector for hashmap to work #Create hashes for translation @@ -117,7 +116,7 @@ for (my_gene_OG in all_gene_OG) { my_final_clusters$SPECIES_genes_in_cluster = gene_species_count_dict$find(my_final_clusters$ExonID) ######## Compute membership score ############# - my_final_clusters$membership_score = (my_final_clusters$In_degree + my_final_clusters$Out_degree + my_final_clusters$N_reciprocals)/(2*(my_final_clusters$TOT_exons_in_cluster - my_final_clusters$SPECIES_exons_in_cluster) + + my_final_clusters$membership_score = (my_final_clusters$In_degree + my_final_clusters$Out_degree + my_final_clusters$N_reciprocals)/(2*(my_final_clusters$TOT_exons_in_cluster - my_final_clusters$SPECIES_exons_in_cluster) + (TOT_genes_in_cluster - my_final_clusters$SPECIES_genes_in_cluster)) my_final_clusters$membership_score = round(my_final_clusters$membership_score, 2) diff --git a/docker/Rscript/Dockerfile b/docker/Rscript/Dockerfile new file mode 100644 index 0000000..15d6390 --- /dev/null +++ b/docker/Rscript/Dockerfile @@ -0,0 +1,7 @@ +FROM rocker/r-ver:3.6.3 + +RUN apt-get -y update && apt-get install -y libnlopt-dev \ + && rm -rf /var/lib/apt/lists/* + +COPY deps.R /tmp +RUN Rscript /tmp/deps.R > /tmp/deps.log diff --git a/docker/Rscript/deps.R b/docker/Rscript/deps.R new file mode 100644 index 0000000..eee3ac2 --- /dev/null +++ b/docker/Rscript/deps.R @@ -0,0 +1,8 @@ +install.packages('devtools', dependencies=TRUE, repos='cran.rstudio.com/'); +install.packages(c('igraph'), repos='http://cran.us.r-project.org'); +install.packages(c('ggplot2'), repos='http://cran.us.r-project.org'); +install.packages(c('nloptr'), repos='http://cran.us.r-project.org'); +install.packages(c('pbkrtest', 'lme4', 'pbkrtest', 'lme4'), repos='http://cran.us.r-project.org'); +install.packages(c('ggpubr', 'cowplot'), repos='http://cran.us.r-project.org'); +install.packages(c('hashmap'), repos='http://cran.us.r-project.org'); + diff --git a/main.nf b/main.nf index fbd001e..d963235 100644 --- a/main.nf +++ b/main.nf @@ -96,7 +96,10 @@ if ( !blosumfile.exists() ) exit 1, "Missing blosum file: ${blosumfile}!" LOCAL_MODULES='./modules/local/exorthist' include { CHECK_INPUT } from "${LOCAL_MODULES}/check_input.nf" +include { CLUSTER_EXS } from "${LOCAL_MODULES}/cluster_exons.nf" +include { COLLAPSE_OVERLAPPING_MATCHES } from "${LOCAL_MODULES}/collapse_matches.nf" include { FILTER_AND_SELECT_BEST_EX_MATCHES_BY_TARGETGENE } from "${LOCAL_MODULES}/filter_matches.nf" +include { FORMAT_EX_CLUSTERS_INPUT } from "${LOCAL_MODULES}/format_clusters.nf" include { GENERATE_ANNOTATIONS } from "${LOCAL_MODULES}/generate_annotations.nf" include { JOIN_FILTERED_EX_MATCHES } from "${LOCAL_MODULES}/join_matches.nf" include { MERGE_PROT_EX_INT_ALN_INFO } from "${LOCAL_MODULES}/merge_aligns.nf" @@ -230,9 +233,24 @@ workflow { // Filter the best matches above score cutoffs by target gene. all_scores_to_filt_ch = SCORE_EX_MATCHES.out.all_scores_to_filt FILTER_AND_SELECT_BEST_EX_MATCHES_BY_TARGETGENE(all_scores_to_filt_ch.join(dist_ranges_ch)) - // join filtered scored EX matches + // Join filtered scored EX matches filterscore_per_joining_ch = FILTER_AND_SELECT_BEST_EX_MATCHES_BY_TARGETGENE.out.filterscore_per_joining JOIN_FILTERED_EX_MATCHES(filterscore_per_joining_ch) + // Removing matches from overlapping exons + + filtered_all_scores = JOIN_FILTERED_EX_MATCHES.out.filtered_all_scores + // Sic: https://nextflow-io.github.io/patterns/optional-input/ + if ( params.bonafide_pairs ) { + COLLAPSE_OVERLAPPING_MATCHES(filtered_all_scores, params.bonafide_pairs) + } else { + COLLAPSE_OVERLAPPING_MATCHES(filtered_all_scores, file("/path/to/NO_FILE")) + } + score_exon_hits_pairs = COLLAPSE_OVERLAPPING_MATCHES.out.score_exon_hits_pairs + FORMAT_EX_CLUSTERS_INPUT(score_exon_hits_pairs, file(params.cluster)) + // Split the file of exon pairs + // Unclustered are the exons ending up in single-exon clusters + cluster_parts = FORMAT_EX_CLUSTERS_INPUT.out.cluster_parts + CLUSTER_EXS(cluster_parts) // Review outputs below CHECK_INPUT.out.run_info.view() @@ -254,77 +272,13 @@ workflow { FILTER_AND_SELECT_BEST_EX_MATCHES_BY_TARGETGENE.out.filterscore_per_joining.view() FILTER_AND_SELECT_BEST_EX_MATCHES_BY_TARGETGENE.out.best_scored_matches.view() JOIN_FILTERED_EX_MATCHES.out.filtered_all_scores.view() + COLLAPSE_OVERLAPPING_MATCHES.out.score_exon_hits_pairs.view() + COLLAPSE_OVERLAPPING_MATCHES.out.overlapping_exs.view() + FORMAT_EX_CLUSTERS_INPUT.out.cluster_parts.view() + CLUSTER_EXS.out.unclustered_exs.view() + CLUSTER_EXS.out.ex_clusters.view() } -// -// /* -// * Removing matches from overlapping exons -// */ -// process collapse_overlapping_matches { -// publishDir "${params.output}/", mode: "copy" -// input: -// file(scores) from filtered_all_scores -// -// output: -// file("filtered_best_scored_EX_matches_by_targetgene-NoOverlap.tab") into (score_exon_hits_pairs, exon_pairs_for_reclustering) -// file("overlapping_EXs_by_species.tab") -// -// script: -// bonafide = "" -// if (params.bonafide_pairs != "") { -// bonafide = file(params.bonafide_pairs) -// if ( !bonafide.exists() ) exit 1, "Missing file with bonafide pairs: ${bonafide}!" -// -// } -// """ -// C3_count_matches_by_EX.pl ${scores} EX_matches_count_by_species.tab ${bonafide} -// C4_get_overlapping_EXs.pl -i EX_matches_count_by_species.tab -o overlapping_EXs_by_species.tab -// C5_collapse_overlapping_matches.pl overlapping_EXs_by_species.tab ${scores} filtered_best_scored_EX_matches_by_targetgene-NoOverlap.tab ${bonafide} -// """ -// } -// -// /* -// * Split the file of exon pairs -// */ -// clusterfile = file(params.cluster) -// process format_EX_clusters_input { -// input: -// file(score_exon_hits_pairs) -// file(clusterfile) -// -// output: -// file("PART_*-cluster_input.tab") into cluster_parts -// -// script: -// """ -// if [ `echo ${clusterfile} | grep ".gz"` ]; then -// zcat ${clusterfile} > cluster_file -// D1_format_EX_clusters_input.pl cluster_file ${score_exon_hits_pairs} ${params.orthogroupnum} -// rm cluster_file -// else -// D1_format_EX_clusters_input.pl ${clusterfile} ${score_exon_hits_pairs} ${params.orthogroupnum} -// fi -// """ -// } -// -// /* -// * Split the file of exon pairs -// */ -// //Unclustered are the exons ending up in single-exon clusters -// process cluster_EXs { -// -// input: -// file(cluster_part) from cluster_parts.flatten() -// -// output: -// file("EXs_${cluster_part}") into ex_clusters -// file("unclustered_EXs_${cluster_part}") into unclustered_exs -// -// script: -// """ -// D2_cluster_EXs.R ${cluster_part} EXs_${cluster_part} unclustered_EXs_${cluster_part} -// """ -// } // // /* // * Final exon clusters diff --git a/modules/local/exorthist/cluster_exons.nf b/modules/local/exorthist/cluster_exons.nf new file mode 100644 index 0000000..d50f8b1 --- /dev/null +++ b/modules/local/exorthist/cluster_exons.nf @@ -0,0 +1,14 @@ +process CLUSTER_EXS { + label 'rscript' + input: + path cluster_part + + output: + path "EXs_${cluster_part.name}", emit: ex_clusters + path "unclustered_EXs_${cluster_part.name}", emit: unclustered_exs + + script: + """ + D2_cluster_EXs.R ${cluster_part} EXs_${cluster_part.name} unclustered_EXs_${cluster_part.name} + """ +} diff --git a/modules/local/exorthist/collapse_matches.nf b/modules/local/exorthist/collapse_matches.nf new file mode 100644 index 0000000..74ef11b --- /dev/null +++ b/modules/local/exorthist/collapse_matches.nf @@ -0,0 +1,19 @@ +process COLLAPSE_OVERLAPPING_MATCHES { + publishDir "${params.output}/", mode: "copy" + + input: + path scores + path bonafide_pairs + + output: + path "filtered_best_scored_EX_matches_by_targetgene-NoOverlap.tab", emit: score_exon_hits_pairs + path "overlapping_EXs_by_species.tab", emit: overlapping_exs + + script: + def bonafide = bonafide_pairs.name != 'NO_FILE' ? "-b ${bonafide_pairs}" : '' + """ + C3_count_matches_by_EX.pl ${scores} EX_matches_count_by_species.tab ${bonafide} + C4_get_overlapping_EXs.pl -i EX_matches_count_by_species.tab -o overlapping_EXs_by_species.tab + C5_collapse_overlapping_matches.pl overlapping_EXs_by_species.tab ${scores} filtered_best_scored_EX_matches_by_targetgene-NoOverlap.tab ${bonafide} + """ +} diff --git a/modules/local/exorthist/format_clusters.nf b/modules/local/exorthist/format_clusters.nf new file mode 100644 index 0000000..07ce575 --- /dev/null +++ b/modules/local/exorthist/format_clusters.nf @@ -0,0 +1,19 @@ +process FORMAT_EX_CLUSTERS_INPUT { + input: + path score_exon_hits_pairs + path clusterfile + + output: + path "PART_*-cluster_input.tab", emit: cluster_parts + + script: + """ + if [[ "${clusterfile}" == *.gz ]]; then + zcat ${clusterfile} > cluster_file + D1_format_EX_clusters_input.pl cluster_file ${score_exon_hits_pairs} ${params.orthogroupnum} + rm cluster_file + else + D1_format_EX_clusters_input.pl ${clusterfile} ${score_exon_hits_pairs} ${params.orthogroupnum} + fi + """ +} diff --git a/nextflow.config b/nextflow.config index 08d8270..175fecc 100644 --- a/nextflow.config +++ b/nextflow.config @@ -52,6 +52,11 @@ process { container = 'quay.io/biocontainers/pandas:1.5.2' } + withLabel: rscript { + container = 'rocker/r-ver:3.6.3' + } + + } process.container = 'biocorecrg/exon_intron_pipe:0.2'