moving more modules and new dockerfile

biocorecrg · Oct 11, 2024 · dcc302d · dcc302d
1 parent 3c12617
commit dcc302d
Show file tree

Hide file tree

Showing 8 changed files with 100 additions and 75 deletions.
diff --git a/bin/D2_cluster_EXs.R b/bin/D2_cluster_EXs.R
@@ -1,5 +1,4 @@
 #!/usr/bin/env Rscript
-#.libPaths(c("/software/mi/Rlib3.6/", .libPaths()))
 
 args<-commandArgs(TRUE)
 if (length(args)<3) {stop("[USAGE] Rscript --vanilla cluster.R <input_orthopairs> <output_orthogroup> <output_unclustered_exons>")}
@@ -41,7 +40,7 @@ for (my_gene_OG in all_gene_OG) {
         my_final_clusters$names = rownames(my_final_clusters)
         single_exon_clusters_tosave = rbind(single_exon_clusters_tosave, my_final_clusters[rownames(my_final_clusters) == exon_id,])
         my_final_clusters = my_final_clusters[!(rownames(my_final_clusters) == exon_id),] #remove the single-exon cluster from the final graph.
-        rownames(my_final_clusters) =  my_final_clusters$names;  my_final_clusters$names = NULL; 
+        rownames(my_final_clusters) =  my_final_clusters$names;  my_final_clusters$names = NULL;
     }
   }
   #save unclustered exons to file
@@ -69,7 +68,7 @@ for (my_gene_OG in all_gene_OG) {
     #build species count matrix
     all_exons_in_species = all_exons_in_cluster[sub("\\|", ";", all_exons_in_cluster) == my_species] #select only the exons_in_cluster belonging to species
     gene_species_vector = unique(sub("\\|.*\\|", ";", sub(".*;", "", sub("\\|", ";", all_exons_in_species)))) #"ENSG00000103266;Hs2" #select all the genes containing the exons_in_cluster belonging to species
-    cluster_gene_species_count_df = as.data.frame(table(sub(".*;", "", gene_species_vector))); colnames(cluster_gene_species_count_df) = c("Species", "Freq") 
+    cluster_gene_species_count_df = as.data.frame(table(sub(".*;", "", gene_species_vector))); colnames(cluster_gene_species_count_df) = c("Species", "Freq")
     cluster_gene_species_count_df$ExonID = rep(my_exon, nrow(cluster_gene_species_count_df))
     gene_species_count_matrix = rbind(gene_species_count_matrix, cluster_gene_species_count_df[,c("ExonID", "Freq")])
   }
@@ -96,7 +95,7 @@ for (my_gene_OG in all_gene_OG) {
 
   #Compute the number of exons for each species in each community
   all_ids = paste0(gsub(".*\\|", "", rownames(my_final_clusters)), "_", my_final_clusters$ClusterID)
-  all_species_counts_df = as.data.frame(table(gsub(".*\\|", "", all_ids))); colnames(all_species_counts_df) = c("Species_ClusterID", "Freq"); 
+  all_species_counts_df = as.data.frame(table(gsub(".*\\|", "", all_ids))); colnames(all_species_counts_df) = c("Species_ClusterID", "Freq");
   all_species_counts_df$Species_ClusterID = as.vector(all_species_counts_df$Species_ClusterID) #transform factor to vector for hashmap to work
 
   #Create hashes for translation
@@ -117,7 +116,7 @@ for (my_gene_OG in all_gene_OG) {
   my_final_clusters$SPECIES_genes_in_cluster = gene_species_count_dict$find(my_final_clusters$ExonID)
 
   ######## Compute membership score #############
-  my_final_clusters$membership_score = (my_final_clusters$In_degree + my_final_clusters$Out_degree + my_final_clusters$N_reciprocals)/(2*(my_final_clusters$TOT_exons_in_cluster - my_final_clusters$SPECIES_exons_in_cluster) + 
+  my_final_clusters$membership_score = (my_final_clusters$In_degree + my_final_clusters$Out_degree + my_final_clusters$N_reciprocals)/(2*(my_final_clusters$TOT_exons_in_cluster - my_final_clusters$SPECIES_exons_in_cluster) +
                                                                                                                                        (TOT_genes_in_cluster - my_final_clusters$SPECIES_genes_in_cluster))
 
   my_final_clusters$membership_score = round(my_final_clusters$membership_score, 2)

diff --git a/docker/Rscript/Dockerfile b/docker/Rscript/Dockerfile
@@ -0,0 +1,7 @@
+FROM rocker/r-ver:3.6.3
+
+RUN apt-get -y update && apt-get install -y libnlopt-dev \
+  && rm -rf /var/lib/apt/lists/*
+
+COPY deps.R /tmp
+RUN Rscript /tmp/deps.R > /tmp/deps.log
diff --git a/docker/Rscript/deps.R b/docker/Rscript/deps.R
@@ -0,0 +1,8 @@
+install.packages('devtools', dependencies=TRUE, repos='cran.rstudio.com/');
+install.packages(c('igraph'), repos='http://cran.us.r-project.org');
+install.packages(c('ggplot2'), repos='http://cran.us.r-project.org');
+install.packages(c('nloptr'), repos='http://cran.us.r-project.org');
+install.packages(c('pbkrtest', 'lme4', 'pbkrtest', 'lme4'), repos='http://cran.us.r-project.org');
+install.packages(c('ggpubr', 'cowplot'), repos='http://cran.us.r-project.org');
+install.packages(c('hashmap'), repos='http://cran.us.r-project.org');
+
diff --git a/main.nf b/main.nf
@@ -96,7 +96,10 @@ if ( !blosumfile.exists() ) exit 1, "Missing blosum file: ${blosumfile}!"
 LOCAL_MODULES='./modules/local/exorthist'
 
 include { CHECK_INPUT } from "${LOCAL_MODULES}/check_input.nf"
+include { CLUSTER_EXS } from "${LOCAL_MODULES}/cluster_exons.nf"
+include { COLLAPSE_OVERLAPPING_MATCHES } from "${LOCAL_MODULES}/collapse_matches.nf"
 include { FILTER_AND_SELECT_BEST_EX_MATCHES_BY_TARGETGENE } from "${LOCAL_MODULES}/filter_matches.nf"
+include { FORMAT_EX_CLUSTERS_INPUT } from "${LOCAL_MODULES}/format_clusters.nf"
 include { GENERATE_ANNOTATIONS } from "${LOCAL_MODULES}/generate_annotations.nf"
 include { JOIN_FILTERED_EX_MATCHES } from "${LOCAL_MODULES}/join_matches.nf"
 include { MERGE_PROT_EX_INT_ALN_INFO } from "${LOCAL_MODULES}/merge_aligns.nf"
@@ -230,9 +233,24 @@ workflow {
     // Filter the best matches above score cutoffs by target gene.
     all_scores_to_filt_ch = SCORE_EX_MATCHES.out.all_scores_to_filt
     FILTER_AND_SELECT_BEST_EX_MATCHES_BY_TARGETGENE(all_scores_to_filt_ch.join(dist_ranges_ch))
-    //  join filtered scored EX matches
+    //  Join filtered scored EX matches
     filterscore_per_joining_ch = FILTER_AND_SELECT_BEST_EX_MATCHES_BY_TARGETGENE.out.filterscore_per_joining
     JOIN_FILTERED_EX_MATCHES(filterscore_per_joining_ch)
+    //  Removing matches from overlapping exons
+
+    filtered_all_scores = JOIN_FILTERED_EX_MATCHES.out.filtered_all_scores
+    // Sic: https://nextflow-io.github.io/patterns/optional-input/
+    if ( params.bonafide_pairs ) {
+        COLLAPSE_OVERLAPPING_MATCHES(filtered_all_scores, params.bonafide_pairs)
+    } else {
+        COLLAPSE_OVERLAPPING_MATCHES(filtered_all_scores, file("/path/to/NO_FILE"))
+    }
+    score_exon_hits_pairs = COLLAPSE_OVERLAPPING_MATCHES.out.score_exon_hits_pairs
+    FORMAT_EX_CLUSTERS_INPUT(score_exon_hits_pairs, file(params.cluster))
+    // Split the file of exon pairs
+    // Unclustered are the exons ending up in single-exon clusters
+    cluster_parts = FORMAT_EX_CLUSTERS_INPUT.out.cluster_parts
+    CLUSTER_EXS(cluster_parts)
 
     // Review outputs below
     CHECK_INPUT.out.run_info.view()
@@ -254,77 +272,13 @@ workflow {
     FILTER_AND_SELECT_BEST_EX_MATCHES_BY_TARGETGENE.out.filterscore_per_joining.view()
     FILTER_AND_SELECT_BEST_EX_MATCHES_BY_TARGETGENE.out.best_scored_matches.view()
     JOIN_FILTERED_EX_MATCHES.out.filtered_all_scores.view()
+    COLLAPSE_OVERLAPPING_MATCHES.out.score_exon_hits_pairs.view()
+    COLLAPSE_OVERLAPPING_MATCHES.out.overlapping_exs.view()
+    FORMAT_EX_CLUSTERS_INPUT.out.cluster_parts.view()
+    CLUSTER_EXS.out.unclustered_exs.view()
+    CLUSTER_EXS.out.ex_clusters.view()
 }
 
-//
-// /*
-//  * Removing matches from overlapping exons
-//  */
-// process collapse_overlapping_matches {
-//     publishDir "${params.output}/", mode: "copy"
-//     input:
-//     file(scores) from filtered_all_scores
-//
-//     output:
-//     file("filtered_best_scored_EX_matches_by_targetgene-NoOverlap.tab") into (score_exon_hits_pairs, exon_pairs_for_reclustering)
-//     file("overlapping_EXs_by_species.tab")
-//
-// 	script:
-// 	bonafide = ""
-// 	if (params.bonafide_pairs != "") {
-// 		bonafide = file(params.bonafide_pairs)
-// 		if ( !bonafide.exists() ) exit 1, "Missing file with bonafide pairs: ${bonafide}!"
-//
-// 	}
-// 	"""
-//     C3_count_matches_by_EX.pl ${scores} EX_matches_count_by_species.tab ${bonafide}
-//     C4_get_overlapping_EXs.pl -i EX_matches_count_by_species.tab -o overlapping_EXs_by_species.tab
-//     C5_collapse_overlapping_matches.pl overlapping_EXs_by_species.tab ${scores} filtered_best_scored_EX_matches_by_targetgene-NoOverlap.tab ${bonafide}
-//     """
-// }
-//
-// /*
-// * Split the file of exon pairs
-// */
-// clusterfile = file(params.cluster)
-// process format_EX_clusters_input {
-//     input:
-//     file(score_exon_hits_pairs)
-//     file(clusterfile)
-//
-//     output:
-//     file("PART_*-cluster_input.tab") into cluster_parts
-//
-// 	script:
-// 	"""
-//    if [ `echo ${clusterfile} | grep ".gz"` ]; then
-//        zcat ${clusterfile} > cluster_file
-//        D1_format_EX_clusters_input.pl cluster_file ${score_exon_hits_pairs} ${params.orthogroupnum}
-//        rm cluster_file
-//     else
-//        D1_format_EX_clusters_input.pl ${clusterfile} ${score_exon_hits_pairs} ${params.orthogroupnum}
-//     fi
-// 	"""
-// }
-//
-// /*
-// * Split the file of exon pairs
-// */
-// //Unclustered are the exons ending up in single-exon clusters
-// process cluster_EXs {
-//
-//     input:
-//     file(cluster_part) from cluster_parts.flatten()
-//
-//     output:
-//     file("EXs_${cluster_part}") into ex_clusters
-//     file("unclustered_EXs_${cluster_part}") into unclustered_exs
-//
-// 	script:
-// 	"""
-//     D2_cluster_EXs.R ${cluster_part} EXs_${cluster_part} unclustered_EXs_${cluster_part}
-//     """
-// }
 //
 // /*
 // * Final exon clusters

diff --git a/modules/local/exorthist/cluster_exons.nf b/modules/local/exorthist/cluster_exons.nf
@@ -0,0 +1,14 @@
+process CLUSTER_EXS {
+    label 'rscript'
+    input:
+    path cluster_part
+
+    output:
+    path "EXs_${cluster_part.name}", emit: ex_clusters
+    path "unclustered_EXs_${cluster_part.name}", emit: unclustered_exs
+
+    script:
+    """
+    D2_cluster_EXs.R ${cluster_part} EXs_${cluster_part.name} unclustered_EXs_${cluster_part.name}
+    """
+}
diff --git a/modules/local/exorthist/collapse_matches.nf b/modules/local/exorthist/collapse_matches.nf
@@ -0,0 +1,19 @@
+process COLLAPSE_OVERLAPPING_MATCHES {
+    publishDir "${params.output}/", mode: "copy"
+
+    input:
+    path scores
+    path bonafide_pairs
+
+    output:
+    path "filtered_best_scored_EX_matches_by_targetgene-NoOverlap.tab", emit: score_exon_hits_pairs
+    path "overlapping_EXs_by_species.tab", emit: overlapping_exs
+
+    script:
+    def bonafide = bonafide_pairs.name != 'NO_FILE' ? "-b ${bonafide_pairs}" : ''
+    """
+    C3_count_matches_by_EX.pl ${scores} EX_matches_count_by_species.tab ${bonafide}
+    C4_get_overlapping_EXs.pl -i EX_matches_count_by_species.tab -o overlapping_EXs_by_species.tab
+    C5_collapse_overlapping_matches.pl overlapping_EXs_by_species.tab ${scores} filtered_best_scored_EX_matches_by_targetgene-NoOverlap.tab ${bonafide}
+    """
+}
diff --git a/modules/local/exorthist/format_clusters.nf b/modules/local/exorthist/format_clusters.nf
@@ -0,0 +1,19 @@
+process FORMAT_EX_CLUSTERS_INPUT {
+    input:
+    path score_exon_hits_pairs
+    path clusterfile
+
+    output:
+    path "PART_*-cluster_input.tab", emit: cluster_parts
+
+    script:
+    """
+    if [[ "${clusterfile}" == *.gz ]]; then
+        zcat ${clusterfile} > cluster_file
+        D1_format_EX_clusters_input.pl cluster_file ${score_exon_hits_pairs} ${params.orthogroupnum}
+        rm cluster_file
+    else
+        D1_format_EX_clusters_input.pl ${clusterfile} ${score_exon_hits_pairs} ${params.orthogroupnum}
+    fi
+    """
+}
diff --git a/nextflow.config b/nextflow.config
@@ -52,6 +52,11 @@ process {
      container = 'quay.io/biocontainers/pandas:1.5.2'
     }
 
+    withLabel: rscript {
+     container = 'rocker/r-ver:3.6.3'
+    }
+
+
 }
 
 process.container = 'biocorecrg/exon_intron_pipe:0.2'