Skip to content

Commit

Permalink
moving more modules and new dockerfile
Browse files Browse the repository at this point in the history
  • Loading branch information
toniher committed Oct 11, 2024
1 parent 3c12617 commit dcc302d
Show file tree
Hide file tree
Showing 8 changed files with 100 additions and 75 deletions.
9 changes: 4 additions & 5 deletions bin/D2_cluster_EXs.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/env Rscript
#.libPaths(c("/software/mi/Rlib3.6/", .libPaths()))

args<-commandArgs(TRUE)
if (length(args)<3) {stop("[USAGE] Rscript --vanilla cluster.R <input_orthopairs> <output_orthogroup> <output_unclustered_exons>")}
Expand Down Expand Up @@ -41,7 +40,7 @@ for (my_gene_OG in all_gene_OG) {
my_final_clusters$names = rownames(my_final_clusters)
single_exon_clusters_tosave = rbind(single_exon_clusters_tosave, my_final_clusters[rownames(my_final_clusters) == exon_id,])
my_final_clusters = my_final_clusters[!(rownames(my_final_clusters) == exon_id),] #remove the single-exon cluster from the final graph.
rownames(my_final_clusters) = my_final_clusters$names; my_final_clusters$names = NULL;
rownames(my_final_clusters) = my_final_clusters$names; my_final_clusters$names = NULL;
}
}
#save unclustered exons to file
Expand Down Expand Up @@ -69,7 +68,7 @@ for (my_gene_OG in all_gene_OG) {
#build species count matrix
all_exons_in_species = all_exons_in_cluster[sub("\\|", ";", all_exons_in_cluster) == my_species] #select only the exons_in_cluster belonging to species
gene_species_vector = unique(sub("\\|.*\\|", ";", sub(".*;", "", sub("\\|", ";", all_exons_in_species)))) #"ENSG00000103266;Hs2" #select all the genes containing the exons_in_cluster belonging to species
cluster_gene_species_count_df = as.data.frame(table(sub(".*;", "", gene_species_vector))); colnames(cluster_gene_species_count_df) = c("Species", "Freq")
cluster_gene_species_count_df = as.data.frame(table(sub(".*;", "", gene_species_vector))); colnames(cluster_gene_species_count_df) = c("Species", "Freq")
cluster_gene_species_count_df$ExonID = rep(my_exon, nrow(cluster_gene_species_count_df))
gene_species_count_matrix = rbind(gene_species_count_matrix, cluster_gene_species_count_df[,c("ExonID", "Freq")])
}
Expand All @@ -96,7 +95,7 @@ for (my_gene_OG in all_gene_OG) {

#Compute the number of exons for each species in each community
all_ids = paste0(gsub(".*\\|", "", rownames(my_final_clusters)), "_", my_final_clusters$ClusterID)
all_species_counts_df = as.data.frame(table(gsub(".*\\|", "", all_ids))); colnames(all_species_counts_df) = c("Species_ClusterID", "Freq");
all_species_counts_df = as.data.frame(table(gsub(".*\\|", "", all_ids))); colnames(all_species_counts_df) = c("Species_ClusterID", "Freq");
all_species_counts_df$Species_ClusterID = as.vector(all_species_counts_df$Species_ClusterID) #transform factor to vector for hashmap to work

#Create hashes for translation
Expand All @@ -117,7 +116,7 @@ for (my_gene_OG in all_gene_OG) {
my_final_clusters$SPECIES_genes_in_cluster = gene_species_count_dict$find(my_final_clusters$ExonID)

######## Compute membership score #############
my_final_clusters$membership_score = (my_final_clusters$In_degree + my_final_clusters$Out_degree + my_final_clusters$N_reciprocals)/(2*(my_final_clusters$TOT_exons_in_cluster - my_final_clusters$SPECIES_exons_in_cluster) +
my_final_clusters$membership_score = (my_final_clusters$In_degree + my_final_clusters$Out_degree + my_final_clusters$N_reciprocals)/(2*(my_final_clusters$TOT_exons_in_cluster - my_final_clusters$SPECIES_exons_in_cluster) +
(TOT_genes_in_cluster - my_final_clusters$SPECIES_genes_in_cluster))

my_final_clusters$membership_score = round(my_final_clusters$membership_score, 2)
Expand Down
7 changes: 7 additions & 0 deletions docker/Rscript/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
FROM rocker/r-ver:3.6.3

RUN apt-get -y update && apt-get install -y libnlopt-dev \
&& rm -rf /var/lib/apt/lists/*

COPY deps.R /tmp
RUN Rscript /tmp/deps.R > /tmp/deps.log
8 changes: 8 additions & 0 deletions docker/Rscript/deps.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
install.packages('devtools', dependencies=TRUE, repos='cran.rstudio.com/');
install.packages(c('igraph'), repos='http://cran.us.r-project.org');
install.packages(c('ggplot2'), repos='http://cran.us.r-project.org');
install.packages(c('nloptr'), repos='http://cran.us.r-project.org');
install.packages(c('pbkrtest', 'lme4', 'pbkrtest', 'lme4'), repos='http://cran.us.r-project.org');
install.packages(c('ggpubr', 'cowplot'), repos='http://cran.us.r-project.org');
install.packages(c('hashmap'), repos='http://cran.us.r-project.org');

94 changes: 24 additions & 70 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,10 @@ if ( !blosumfile.exists() ) exit 1, "Missing blosum file: ${blosumfile}!"
LOCAL_MODULES='./modules/local/exorthist'

include { CHECK_INPUT } from "${LOCAL_MODULES}/check_input.nf"
include { CLUSTER_EXS } from "${LOCAL_MODULES}/cluster_exons.nf"
include { COLLAPSE_OVERLAPPING_MATCHES } from "${LOCAL_MODULES}/collapse_matches.nf"
include { FILTER_AND_SELECT_BEST_EX_MATCHES_BY_TARGETGENE } from "${LOCAL_MODULES}/filter_matches.nf"
include { FORMAT_EX_CLUSTERS_INPUT } from "${LOCAL_MODULES}/format_clusters.nf"
include { GENERATE_ANNOTATIONS } from "${LOCAL_MODULES}/generate_annotations.nf"
include { JOIN_FILTERED_EX_MATCHES } from "${LOCAL_MODULES}/join_matches.nf"
include { MERGE_PROT_EX_INT_ALN_INFO } from "${LOCAL_MODULES}/merge_aligns.nf"
Expand Down Expand Up @@ -230,9 +233,24 @@ workflow {
// Filter the best matches above score cutoffs by target gene.
all_scores_to_filt_ch = SCORE_EX_MATCHES.out.all_scores_to_filt
FILTER_AND_SELECT_BEST_EX_MATCHES_BY_TARGETGENE(all_scores_to_filt_ch.join(dist_ranges_ch))
// join filtered scored EX matches
// Join filtered scored EX matches
filterscore_per_joining_ch = FILTER_AND_SELECT_BEST_EX_MATCHES_BY_TARGETGENE.out.filterscore_per_joining
JOIN_FILTERED_EX_MATCHES(filterscore_per_joining_ch)
// Removing matches from overlapping exons

filtered_all_scores = JOIN_FILTERED_EX_MATCHES.out.filtered_all_scores
// Sic: https://nextflow-io.github.io/patterns/optional-input/
if ( params.bonafide_pairs ) {
COLLAPSE_OVERLAPPING_MATCHES(filtered_all_scores, params.bonafide_pairs)
} else {
COLLAPSE_OVERLAPPING_MATCHES(filtered_all_scores, file("/path/to/NO_FILE"))
}
score_exon_hits_pairs = COLLAPSE_OVERLAPPING_MATCHES.out.score_exon_hits_pairs
FORMAT_EX_CLUSTERS_INPUT(score_exon_hits_pairs, file(params.cluster))
// Split the file of exon pairs
// Unclustered are the exons ending up in single-exon clusters
cluster_parts = FORMAT_EX_CLUSTERS_INPUT.out.cluster_parts
CLUSTER_EXS(cluster_parts)

// Review outputs below
CHECK_INPUT.out.run_info.view()
Expand All @@ -254,77 +272,13 @@ workflow {
FILTER_AND_SELECT_BEST_EX_MATCHES_BY_TARGETGENE.out.filterscore_per_joining.view()
FILTER_AND_SELECT_BEST_EX_MATCHES_BY_TARGETGENE.out.best_scored_matches.view()
JOIN_FILTERED_EX_MATCHES.out.filtered_all_scores.view()
COLLAPSE_OVERLAPPING_MATCHES.out.score_exon_hits_pairs.view()
COLLAPSE_OVERLAPPING_MATCHES.out.overlapping_exs.view()
FORMAT_EX_CLUSTERS_INPUT.out.cluster_parts.view()
CLUSTER_EXS.out.unclustered_exs.view()
CLUSTER_EXS.out.ex_clusters.view()
}

//
// /*
// * Removing matches from overlapping exons
// */
// process collapse_overlapping_matches {
// publishDir "${params.output}/", mode: "copy"
// input:
// file(scores) from filtered_all_scores
//
// output:
// file("filtered_best_scored_EX_matches_by_targetgene-NoOverlap.tab") into (score_exon_hits_pairs, exon_pairs_for_reclustering)
// file("overlapping_EXs_by_species.tab")
//
// script:
// bonafide = ""
// if (params.bonafide_pairs != "") {
// bonafide = file(params.bonafide_pairs)
// if ( !bonafide.exists() ) exit 1, "Missing file with bonafide pairs: ${bonafide}!"
//
// }
// """
// C3_count_matches_by_EX.pl ${scores} EX_matches_count_by_species.tab ${bonafide}
// C4_get_overlapping_EXs.pl -i EX_matches_count_by_species.tab -o overlapping_EXs_by_species.tab
// C5_collapse_overlapping_matches.pl overlapping_EXs_by_species.tab ${scores} filtered_best_scored_EX_matches_by_targetgene-NoOverlap.tab ${bonafide}
// """
// }
//
// /*
// * Split the file of exon pairs
// */
// clusterfile = file(params.cluster)
// process format_EX_clusters_input {
// input:
// file(score_exon_hits_pairs)
// file(clusterfile)
//
// output:
// file("PART_*-cluster_input.tab") into cluster_parts
//
// script:
// """
// if [ `echo ${clusterfile} | grep ".gz"` ]; then
// zcat ${clusterfile} > cluster_file
// D1_format_EX_clusters_input.pl cluster_file ${score_exon_hits_pairs} ${params.orthogroupnum}
// rm cluster_file
// else
// D1_format_EX_clusters_input.pl ${clusterfile} ${score_exon_hits_pairs} ${params.orthogroupnum}
// fi
// """
// }
//
// /*
// * Split the file of exon pairs
// */
// //Unclustered are the exons ending up in single-exon clusters
// process cluster_EXs {
//
// input:
// file(cluster_part) from cluster_parts.flatten()
//
// output:
// file("EXs_${cluster_part}") into ex_clusters
// file("unclustered_EXs_${cluster_part}") into unclustered_exs
//
// script:
// """
// D2_cluster_EXs.R ${cluster_part} EXs_${cluster_part} unclustered_EXs_${cluster_part}
// """
// }
//
// /*
// * Final exon clusters
Expand Down
14 changes: 14 additions & 0 deletions modules/local/exorthist/cluster_exons.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
process CLUSTER_EXS {
label 'rscript'
input:
path cluster_part

output:
path "EXs_${cluster_part.name}", emit: ex_clusters
path "unclustered_EXs_${cluster_part.name}", emit: unclustered_exs

script:
"""
D2_cluster_EXs.R ${cluster_part} EXs_${cluster_part.name} unclustered_EXs_${cluster_part.name}
"""
}
19 changes: 19 additions & 0 deletions modules/local/exorthist/collapse_matches.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
process COLLAPSE_OVERLAPPING_MATCHES {
publishDir "${params.output}/", mode: "copy"

input:
path scores
path bonafide_pairs

output:
path "filtered_best_scored_EX_matches_by_targetgene-NoOverlap.tab", emit: score_exon_hits_pairs
path "overlapping_EXs_by_species.tab", emit: overlapping_exs

script:
def bonafide = bonafide_pairs.name != 'NO_FILE' ? "-b ${bonafide_pairs}" : ''
"""
C3_count_matches_by_EX.pl ${scores} EX_matches_count_by_species.tab ${bonafide}
C4_get_overlapping_EXs.pl -i EX_matches_count_by_species.tab -o overlapping_EXs_by_species.tab
C5_collapse_overlapping_matches.pl overlapping_EXs_by_species.tab ${scores} filtered_best_scored_EX_matches_by_targetgene-NoOverlap.tab ${bonafide}
"""
}
19 changes: 19 additions & 0 deletions modules/local/exorthist/format_clusters.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
process FORMAT_EX_CLUSTERS_INPUT {
input:
path score_exon_hits_pairs
path clusterfile

output:
path "PART_*-cluster_input.tab", emit: cluster_parts

script:
"""
if [[ "${clusterfile}" == *.gz ]]; then
zcat ${clusterfile} > cluster_file
D1_format_EX_clusters_input.pl cluster_file ${score_exon_hits_pairs} ${params.orthogroupnum}
rm cluster_file
else
D1_format_EX_clusters_input.pl ${clusterfile} ${score_exon_hits_pairs} ${params.orthogroupnum}
fi
"""
}
5 changes: 5 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@ process {
container = 'quay.io/biocontainers/pandas:1.5.2'
}

withLabel: rscript {
container = 'rocker/r-ver:3.6.3'
}


}

process.container = 'biocorecrg/exon_intron_pipe:0.2'
Expand Down

0 comments on commit dcc302d

Please sign in to comment.