Skip to content

Commit

Permalink
v1 build
Browse files Browse the repository at this point in the history
  • Loading branch information
drewjbeh committed Feb 18, 2022
1 parent 0e702ee commit 16a5f1b
Show file tree
Hide file tree
Showing 134 changed files with 139,241 additions and 0 deletions.
Empty file added build/lib/mimseq/__init__.py
Empty file.
153 changes: 153 additions & 0 deletions build/lib/mimseq/ccaPlots.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
#!/usr/bin/Rscript
#!/usr/bin/env Rscript

# Plot 3' dinucleotide occurence of aligned reads
# Data table generated in mmQuant module

suppressMessages(library(ggplot2))
suppressMessages(library(dplyr))
suppressMessages(library(rlang))
suppressMessages(library(grid))
suppressMessages(library(gtable))
suppressMessages(library(ggpol))

args = commandArgs(trailingOnly = TRUE)

# source facet_share.R
# suppressMessages(source(args[5]))

if (length(args) == 0) {
stop("At least one argument must be supplied (input file).n", call.=FALSE)

} else if (length(args)>0) {
out = args[3]
double_cca = args[4]
if (double_cca == "True"){
out_string = "double_ccaPlot.pdf"
} else {
out_string = "ccaPlot.pdf"
}

dinuc = read.table(args[1], header = TRUE, sep = "\t", na.strings="")
dinuc = dinuc[!grepl("N", dinuc$dinuc),]
dinuc$color = "grey"
dinuc$color[dinuc$dinuc == "CC"] = "red"
dinuc$color[dinuc$dinuc == "CA"] = "green"

dinuc_plot = ggplot(dinuc, aes(x=dinuc, y=proportion)) +
geom_bar(stat="identity", aes(fill=color)) +
facet_wrap(~sample, ncol=3) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8),
legend.position="none") +
scale_fill_manual(values = c("#66828D","#737373","#A05C45")) +
xlab("3' dinucleotide") +
ylab("Proportion")
ggsave(paste(out, "dinuc_plot.pdf", sep = ''), dinuc_plot, height=5, width=10)

cca_counts = read.table(args[2], header = TRUE, sep = "\t")
cca_counts$gene = sub(".*_mito_tRNA-","mito",cca_counts$gene)
cca_counts$gene = sub(".*_nmt_tRNA-","nmt",cca_counts$gene)
cca_counts$gene = sub(".*_tRNA-","",cca_counts$gene)
cca_counts$gene = ifelse(cca_counts$gene == "eColiLys-TTT-1-1", "eColiLys", cca_counts$gene)
cca_counts$gene = gsub("/[0-9].*", "-multi", cca_counts$gene)

cca_prop = cca_counts %>% group_by(gene,sample) %>%
mutate(countT = sum(count)) %>%
group_by(end, .add = TRUE) %>%
mutate(per = round(100*count/countT,2))

cca_summary = aggregate(cca_prop$per, by=list(gene = cca_prop$gene,
end = cca_prop$end,
condition = cca_prop$condition),
function(x) c(mean = mean(x), sd = sd(x)))
cca_summary = do.call("data.frame", cca_summary)

if (length(unique(cca_counts$condition)) > 1) {
combinations = combn(unique(cca_counts$condition), 2, simplify = FALSE)

# For each contrast...
for (i in 1:length(combinations)) {
cca_summary_sub = subset(cca_summary, condition %in% combinations[[i]])
cca_summary_sub = cca_summary_sub %>%
mutate(x.mean = ifelse(condition == cca_summary_sub$condition[1],
x.mean * -1,
x.mean))

cca_prop_sub = subset(cca_prop, condition %in% combinations[[i]])
cca_prop_sub = cca_prop_sub %>%
mutate(per = ifelse(condition == cca_summary_sub$condition[1],
per * -1,
per))

cca_prop_sub$bar_pos = NA
cca_prop_sub$bar_pos[cca_prop_sub$end == "Absent"] = cca_prop_sub$per[cca_prop_sub$end == "Absent"]
cca_prop_sub$bar_pos[cca_prop_sub$end == "C"] = cca_prop_sub$per[cca_prop_sub$end == "Absent"]+
cca_prop_sub$per[cca_prop_sub$end == "C"]
cca_prop_sub$bar_pos[cca_prop_sub$end == "CC"] = cca_prop_sub$per[cca_prop_sub$end == "Absent"]+
cca_prop_sub$per[cca_prop_sub$end == "CC"] + cca_prop_sub$per[cca_prop_sub$end == "C"]
cca_prop_sub$bar_pos[cca_prop_sub$end == "CA"] = cca_prop_sub$per[cca_prop_sub$end == "Absent"] +
cca_prop_sub$per[cca_prop_sub$end == "CA"] + cca_prop_sub$per[cca_prop_sub$end == "CC"] +
cca_prop_sub$per[cca_prop_sub$end == "C"]

avg_cca = aggregate(cca_summary_sub$x.mean, by = list(condition = cca_summary_sub$condition, end = cca_summary_sub$end), mean)
cca_summary_sub$end = factor(cca_summary_sub$end, levels = c("CA", "CC", "C", "Absent"))

cca_plot = ggplot(cca_summary_sub, aes(x = gene, y = x.mean, fill = end)) +
geom_bar(stat = 'identity', width = 0.8) +
geom_hline(data = subset(avg_cca, end == "CA"), aes(yintercept=x), color = "white", alpha = 0.9) +
geom_jitter(data = cca_prop_sub[cca_prop_sub$end == "CC",], aes(x = gene, y = bar_pos), size = 0.5, color = "#383D3B", alpha = 0.7) +
geom_text(data = subset(avg_cca, end == 'CA'), aes(label = paste(abs(round(x,1)), "%"), x = Inf, y = ifelse(sign(x) == -1 , x + 7, x - 7)), size = 3.3, vjust = 1.5, color = '#3E606F', fontface='bold') +
facet_share(~condition, dir = "h", scales = "free", reverse_num = TRUE) +
coord_flip() +
scale_fill_manual(name = "", values = alpha(c(CA = "#F0F9ED", CC = "#427AA1", C = "#0D4282", Absent = "#133C55"), 0.8), labels = c("3'-CCA", "3'-CC", "3'-C", "Absent")) +
scale_y_continuous(breaks = c(c(-100, -75, -50, -25, 0), c(0, 25, 50, 75, 100)))+
scale_x_discrete(expand = c(0.03, 0)) +
theme_bw() +
theme(axis.title = element_blank(),
strip.text = element_text(face = "bold"),
axis.text.y = element_text(size = 9),
axis.text.x = element_text(face = "bold"),
panel.border = element_blank(),
panel.background = element_blank())

ggsave(paste(out, paste(combinations[[i]][1], combinations[[i]][2], out_string, sep = "_"), sep = ""), cca_plot, height = 8, width = 9)

}

} else {
cca_summary_sub = cca_summary
cca_prop$bar_pos = NA
cca_prop$bar_pos[cca_prop$end == "Absent"] = cca_prop$per[cca_prop$end == "Absent"]
cca_prop$bar_pos[cca_prop$end == "C"] = cca_prop$per[cca_prop$end == "Absent"]+
cca_prop$per[cca_prop$end == "C"]
cca_prop$bar_pos[cca_prop$end == "CC"] = cca_prop$per[cca_prop$end == "Absent"]+
cca_prop$per[cca_prop$end == "CC"] + cca_prop$per[cca_prop$end == "C"]
cca_prop$bar_pos[cca_prop$end == "CA"] = cca_prop$per[cca_prop$end == "Absent"] +
cca_prop$per[cca_prop$end == "CA"] + cca_prop$per[cca_prop$end == "CC"] +
cca_prop$per[cca_prop$end == "C"]

avg_cca = aggregate(cca_summary_sub$x.mean, by = list(condition = cca_summary_sub$condition, end = cca_summary_sub$end), mean)
cca_summary_sub$end = factor(cca_summary_sub$end, levels = c('CA', 'CC', 'C', 'Absent'))

cca_plot = ggplot(cca_summary_sub, aes(x = gene, y = x.mean, fill = end)) +
geom_bar(stat = "identity", width = 0.8) +
geom_hline(data = subset(avg_cca, end == 'CA'), aes(yintercept=x), color = "white", alpha = 0.9) +
geom_jitter(data = cca_prop[cca_prop$end == "CC",], aes(x = gene, y = bar_pos), size = 0.5, color = "#383D3B", alpha = 0.7) +
geom_text(data = subset(avg_cca, end == "CA"), aes(label = paste(abs(round(x,1)), '%'), x = Inf, y = x), size = 3.3, vjust = 1, color = '#3E606F', fontface='bold') +
#facet_share(~condition, dir = "h", scales = "free", reverse_num = TRUE) +
coord_flip() +
scale_fill_manual(name = "", values = alpha(c(CA = "#F0F9ED", CC = "#427AA1", C = "#0D4282", Absent = "#133C55"), 0.8), labels = c("3'-CCA", "3'-CC", "3'-C", "Absent")) +
scale_y_continuous(breaks = c(c(-100, -75, -50, -25, 0), c(0, 25, 50, 75, 100)))+
scale_x_discrete(expand = c(0.03, 0)) +
theme_bw() +
theme(axis.title = element_blank(),
strip.text = element_text(face = "bold"),
axis.text.y = element_text(size = 9),
axis.text.x = element_text(face = "bold"),
panel.border = element_blank(),
panel.background = element_blank())

ggsave(paste(out, out_string, sep = ""), cca_plot, height = 8, width = 9)

}
}
149 changes: 149 additions & 0 deletions build/lib/mimseq/coveragePlot.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
#! /usr/bin/Rscript

# Plot tRNA coverage by isoacceptor
# Accepts coverage files generated by getCoverage.py

suppressMessages(library(ggplot2))
suppressMessages(library(RColorBrewer))
suppressMessages(library(plyr))
suppressMessages(library(gridExtra))

args = commandArgs(trailingOnly = TRUE)

aa_groups = data.frame(aa = c("Gly","Ala","Val","Leu","Met","iMet","Ile","Ser","Thr","Cys","SeC","Pro","Asn","Gln","Phe","Tyr","Trp","Lys","Arg","His","Asp","Glu","Und","Undet","Sup"),
group = c(rep("Nonpolar, aliphatic",7), rep("Polar, uncharged",7), rep("Aromatic",3), rep("Positively charged",3), rep("Negatively charged",2), rep("Other",3)))

if (length(args)==0) {
stop("At least one argument must be supplied (input file).n", call.=FALSE)

} else if (length(args)>0) {

out_dir = args[3]
mito_trnas = args[5]
if (mito_trnas == ''){
mito_trnas = NA
}
sorted_aa = args[4]
sorted_aa = unlist(strsplit(sorted_aa, "_"))

## Aggregated coverage plots for all Isoacceptors
cov_byaa = read.table(args[2], header=TRUE, sep = "\t")
cov_byaa = cov_byaa[complete.cases(cov_byaa),]
cov_byaa$bam = gsub(".unpaired_uniq.bam","",cov_byaa$bam)
cov_byaa$bam = gsub("(.*/).*?","\\2",cov_byaa$bam)
cov_byaa = cov_byaa[!grepl("eColi", cov_byaa$aa),]
cov_byaa$aa = factor(cov_byaa$aa, levels = sorted_aa)
cyt_colourCount = length(unique(subset(cov_byaa$aa, !grepl("mito", cov_byaa$aa) & !grepl("nmt", cov_byaa$aa))))
mit_colourCount = length(unique(subset(cov_byaa$aa, grepl("mito", cov_byaa$aa) | grepl("nmt", cov_byaa$aa))))
facetCount = length(unique(cov_byaa$bam))
getPalette = colorRampPalette(brewer.pal(10, 'Paired'))

cov_byaa_norm = ggplot(subset(cov_byaa, !grepl("mito", cov_byaa$aa) & !grepl("nmt", cov_byaa$aa)), aes(x = bin, y = cov_norm, fill = aa, group = aa)) +
geom_bar(stat = "identity", alpha = 0.8) +
facet_wrap(~bam, ncol = 4) +
xlab("Gene (%)") +
ylab("Normalised coverage (coverage/uniquely mapped reads)") +
labs(fill = "Isotype") +
guides(fill=guide_legend(ncol=2)) +
scale_fill_manual(values = getPalette(cyt_colourCount)) +
theme_bw()
ggsave(paste(out_dir, "coverage_byaa_norm.pdf", sep = ''), cov_byaa_norm, height = ceiling(facetCount/4) * 4, width = 14)

if (!is.na(mito_trnas)){
mito_cov_byaa = subset(cov_byaa, grepl("mito", cov_byaa$aa) | grepl("nmt", cov_byaa$aa))
if (nrow(mito_cov_byaa) != 0 ) {
mitocov_byaa_norm = ggplot(mito_cov_byaa, aes(x = bin, y = cov_norm, fill = aa, group = aa)) +
geom_bar(stat = "identity", alpha = 0.8) +
facet_wrap(~bam, ncol = 4) +
xlab("Gene (%)") + ylab("Normalised coverage (coverage/uniquely mapped reads)") +
labs(fill = "Isotype") +
guides(fill=guide_legend(ncol=2)) +
scale_fill_manual(values = getPalette(mit_colourCount)) +
theme_bw()
ggsave(paste(out_dir, "mitocoverage_byaa_norm.pdf", sep = ''), mitocov_byaa_norm, height = ceiling(facetCount/4) * 4, width = 14)
}

}

cyto_cov_byaa = subset(cov_byaa, !grepl("mito", cov_byaa$aa) & !grepl("nmt", cov_byaa$aa))
cyto_cov_byaa_sum = aggregate(x = cyto_cov_byaa$cov_norm, by = list(bin = cyto_cov_byaa$bin, bam = cyto_cov_byaa$bam), FUN = sum)
cyto_scale_factors = cyto_cov_byaa_sum[which(cyto_cov_byaa_sum$bin == 96),] # 96 is second last bin of 4%
cyto_cov_byaa$cov_norm_scaled = NA
for (i in 1:nrow(cyto_cov_byaa)){
cyto_cov_byaa[i,'cov_norm_scaled'] = cyto_cov_byaa[i,'cov_norm']/cyto_scale_factors[which(cyto_cov_byaa[i,'bam'] == cyto_scale_factors$bam),3]
}

if (!is.na(mito_trnas) && nrow(mito_cov_byaa) != 0){
mito_cov_byaa_sum = aggregate(x = mito_cov_byaa$cov_norm, by = list(bin = mito_cov_byaa$bin, bam = mito_cov_byaa$bam), FUN = sum)
mito_scale_factors = mito_cov_byaa_sum[which(mito_cov_byaa_sum$bin == 96),] # 96 is second last bin of 4%
mito_cov_byaa$cov_norm_scaled = NA
for (i in 1:nrow(mito_cov_byaa)){
mito_cov_byaa[i,'cov_norm_scaled'] = mito_cov_byaa[i,'cov_norm']/mito_scale_factors[which(mito_cov_byaa[i,'bam'] == mito_scale_factors$bam),3]
}
cov_byaa_scaled = rbind(cyto_cov_byaa, mito_cov_byaa)
}
else {
cov_byaa_scaled = cyto_cov_byaa
}

cov_byaa_norm_scaled = ggplot(subset(cov_byaa_scaled, !grepl("mito", cov_byaa_scaled$aa) & !grepl("nmt", cov_byaa_scaled$aa)), aes(x = bin, y = cov_norm_scaled, fill = aa, group = aa)) +
geom_bar(stat = "identity", alpha = 0.8) + facet_wrap(~bam, ncol = 4) +
xlab("Gene (%)") +
ylab("Scaled normalised coverage") +
labs(fill = "Isotype") +
guides(fill=guide_legend(ncol=2)) +
scale_y_continuous(breaks = seq(0,1,0.25)) +
scale_fill_manual(values = getPalette(cyt_colourCount)) +
theme_bw()
ggsave(paste(out_dir, "coverage_byaa_norm_scaled.pdf", sep = ''), cov_byaa_norm_scaled, height = ceiling(facetCount/4) * 4, width = 14)

if (!is.na(mito_trnas)){
mitocov_byaa_scaled = subset(cov_byaa_scaled, grepl("mito", cov_byaa_scaled$aa) | grepl("nmt", cov_byaa_scaled$aa))
if (nrow(mitocov_byaa_scaled) != 0) {
mitocov_byaa_norm_scaled = ggplot(mitocov_byaa_scaled, aes(x = bin, y = cov_norm_scaled, fill = aa, group = aa)) +
geom_bar(stat = "identity", alpha = 0.8) + facet_wrap(~bam, ncol = 4) +
xlab("Gene (%)") +
ylab("Scaled normalised coverage") +
labs(fill = "Isotype") +
guides(fill=guide_legend(ncol=2)) +
scale_y_continuous(breaks = seq(0,1,0.25)) +
scale_fill_manual(values = getPalette(mit_colourCount)) +
theme_bw()
ggsave(paste(out_dir, "mitocoverage_byaa_norm_scaled.pdf", sep = ''), mitocov_byaa_norm_scaled, height = ceiling(facetCount/4) * 4, width = 14)
}
}

cov_byaa$aa_groups = aa_groups[match(cov_byaa$aa, aa_groups$aa),2]

cov_byaa_agg = aggregate(x = cov_byaa$cov_norm, by = list(bam = cov_byaa$bam, bin = cov_byaa$bin, aa_groups = cov_byaa$aa_groups), FUN = mean)
cov_byaa_line = ggplot(cov_byaa_agg, aes(x = bin, y = x, color = aa_groups, group = aa_groups)) +
geom_line() +
facet_wrap(~bam, ncol = 4) +
xlab("Gene (%)") +
ylab("Normalised coverage (coverage/uniquely mapped reads)") +
labs(color = "Amino acid group") +
theme_bw()
ggsave(paste(out_dir, "coverage_byaa_line.pdf", sep = ''), cov_byaa_line, height = ceiling(facetCount/4) * 4, width = 14)

## Coverage plots per cluster multipage
cov_bygene = read.table(args[1], header = TRUE, sep="\t")
cov_bygene$Cluster = sub(".*_mito_tRNA-", "mito", cov_bygene$Cluster)
cov_bygene$Cluster = sub(".*_nmt_tRNA-", "nmt", cov_bygene$Cluster)
cov_bygene$Cluster = sub(".*_tRNA-", "", cov_bygene$Cluster)
cov_bygene$Cluster = sub(".*_tRX-", "tRX-", cov_bygene$Cluster)
cov_bygene$bam = gsub(".unpaired_uniq.bam","",cov_bygene$bam)
cov_bygene$bam = gsub("(.*/).*?","\\2",cov_bygene$bam)
plot_func = ggplot(cov_bygene, aes(x = bin, y = cov_norm)) + geom_bar(stat = 'identity', fill = "#6BA7BB") +
facet_grid(Cluster~bam, scales='free') + xlab("Gene (%)") + ylab("Normalised coverage") +
theme(axis.title.x=element_blank(),
axis.text.y=element_text(colour="black",size=8),
axis.text.x=element_blank(), strip.text.y = element_text(angle=0, size = 6),
strip.text.x = element_text(angle=0, size = 4)) +
theme_bw()

plots = dlply(cov_bygene , "Cluster", `%+%`, e1 = plot_func)
multi = marrangeGrob(plots, nrow = 3, ncol = 1)
ggsave(paste(out_dir, "coverage_byUniquetRNA_norm.pdf", sep = ''), multi, height = 12, width = 14)


}
48 changes: 48 additions & 0 deletions build/lib/mimseq/data/additionalMods.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
Homo sapiens Ala-AGC I34
Homo sapiens Arg-ACG I34
Homo sapiens Ile-AAT I34
Homo sapiens Leu-AAG I34
Homo sapiens Pro-AGG I34
Homo sapiens Ser-AGA I34
Homo sapiens Thr-AGT I34
Homo sapiens Val-AAC I34
Homo sapiens Asp-GTC m1A9
Mus musculus Ala-AGC I34
Mus musculus Pro-AGG I34
Mus musculus Thr-AGT I34
Mus musculus Ser-AGA I34
Mus musculus Leu-AAG I34
Saccharomyces cerevisiae Gln-TTG mcm5s2U34
Schizosaccharomyces pombe Gln-TTG mcm5s2U34
Schizosaccharomyces pombe Pro-AGG I34
Schizosaccharomyces pombe Thr-AGT I34
Schizosaccharomyces pombe Arg-ACG I34
Schizosaccharomyces pombe Leu-AAG I34
Schizosaccharomyces pombe Ser-AGA I34
Schizosaccharomyces pombe Ala-AGC I34
Schizosaccharomyces pombe Ile-AAT I34
Schizosaccharomyces pombe Val-AAC I34
Danio rerio Ala-AGC I34
Danio rerio Arg-ACG I34
Danio rerio Ile-AAT I34
Danio rerio Leu-AAG I34
Danio rerio Pro-AGG I34
Danio rerio Ser-AGA I34
Danio rerio Thr-AGT I34
Danio rerio Val-AAC I34
Gorilla gorilla Ala-AGC I34
Gorilla gorilla Arg-ACG I34
Gorilla gorilla Ile-AAT I34
Gorilla gorilla Leu-AAG I34
Gorilla gorilla Pro-AGG I34
Gorilla gorilla Ser-AGA I34
Gorilla gorilla Thr-AGT I34
Gorilla gorilla Val-AAC I34
Rattus norvegicus Ala-AGC I34
Rattus norvegicus Arg-ACG I34
Rattus norvegicus Ile-AAT I34
Rattus norvegicus Leu-AAG I34
Rattus norvegicus Pro-AGG I34
Rattus norvegicus Ser-AGA I34
Rattus norvegicus Thr-AGT I34
Rattus norvegicus Val-AAC I34
Loading

0 comments on commit 16a5f1b

Please sign in to comment.