forked from dxs435/COMPACT_enrichment
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCOMPACT_enrichment_v3.R
116 lines (100 loc) · 4.16 KB
/
COMPACT_enrichment_v3.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#############################################
# Marker enrichment analysis through combinatorial compact
# Author: DGS
#
# This all assumes that the data objects from "pre_built.RData" are in the environment
#
# VERSION 3: major changes to the algorithm--importantly the master array is dropped
# 1. COMPACT matrices will be calculated on the fly
# 2. COMPACT time codes will be sorted differently (using Raj's method)
# 3. Enrichment is scored as a p-value calculated from a hypergeometric distribution
#
# General procedure
# 1. Read in the gene list
# 2. Build COMPACT gene lists
# 3. Get the gene list / COMPACT intersections
# 4. calculate p-values
#
#############################################
library(reshape2)
library(ggplot2)
library(gplots)
#read in the gene/marker list to tested
gene_list <- read.table(file = "HSC_test_list.txt", colClasses = "character")
gene_list <- gene_list$V1
#make empty results matrix
enrichMatrix <- data.frame(matrix(0, nrow = length(Tlist), ncol = length(Tlist)), row.names = names(Tlist))
colnames(enrichMatrix) <- names(Tlist)
pairs_for_CB <- combn(length(discreteList),2)
for(p in 1:ncol(pairs_for_CB)){
#browser()
ind <- pairs_for_CB[,p]
print(paste("working on pairings: ", ind[1], ind[2]))
#make 3D array of COMPACT and individual gene lists
patternLists <- getPatternGenes(ind[1], ind[2])
#make an empty 3D array to store temporary COMPACT and gene lists
#...getting the dimensions
maxA <- max(sapply(patternLists[["a"]],length, USE.NAMES = F))
maxB <- max(sapply(patternLists[["b"]],length, USE.NAMES = F))
maxGenes <- max(c(maxA,maxB))
COMPACTdims <- c(length(patternLists[["patterns"]]), maxGenes)
#...making the empty array
COMPACT <- array(NA, dim = c(COMPACTdims[1], COMPACTdims[1], COMPACTdims[2]))
for(i in 1:nrow(COMPACT)){
for(j in 1:ncol(COMPACT)){
genes <- intersect(unique(patternLists[["a"]][[i]]), unique(patternLists[["b"]][[j]]))
if(length(genes) > 0){
COMPACT[i,j,1:length(genes)] <- genes
}
}
}
#make an empty matrix to store the Marker occurance in gene lists
occurance.matrix <- matrix(0, nrow = COMPACTdims[1], ncol = COMPACTdims[1])
for(i in 1:nrow(COMPACT)){
for(j in 1:ncol(COMPACT)){
occurance.matrix[i,j] <- sum(sapply(gene_list, function(x) sum(grepl(x, COMPACT[i,j,]))))
}
}
#flatten the COMPACT array
COMPACT.flat <- flattenCOMPACT()
#calculate enrichment p-value
pvalues <- hypergeometric.pvalue()
enrichMatrix[ind[1], ind[2]] <- pvalues[["m"]]
enrichMatrix[ind[2], ind[1]] <- pvalues[["n"]]
}
#make diagonal NAs, I'm concerned that the 0's will influence FDR adjustments
ematrix <- as.matrix(enrichMatrix)
ematrix[ematrix == 0] <- NA
#get rid of negative p-values...also, why are there negative p-values??
if(any(enrichMatrix < 0)){
ematrix[which(ematrix < 0)] <- NA
}
#Correcting for multiple comparisons
#FDR by ematrix row
enrich.adjusted.pvalue <- vector(mode="numeric")
# for(trmt in 1:nrow(ematrix)){
# enrich.adjusted.pvalue <- c(enrich.adjusted.pvalue,unname(p.adjust(ematrix[trmt,], method="BH")))
# }
#FDR by whole matrix
enrich.adjusted.pvalue <- p.adjust(ematrix, method="BH")
enrich.adjusted.pvalue[is.na(enrich.adjusted.pvalue)] <- 1
#The p-values seemed similar between by row and by whole matrix, so I'll continue with whole matrix bc it's less lines
#putting the p-values in a matrix
#this method of putting a vector into a matrix only works for square matrices, which I think we will always have
#For FDR by row, the matrix needs to be transformed
enrich.matrix.adjusted <- data.frame(matrix(enrich.adjusted.pvalue, nrow = length(Tlist), ncol = length(Tlist)), row.names = names(Tlist))
colnames(enrich.matrix.adjusted) <- names(Tlist)
#visualizing...
#because there is such a huge range of values and we only care about the low values, the values above 5e-3 are forced to 5e-3
ematrix <- as.matrix(enrich.matrix.adjusted)
ematrix[ematrix > 5e-3] <- 5e-3
heatmap.2(-log(t(ematrix)),
Colv = F,
dendrogram = "row",
#scale = "none",
trace = "none",
key = T,
cexCol = 1,
cexRow = 1,
margins = c(5,15))
save(ematrix, file="HSC_enrichment_v3.RData")