-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathH3K27ac_IOSE_consensus_upsetr_analysis.Rmd
127 lines (93 loc) · 3.77 KB
/
H3K27ac_IOSE_consensus_upsetr_analysis.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
---
title: "H3K27ac_IOSE_consensus_upsetr_analysis"
author: "Alberto Luiz Pascasio Reyes"
date: "11/2/2018"
output: html_document
---
load libraries and input files
```{r}
library(GenomicRanges)
library(GenomicFeatures)
library(HelloRanges)
library(IRanges)
library(S4Vectors)
library(readr)
library(dplyr)
library(tidyr)
library(tidyverse)
library(ggplot2)
library(UpSetR)
library(rtracklayer)
library(optparse)
library(pheatmap)
iose_k27ac_hg19 <- read_table2("hg19_H3K27ac_biofeature_beds/IOSE/allpeaks.anno.tsv")
iose_k27ac_hg19 <- iose_k27ac_hg19[,-6]
```
make consensus set
```{r}
consensus_iose_k27ac_hg19 <- iose_k27ac_hg19[which(rowSums(iose_k27ac_hg19[,4:5]) >= 1.5),]
gr_consensus_iose_k27ac_hg19 <- makeGRangesFromDataFrame(consensus_iose_k27ac_hg19, keep.extra.columns = FALSE)
gr_consensus_iose_k27ac_hg19
gr_consensus_iose_k27ac_hg19 <- GenomicRanges::reduce(gr_consensus_iose_k27ac_hg19, ignore.strand = FALSE) ### this "undercounts" # of shared peaks
gr_consensus_iose_k27ac_hg19
```
make unique sets
```{r}
union_unique_iose_k27ac_hg19 <- iose_k27ac_hg19[which(rowSums(iose_k27ac_hg19[,4:5]) < 1.5),]
colnames(union_unique_iose_k27ac_hg19)
unique_iose4_iose_k27ac_hg19 <- filter(union_unique_iose_k27ac_hg19, B02_iOSE4 == 1)
unique_iose11_iose_k27ac_hg19 <- filter(union_unique_iose_k27ac_hg19, B03_iOSE11 == 1)
gr_unique_iose4_iose_k27ac_hg19 <- makeGRangesFromDataFrame(unique_iose4_iose_k27ac_hg19, keep.extra.columns = FALSE)
gr_unique_iose11_iose_k27ac_hg19 <- makeGRangesFromDataFrame(unique_iose11_iose_k27ac_hg19, keep.extra.columns = FALSE)
```
write peak subsets to files
```{r}
write.table(as.data.frame(gr_consensus_iose_k27ac_hg19),
file = "./peak_subsets/hg19_H3K27ac/iOSE/consensus_iose_k27ac_hg19.bed",
row.names = FALSE,
col.names = FALSE,
quote = FALSE,
sep = "\t")
write.table(as.data.frame(gr_unique_iose4_iose_k27ac_hg19),
file = "./peak_subsets/hg19_H3K27ac/iOSE/unique_iose4_iose_k27ac_hg19.bed",
row.names = FALSE,
col.names = FALSE,
quote = FALSE,
sep = "\t")
write.table(as.data.frame(gr_unique_iose11_iose_k27ac_hg19),
file = "./peak_subsets/hg19_H3K27ac/iOSE/unique_iose11_iose_k27ac_hg19.bed",
row.names = FALSE,
col.names = FALSE,
quote = FALSE,
sep = "\t")
```
do (simon's newest method for) upsetr analysis (can run this part by itself independent of other chunks)
```{r}
iose_k27ac_hg19 <- read_table2("hg19_H3K27ac_biofeature_beds/IOSE/allpeaks.anno.tsv")
iose_k27ac_hg19 <- iose_k27ac_hg19[,-6]
x <- iose_k27ac_hg19 ###change this obj name to whatever file being analyzed
View(x)
y <- x; y[, c(4:ncol(x))] <- x[, c(4:ncol(x))] > 0.5
View(y)
y <- GRanges(seqnames = y$chr,
ranges = IRanges(start = y$start, end = y$end - 1),
data = y[, 4:ncol(y)])
y
yr <- GenomicRanges::reduce(y, with.revmap = TRUE)
sample_overlap <- t(bind_cols(lapply(
relist(mcols(y)[unlist(yr$revmap),], yr$revmap),
function(x) {
tibble::as_tibble(colSums(as.matrix(x)) >= 1) ### set threshold of overlap (percentage) here.....why is this mat bigger than x???...how to interp this chunk
})))
mcols(yr) <- sample_overlap
colnames(sample_overlap) <- gsub("data.", "", colnames(mcols(y)))
yr$sample_count <- rowSums(as.matrix(mcols(yr)))
# file_ranges <- yr
# save(file_ranges, sample_overlap, file = paste0(opt$out, "_rdata.rda"))
#
# pdf(paste0(opt$out, "_upset.pdf")) ### not yet working
upset(as.data.frame(sample_overlap + 0), ### simon's method
order.by = "freq",
text.scale = c(2, 2, 2, 1.5, 2.25, 2.25),
point.size = 5) ### use this method bc of reduce step (ie bedtools merge) --> doesn't double count peaks, instead uses our defn of inclusive boundaries
```