-
Notifications
You must be signed in to change notification settings - Fork 0
/
bioinformatics_section.R
73 lines (54 loc) · 2.85 KB
/
bioinformatics_section.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
################################################################################
# Alex Bartlett
# March 13, 2021
#
# use a rolling average of log2ratios to calculate copy number of various genomic
# locations
################################################################################
library(tidyverse)
library(zoo)
################################################################################
# determine which value to use for width of the window or the rolling average
################################################################################
# load data
log2ratios <- read.table('log2Ratio_values.txt', header = T)
log2ratios$position <- as.numeric(log2ratios$position)
# for calculating approximate length in bases of window widths
avg_point2point_dist <- mean(diff(log2ratios$position))
# check a variety of window sizes
poss_windows <- c(5, 10, 15, 20, 30, 40, 50)
for (window in poss_windows){
# to dampen noise, calculate rolling average of log2ratios
LR_smooth <- rollmean(log2ratios$LR, window, fill = c('extend', 'extend', 'extend'))
# calculate and visualize copy number (rounder to nearest integer)
log2ratios$CN <- round(2 * (2 ^ LR_smooth))
CN_vs_pos <- ggplot(log2ratios, aes(x = position/1e6, y = CN)) + geom_point(shape = 'o')
ggsave(paste0('CN_vs_pos_window_', round(window * avg_point2point_dist/1e6, 2), 'MB.png'))
}
################################################################################
# return copy number for point or segment in genome
################################################################################
# calculate rolling average of log2ratios using chosen window width
LR_smooth <- rollmean(log2ratios$LR, 30, fill = c('extend', 'extend', 'extend'))
# calculate and visualize copy number (rounder to nearest integer)
log2ratios$CN <- round(2 * (2 ^ LR_smooth))
write.table(log2ratios, 'copyNumber_values.txt', sep = '\t', row.names = F, quote = F)
# return copy number for segment with start and end coordinates
CN_segment <- function(start, end){
data_start <- max(log2ratios$position[log2ratios$position <= start])
data_end <- min(log2ratios$position[log2ratios$position >= end])
CN <- round(mean(log2ratios$CN[log2ratios$position >= data_start & log2ratios$position <= data_end]))
return(CN)
}
# return copy number for point
CN_point <- function(point){
return(CN_segment(point, point))
}
# output copy numbers for requested genes
print(paste('PDPN:', CN_segment(13583831, 13617957)))
print(paste('MEAF6:', CN_segment(37492575, 37514763)))
print(paste('MIER1:', CN_segment(66924895, 66988619)))
print(paste('ADGRL2:', CN_segment(13583831, 13617957)))
print(paste('FCRL5:', CN_segment(157513377, 157552515)))
print(paste('PBX1:', CN_segment(164559635, 164851831)))
print(paste('USH2A:', CN_segment(215622891, 216423448)))