-
Notifications
You must be signed in to change notification settings - Fork 1
/
tsne.cnv.r
99 lines (69 loc) · 2.36 KB
/
tsne.cnv.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
library(data.table)
setwd("~/SCRATCH/tsne/cnv")
source("../tsne.util.r")
cnv.file = "TCGA.cnv.window_1M.txt"
cnv = read.table(cnv.file, h=T, stringsAsFactors=F, row.names=1, check.names=F)
# aliquot selection
aliquot = colnames(cnv)
aliquot2 = SelectAliquot(aliquot, my.type = c("01", "03"))
patient = substr(aliquot2, 1, 12)
m = match(patient, substr(aliquot, 1, 12))
cnv = cnv[, m]
colnames(cnv) = patient
# replace sample names to barcode
meta = fread("../meta/snp6.sample.txt", h=T)
meta = meta[, c("patient_barcode", "disease"), with=F]
meta$disease = toupper(meta$disease)
colnames(meta) = c("patient", "project")
m = match(colnames(cnv2), meta$patient)
meta = meta[m, ]
# QC
# rnaseq data do not have NA. Because this is accross multiple project, zero.ratio set to 0.99
cnv2 = CompletenessFilter(cnv, feature.missing = 0.1, sample.mising = 1, dup.removal = T)
if(sum(is.na(cnv2)) > 0) {
cnv2 = ImputeNA(cnv2)
}
save(cnv2, file="cnv2.1M.rda")
# pca
load("cnv2.1M.rda")
ptm <- proc.time()
pca2 = pca(cnv2)
proc.time() - ptm
save(pca2, file="TCGA.cnv.1m.pca.rda")
# t-SNE
library(Rtsne)
ptm <- proc.time()
tsne2 = tsne(pca2$x, permute=1, use=50)
proc.time() - ptm
# save(tsne2, file="TCGA.rna.allgene.tsne.rda")
pdata2 = tsne2
pdata2$project = meta$project
write.table(pdata2, "TCGA.cnv.10M.per10.pdata.txt", col.names=T, row.names=F, sep="\t", quote=F)
png("TCGA.cnv.10M.per10.png", width=1200, height=800)
p = ggplot(pdata2, aes(x=feature_1, y=feature_2, col=project, shape = project)) +
geom_point(size = 2) +
scale_shape_manual(values=c(rep(c(0, 2, 8, 16, 17), 6), 0, 2, 8) )
p
dev.off()
#####
low expr
m = apply(tpm2, 1, mean)
w = which(m > 1e-6)
tpm3 = tpm2[w, ]
pca3 = pca(tpm3)
save(pca3, file="TCGA.rna.filter1.lowexpr.pca.rda")
# t-SNE
library(Rtsne)
tsne3 = tsne(pca3$x, permute=100, use=50)
save(tsne3, file="TCGA.rna.lowexpr.tsne.rda")
meta$aliquot = meta$barcode
pdata3 = annotate.tsne(tsne3, meta)
pdata3 = pdata3[, c("aliquot", "feature_1", "feature_2", "disease", "center")]
colnames(pdata3)[4] = "project"
write.table(pdata3, "TCGA.rna.lowexpr.pdata.txt", col.names=T, row.names=F, sep="\t", quote=F)
png("TCGA.rna.lowexpr.pdata.perm100.png", width=1200, height=800)
p = ggplot(pdata3, aes(x=feature_1, y=feature_2, col=project, shape = project)) +
geom_point(size = 2) +
scale_shape_manual(values=c(rep(c(0, 2, 8, 16, 17), 6), 0, 2, 8) )
p
dev.off()