-
Notifications
You must be signed in to change notification settings - Fork 7
/
Project1fs.R
75 lines (51 loc) · 2.95 KB
/
Project1fs.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# important -- this makes sure our runs are consistent and reproducible
set.seed(0)
file <- "TCGA_breast_cancer_ERstatus_allGenes.txt"
nfold <- 5
sd_threashold <- 4
top_num <- 10
header <- scan(file, nlines = 1, sep="\t", what = character())
data <- read.table(file, skip = 2, header = FALSE, sep = "\t", quote = "", check.names=FALSE)
names(data) <- header
header2 <- scan(file, skip = 1, nlines = 1, sep="\t", what = character())
# cleanup - remove genes with sd < 1
# compute sd for each gene
data_sd<-sapply(seq(nrow(data)), function(x) { as.numeric(sd(data[x,-1])) })
# add gene names to the sd list
data_sd_names<-cbind(data.frame(data_sd),data[,1])
# create an "include" list of all those genes where sd > threashold
include_list <- data_sd_names[data_sd_names[,1]>sd_threashold,2]
Positive <- data[data$id %in% include_list,header2=='Positive']
Negative <- data[data$id %in% include_list,header2=='Negative']
# define function cross_valid so we can rerun the cross validataion with various parameters
cross_validation <- function (nfold, alg="centroid") {
Positive_groups <- split(sample(colnames(Positive)), 1+(seq_along(colnames(Positive)) %% nfold))
Negative_groups <- split(sample(colnames(Negative)), 1+(seq_along(colnames(Negative)) %% nfold))
result <- array()
for (test_group in 1:nfold) {
testA <- Positive[,colnames(Positive) %in% unlist(Positive_groups[test_group])]
testB <- Negative[,colnames(Negative) %in% unlist(Negative_groups[test_group])]
trainingA <- Positive[,!(colnames(Positive) %in% unlist(Positive_groups[test_group]))]
trainingB <- Negative[,!(colnames(Negative) %in% unlist(Negative_groups[test_group]))]
# Feature selection --
# compute t-statistic for each row
training_t_stat<-data.frame(sapply(seq(nrow(trainingA)), function(x) { abs(as.numeric(t.test(trainingA[x,], trainingB[x,])$statistic)) }))
# add gene id column
training_t_stat_geneid<-cbind(training_t_stat,rownames(trainingA))
colnames(training_t_stat_geneid) <- c('t','id')
# pick top 50 based on t-statistic
selected_genes <- head(training_t_stat_geneid[order(-training_t_stat_geneid$t),],n=top_num)[,2]
# narrow down the list of genes based on t-statistic
testA <- testA[rownames(testA) %in% selected_genes,]
testB <- testB[rownames(testB) %in% selected_genes,]
trainingA <- trainingA[rownames(trainingA) %in% selected_genes,]
trainingB <- trainingB[rownames(trainingB) %in% selected_genes,]
centroidA <- rowMeans(trainingA)
centroidB <- rowMeans(trainingB)
misclassifiedA <- sum(sapply(testA, function(x) { sqrt(sum((x-centroidA)^2))-sqrt(sum((x-centroidB)^2))>0 }))
misclassifiedB <- sum(sapply(testB, function(x) { sqrt(sum((x-centroidA)^2))-sqrt(sum((x-centroidB)^2))<0 }))
result[test_group] <- (misclassifiedA+misclassifiedB)/(ncol(testA)+ncol(testB))
}
paste0(round(mean(result),4)," sd=(",round(sd(result),4),")")
}
kable(cross_validation(5))