forked from StanfordBioinformatics/mvp_aaa_codelabs
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSample-QC.Rmd
111 lines (89 loc) · 3.62 KB
/
Sample-QC.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
<!-- R Markdown Documentation, DO NOT EDIT THE PLAIN MARKDOWN VERSION OF THIS FILE -->
<!-- Copyright 2015 Stanford University All rights reserved. -->
Performing QC on AAA genomes using Google Genomics
================================================
The following example makes use of 5 genomes from the MVP.
Setting Up and Describing the Data
----------------------------------
```{r echo=FALSE, eval=FALSE}
# This codelab assumes that the current working directory is where the Rmd file resides
setwd("/Users/gmcinnes/GitHub/VA_AAA_BigQuery")
```
```{r comment=NA, message=FALSE, warning=FALSE}
# Setup for BigQuery access
require(bigrquery)
require(xtable)
require(RCurl)
require(dplyr)
project <- "gbsc-gcp-project-mvp" # put your projectID here
DisplayAndDispatchQuery <- function(queryUri, replacements=list()) {
if(grepl("^https.*", queryUri)) {
querySql <- getURL(queryUri, ssl.verifypeer=FALSE)
} else {
querySql <- readChar(queryUri, nchars=1e6)
}
for(replacement in names(replacements)) {
querySql <- gsub(replacement, replacements[[replacement]], querySql, fixed=TRUE)
}
cat(querySql)
query_exec(querySql, project)
}
table_replacement <- list("_THE_TABLE_"="gbsc-gcp-project-mvp:va_aaa_pilot_data.sample_gvcfs",
"_THE_EXPANDED_TABLE_"="gbsc-gcp-project-mvp:va_aaa_pilot_data.sample_vcfs")
```
Check Singletons
-----------------------------------
```{r message=FALSE, warning=FALSE, comment=NA}
limits = "WHERE
reference_name = 'chr22'
AND call.QUAL >= 30"
result <- DisplayAndDispatchQuery("./sql/private-variants-brca1.sql",
replacements=c(table_replacement, "_LIMITS_" = limits))
```
Number of rows returned by this query: `r nrow(result)`.
Displaying the first few rows of the dataframe of results:
```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA, results="asis"}
print(xtable(head(result)), type="html", include.rownames=F)
```
Check Individual Heterozygosity
-----------------------------------
```{r message=FALSE, warning=FALSE, comment=NA}
limits = "WHERE
reference_name = 'chr22'
AND call.QUAL >= 30"
result <- DisplayAndDispatchQuery("./sql/homozygous-variants.sql",
replacements=c(table_replacement, "_LIMITS_"=limits))
```
Number of rows returned by this query: `r nrow(result)`.
```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA, results="asis"}
print(xtable(result), type="html", include.rownames=F)
```
Cohort Level QC
===============
Check Hardy-Weinberg Equilibrium
-----------------------------------
```{r message=FALSE, warning=FALSE, comment=NA}
limits = "WHERE
reference_name = 'chr22'
AND call.QUAL >= 30"
result <- DisplayAndDispatchQuery("./sql/hardy-weinberg-brca1-expanded.sql",
replacements=c(table_replacement, "_LIMITS_"=limits))
```
Number of rows returned by this query: `r nrow(result)`.
Displaying the first few results:
```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA, results="asis"}
print(xtable(head(result)), type="html", include.rownames=F)
```
===============
Check Transition-Transversion Ratio
-----------------------------------
```{r message=FALSE, warning=FALSE, comment=NA}
limits = "AND reference_name = 'chr22'
AND call.QUAL >= 30"
result <- DisplayAndDispatchQuery("./sql/ti-tv-ratio.sql",
replacements=c(table_replacement,"_LIMITS_"=limits))
```
The result:
```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA, results="asis"}
print(xtable(result), type="html", include.rownames=F)
```