-
Notifications
You must be signed in to change notification settings - Fork 0
/
merge_metadata.R
executable file
·66 lines (59 loc) · 2.36 KB
/
merge_metadata.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/env Rscript
library(xlsx)
# Read one worksheet of an Excel workbook into a data frame.
#
# Args:
#   filename:  path to the workbook to read.
#   sheetName: name of the worksheet to read. Defaults to the sheet this
#              script was written for, "JHI-FINAL_WBDC_LIST_JAN-2017".
# Returns:
#   Whatever read.xlsx2() returns for that sheet, with the first row of the
#   sheet used as the column header.
# Raises:
#   An error if `filename` is not a single path or the file does not exist.
readXlsx <- function(filename, sheetName = "JHI-FINAL_WBDC_LIST_JAN-2017") {
  if (!is.character(filename) || length(filename) != 1L || is.na(filename)) {
    stop("filename must be a single file path", call. = FALSE)
  }
  if (!file.exists(filename)) {
    stop("Input workbook does not exist: ", filename, call. = FALSE)
  }
  # `fill` and `na.strings` are read.table() arguments and were dropped here:
  # read.xlsx2() documents no such parameters and hands extra arguments on to
  # data.frame(). NOTE(review): confirm against the installed xlsx version.
  read.xlsx2(
    file = filename,
    sheetName = sheetName,
    header = TRUE # First row is a header
  )
}
# Read a comma-separated file into a data frame.
#
# Args:
#   filename: path to the CSV file; its first line supplies the column names.
# Returns:
#   The data frame produced by read.csv(). Rows with fewer fields than the
#   header are padded (fill = TRUE) and the literal string "NA" is read as a
#   missing value.
readCsv <- function(filename) {
  read.csv(filename, header = TRUE, fill = TRUE, na.strings = "NA")
}
# Inner-join two accession tables on their accession identifier.
#
# Args:
#   morrell_lab: data frame with an "Accession_ID" column.
#   jhi:         data frame with a "WBDC_accession" column.
# Returns:
#   A data frame with one row per pairing where morrell_lab$Accession_ID
#   equals jhi$WBDC_accession. Rows without a partner in the other table are
#   dropped from both sides.
mergeFile <- function(morrell_lab, jhi) {
  merge(
    morrell_lab,
    jhi,
    by.x = "Accession_ID",   # key column in morrell_lab
    by.y = "WBDC_accession", # key column in jhi
    all.x = FALSE,           # no unmatched morrell_lab rows
    all.y = FALSE            # no unmatched jhi rows
  )
}
# Write a data frame to an Excel workbook.
#
# Args:
#   mergedData:  data frame to write.
#   outFilename: path of the workbook to create.
# Returns:
#   Whatever write.xlsx2() returns; the function is called for its side
#   effect of creating `outFilename`.
writeOutFile <- function(mergedData, outFilename) {
  # `quote = FALSE` was dropped: it is a write.table() argument, and
  # write.xlsx2() has no such parameter. NOTE(review): extra arguments are
  # forwarded to addDataFrame(), which would reject `quote` - confirm against
  # the installed xlsx version.
  write.xlsx2(
    x = mergedData,
    file = outFilename,
    col.names = TRUE,  # write the header row
    row.names = FALSE, # do not write row names
    showNA = TRUE
  )
}
# Driver function.
#
# Expects three command-line arguments, in this order:
#   1. Excel workbook, read with readXlsx()
#   2. CSV file, read with readCsv()
#   3. path of the output workbook, written with writeOutFile()
#
# NOTE(review): the workbook is treated as the JHI table (joined on
# "WBDC_accession") and the CSV as the Morrell Lab table (joined on
# "Accession_ID"). This is inferred from the sheet name in readXlsx() and the
# join keys in mergeFile() - confirm the argument order with the script owner.
main <- function() {
  # Take command line arguments; only the ones after the script name
  args <- commandArgs(trailingOnly = TRUE)
  if (length(args) < 3L) {
    stop(
      "Usage: merge_metadata.R <jhi.xlsx> <morrell_lab.csv> <out.xlsx>",
      call. = FALSE
    )
  }
  jhiFile <- args[1]        # Excel workbook is the first argument
  morrellLabFile <- args[2] # CSV file is the second argument
  outName <- args[3]        # name given by user is the third argument

  jhi <- readXlsx(filename = jhiFile)
  morrellLab <- readCsv(filename = morrellLabFile)
  # Keep only the accessions present in both tables
  merged <- mergeFile(morrell_lab = morrellLab, jhi = jhi)
  writeOutFile(mergedData = merged, outFilename = outName)
}
# Entry point: call main() as soon as this file is evaluated (e.g. by Rscript).
main() # Run the program