-
Notifications
You must be signed in to change notification settings - Fork 1
/
private_SNPs_in_VCF.sh
54 lines (42 loc) · 1.51 KB
/
private_SNPs_in_VCF.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/bin/env bash
set -euo pipefail
# Need to test for memory limitations in the new vcftools 0.1.16
module load vcftools_ML/0.1.14
# A script to identify private SNPs within a sample
# Peter L. Morrell, 05 May 2017, St. Paul, MN
# Dependencies: VCFTools
# Requires three input files:
# 1) a VCF including all SNPs and all samples
# 2) a list of SNPs to be included [optional], for example, noncoding variants
# 3) the name of the sample to be tested as private
if [ $# -ne 3 ]; then
echo $0: usage: "private_SNPs_in_VCF.sh [full VCF] [SNP list] [sample] [chromosome]"
exit 1
fi
full_vcf=$1
SNP_list=$2
sample_name=$3
chromosome=$4
sample=$(basename $sample_name)
# /panfs/roc/scratch/tkono/GATK_Capture_WithID.vcf.gz
vcftools --gzvcf $full_vcf \
--snps $SNP_list \
--recode \
--mac 1 \
--chr $chromosome \
--remove $sample_name \
--out ${sample}_removed
vcftools --gzvcf $full_vcf \
--snps $SNP_list \
--recode \
--mac 1 \
--chr $chromosome \
--keep $sample_name \
--out ${sample}_only
vcftools --vcf ${sample}_removed.recode.vcf \
--diff ${sample}_only.recode.vcf --diff-site --out ${sample}
# The output is only the prefix. The rest of the name will include ".diff.sites_in_files"
# The file contains a report on every site retained in the analysis.
# The 4th column of this file reports the number of sites
# that are unique to the 2nd VCF file. Here that will
awk '$4=="2" {++count} END {print count}' ${sample}.diff.sites_in_files