forked from bigdatagenomics/adam
-
Notifications
You must be signed in to change notification settings - Fork 1
/
adam.avdl
347 lines (305 loc) · 13.5 KB
/
adam.avdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
@namespace("edu.berkeley.cs.amplab.adam.avro")
protocol ADAM {
// One read record. The doc comment below ties it to BAM / SAM input; the
// fields cover the read itself, its mate, its record group, and the
// reference it is placed on. Every field is a union with null: fields
// declared `{ null, T }` default to null, while the boolean flags declared
// `{ boolean, null }` default to false.
record ADAMRecord {
/**
* These two fields, along with the two
* reference{Length, Url} fields at the bottom
* of the schema, collectively form the contents
* of the Sequence Dictionary embedded in these
* records from the BAM / SAM itself.
*/
union { null, string } referenceName = null;
union { null, int } referenceId = null;
// 0-based reference position start
union { null, long } start = null;
// NOTE(review): presumably the mapping quality of this read - confirm
union { null, int } mapq = null;
union { null, string } readName = null;
union { null, string } sequence = null;
// Mate placement.
// NOTE(review): mateAlignmentStart is presumably 0-based like `start` - confirm
union { null, string } mateReference = null;
union { null, long } mateAlignmentStart = null;
// NOTE(review): cigar and qual are stored as plain strings; their exact
// encoding is not stated in this schema - confirm against the converter
union { null, string } cigar = null;
union { null, string } qual = null;
// Record group this read belongs to, by name and by numeric id
union { null, string } recordGroupName = null;
union { null, int } recordGroupId = null;
// Read flags (all default to false)
// boolean is listed first in each union because an Avro union default must
// match the first branch; null is still an allowed value.
union { boolean, null } readPaired = false;
union { boolean, null } properPair = false;
union { boolean, null } readMapped = false;
union { boolean, null } mateMapped = false;
union { boolean, null } readNegativeStrand = false;
union { boolean, null } mateNegativeStrand = false;
union { boolean, null } firstOfPair = false;
union { boolean, null } secondOfPair = false;
union { boolean, null } primaryAlignment = false;
union { boolean, null } failedVendorQualityChecks = false;
union { boolean, null } duplicateRead = false;
// Commonly used optional attributes
union { null, string } mismatchingPositions = null;
// Remaining optional attributes flattened into a string
union { null, string } attributes = null;
// record group identifier from sequencing run
union { null, string } recordGroupSequencingCenter = null;
union { null, string } recordGroupDescription = null;
// NOTE(review): epoch unit (seconds vs. milliseconds) is not stated - confirm
union { null, long } recordGroupRunDateEpoch = null;
union { null, string } recordGroupFlowOrder = null;
union { null, string } recordGroupKeySequence = null;
union { null, string } recordGroupLibrary = null;
union { null, int } recordGroupPredictedMedianInsertSize = null;
union { null, string } recordGroupPlatform = null;
union { null, string } recordGroupPlatformUnit = null;
union { null, string } recordGroupSample = null;
// NOTE(review): presumably the numeric id matching mateReference, in the
// same way referenceId matches referenceName - confirm
union { null, int } mateReferenceId = null;
// See the note on referenceName / referenceId at the top of this record:
// these fields complete the sequence dictionary entry.
union { null, long } referenceLength = null;
union { null, string } referenceUrl = null;
union { null, long } mateReferenceLength = null;
union { null, string } mateReferenceUrl = null;
}
// Single-letter nucleotide codes, including ambiguity codes. The letters
// appear to follow the IUPAC nucleotide notation - confirm before relying on
// it. Symbol order is part of the schema: Avro encodes an enum value as the
// position of its symbol, so existing symbols must not be reordered.
enum Base {
A,
C,
T,
G,
U,
N, // any
X, // any
K, // keto: G/T
M, // aMino: A/C
R, // puRine: A/G
Y, // pYrimidine: C/T
S, // Strong: C/G
W, // Weak: A/T
B, // not A
V, // not T
H, // not G
D // not C
}
// A named contig together with its base sequence. All scalar fields are
// optional (default null); `sequence` is never null and defaults to an
// empty array.
record ADAMNucleotideContig {
// contig identifiers: name and numeric id
union { null, string } contigName = null;
union { null, int } contigId = null;
union { null, string } description = null;
// bases of the contig, one Base symbol per element
array<Base> sequence = [];
// NOTE(review): presumably the length of `sequence`; it is stored
// separately, so nothing in the schema keeps the two in sync - confirm
union { null, long } sequenceLength = null;
union { null, string } url = null;
}
// Pileup entry: a reference base / read base pair at one position, along
// with quality values, counts, the originating read, and record group
// metadata. Every field is optional and defaults to null.
record ADAMPileup {
// reference identifiers
union { null, string } referenceName = null;
union { null, int } referenceId = null;
// NOTE(review): presumably 0-based like ADAMRecord.start - confirm
union { null, long } position = null;
// NOTE(review): rangeOffset / rangeLength are not explained in this file;
// presumably they locate this base inside a multi-base event - confirm
union { null, int } rangeOffset = null;
union { null, int } rangeLength = null;
union { null, Base } referenceBase = null;
union { null, Base } readBase = null;
// quality values for this base and for the mapping of its read
union { null, int } sangerQuality = null;
union { null, int } mapQuality = null;
// counts; NOTE(review): aggregation semantics are not stated here - confirm
union { null, int } numSoftClipped = null;
union { null, int } numReverseStrand = null;
union { null, int } countAtPosition = null;
// read that produced this pileup entry
union { null, string } readName = null;
union { null, long } readStart = null;
union { null, long } readEnd = null;
// record group identifier from sequencing run
union { null, string } recordGroupSequencingCenter = null;
union { null, string } recordGroupDescription = null;
// NOTE(review): epoch unit (seconds vs. milliseconds) is not stated - confirm
union { null, long } recordGroupRunDateEpoch = null;
union { null, string } recordGroupFlowOrder = null;
union { null, string } recordGroupKeySequence = null;
union { null, string } recordGroupLibrary = null;
union { null, int } recordGroupPredictedMedianInsertSize = null;
union { null, string } recordGroupPlatform = null;
union { null, string } recordGroupPlatformUnit = null;
union { null, string } recordGroupSample = null;
}
// Pairs one ADAMPileup with the ADAMRecord reads that back it. Unlike the
// other records in this protocol, neither field is a union with null and
// neither declares a default, so both must be supplied.
record ADAMNestedPileup {
// nested pileup data type - contains reference to list of overlapping reads
// note: cannot be used with databases (e.g. hive/shark)
ADAMPileup pileup;
// reads that overlap the pileup
array<ADAMRecord> readEvidence;
}
// Kinds of structural variant; used by the svType fields of ADAMVariant and
// ADAMGenotype. Avro encodes enum values by symbol position, so existing
// symbols must not be reordered.
enum StructuralVariantType {
Deletion,
Insertion,
Duplication,
Inversion,
CopyNumberVariation,
TandemDuplication,
MobileElementDeletion,
MobileElementInsertion
}
// Kinds of variant; used by ADAMVariant.variantType and
// ADAMGenotype.alleleVariantType. The SV symbol is the one that the svType
// fields refine further via StructuralVariantType. Avro encodes enum values
// by symbol position, so existing symbols must not be reordered.
enum VariantType {
SNP,
MNP,
Insertion,
Deletion,
Complex,
SV
}
// One variant allele at a reference position, with site-level statistics
// and optional structural-variant details. Fields declared `{ null, T }`
// default to null; isReference and filtersRun are declared
// `{ boolean, null }` and default to false.
record ADAMVariant {
// reference identifiers
union { null, int } referenceId = null;
union { null, string } referenceName = null;
union { null, long } referenceLength = null;
union { null, string } referenceUrl = null;
// start position of variant against reference
union { null, long } position = null;
// base pair string describing reference allele at locus
union { null, string } referenceAllele = null;
// true if this variant is the reference allele
union { boolean, null } isReference = false;
// base pair string describing this allele
union { null, string } variant = null;
// enum to describe type of variant called
union { null, VariantType } variantType = null;
// variant identifier
union { null, string } id = null;
// phred scaled quality of variant call
union { null, int } quality = null;
// comma delimited string describing filters run on variant that have failed, null if filters were run and passed
// (null is also the default, so read this together with filtersRun below)
union { null, string } filters = null;
// indicates whether filters have been run or not
union { boolean, null } filtersRun = false;
// frequency with which this allele occurs/occurred
union { null, double } alleleFrequency = null;
// RMS base quality at variant site
union { null, int } rmsBaseQuality = null;
// RMS mapping quality for reads aligned to variant site
union { null, int } siteRmsMappingQuality = null;
// count of reads with map quality == 0 aligned to variant site
union { null, int } siteMapQZeroCounts = null;
// total number of reads mapped to variant site
union { null, int } totalSiteMapCounts = null;
// total number of samples which had at least 1 read aligned at variant site
union { null, int } numberOfSamplesWithData = null;
// total number of samples in variant calling run
union { null, int } totalNumberOfSamplesCount = null;
// ratio representing # forward strand reads / # reverse strand reads aligned at variant site
union { null, double } strandBias = null;
// further descriptor for structural variants
// type of structural variant called
union { null, StructuralVariantType } svType = null;
// length of variant relative to reference allele - deletions have negative values, inserts have positive values
union { null, long } svLength = null;
// indicates whether start/end of SV is known precisely
union { null, boolean } svIsPrecise = null;
// end position of SV - for imprecise variants, best guess for end
union { null, long } svEnd = null;
// for imprecise variants, these fields represent the svConfidence interval upper/lower bounds for the SV start/end
union { null, long } svConfidenceIntervalStartLow = null;
union { null, long } svConfidenceIntervalStartHigh = null;
union { null, long } svConfidenceIntervalEndLow = null;
union { null, long } svConfidenceIntervalEndHigh = null;
}
// One allele called for one sample at a reference position, with
// likelihoods, per-site read statistics, optional structural-variant
// details, and phasing information. Fields declared `{ null, T }` default to
// null; isPhased and isPhaseSwitch are declared `{ boolean, null }` and
// default to false. Note that isReference here is `{ null, boolean }` with a
// null default, unlike ADAMVariant.isReference which defaults to false.
record ADAMGenotype {
// reference identifiers
union { null, int } referenceId = null;
union { null, string } referenceName = null;
union { null, long } referenceLength = null;
union { null, string } referenceUrl = null;
// start position of genotype call against reference
union { null, long } position = null;
// identifier for sample in call set
union { null, string } sampleId = null;
// ploidy at call site - this is the number of chromosomes we have seen/called at this site in this specific sample
// this is not the number of chromosomes expected by the reference genome at this site, e.g. if we genotype
// an individual with trisomy 21, we expect the ploidy of all calls on chr 21 of this sample to be 3
union { null, int } ploidy = null;
// number of haplotype at a specific call site - this is important for variant callers that perform assembly or
// other methods leading to phased calls. specifically, these methods function by assembling candidate haplotypes,
// which represent the expected base sequence of an individual chromosome across multiple sites
// this number need not be provided for unphased calls, but if it is provided it is expected that all genotypes
// of a specific sample at a specific site
// NOTE(review): the sentence above ends without stating what is expected of
// those genotypes; the intended constraint is not recoverable from this file
union { null, int } haplotypeNumber = null;
// allele variant type
union { null, VariantType } alleleVariantType = null;
// base pair string describing allele
union { null, string } allele = null;
// true if is reference
union { null, boolean } isReference = null;
// base pair string describing reference allele
union { null, string } referenceAllele = null;
// expected value of number of chromosomes with this allele at this site. this is a function of the genotype
// state likelihoods.
union { null, double } expectedAlleleDosage = null;
// phred scaled quality score describing likelihood genotype is correct
union { null, int } genotypeQuality = null;
// coverage at genotype site
union { null, int } depth = null;
// comma delimited list of phred scaled genotype likelihoods
union { null, string } phredLikelihoods = null;
// comma delimited list of phred scaled posteriors used when calculating genotype likelihoods
union { null, string } phredPosteriorLikelihoods = null;
// comma delimited list describing phred scaled likelihoods of different ploidy when unclear for site
union { null, string } ploidyStateGenotypeLikelihoods = null;
// phred scaled quality score for assembled haplotype
union { null, int } haplotypeQuality = null;
// RMS base quality score for reads aligned to genotyped site
union { null, int } rmsBaseQuality = null;
// RMS mapping quality for reads aligned to genotyped site
union { null, int } rmsMappingQuality = null;
// number of reads mapped at site on forward strand
union { null, int } readsMappedForwardStrand = null;
// number of reads mapped with mapq = 0
union { null, int } readsMappedMapQ0 = null;
// further descriptor for structural variants
// type of structural variant called
union { null, StructuralVariantType } svType = null;
// length of variant relative to reference allele - deletions have negative values, inserts have positive values
union { null, long } svLength = null;
// indicates whether start/end of SV is known precisely
union { null, boolean } svIsPrecise = null;
// end position of SV - for imprecise variants, best guess for end
union { null, long } svEnd = null;
// for imprecise variants, these fields represent the svConfidence interval upper/lower bounds for the SV start/end
union { null, long } svConfidenceIntervalStartLow = null;
union { null, long } svConfidenceIntervalStartHigh = null;
union { null, long } svConfidenceIntervalEndLow = null;
union { null, long } svConfidenceIntervalEndHigh = null;
// true if call is phased
union { boolean, null } isPhased = false;
// true if call is phased and phase has switched at site, null if not phased
// NOTE(review): the declared default is false, not null, so a record that
// leaves this field unset reads back as false - confirm which value writers
// are expected to use for unphased calls
union { boolean, null } isPhaseSwitch = false;
// if phased, identifier for phasing set
union { null, string } phaseSetId = null;
// phred scaled quality for phasing
union { null, int } phaseQuality = null;
}
/**
* Begin Variant/genotype annotation records.
*
* Any updates that add annotations should impact three or four files:
* - adam-format/.../adam.avdl (this file)
* - rdd/AdamContext.scala --> adamVariantLoad function
* - If there is a corresponding conversion of that data between VCF and ADAM, commands/VariantContextConverter.scala
*/
// Annotation record for a variant site: identifies the site by reference and
// position and records which external collections it has been seen in. The
// four membership flags are declared `{ boolean, null }` and default to
// false; the remaining fields default to null.
record ADAMVariantDomain {
// reference identifiers
union { null, int } referenceId = null;
union { null, string } referenceName = null;
union { null, long } referenceLength = null;
union { null, string } referenceUrl = null;
// position of variant site
union { null, long } position = null;
// fields denote whether variant has been seen in dbSNP, Hapmap2/3, or 1000 genomes
union { boolean, null } inDbSNP = false;
union { boolean, null } inHM2 = false;
union { boolean, null } inHM3 = false;
union { boolean, null } in1000G = false;
}
// Annotation record holding sample-level identification (ethnicity, cohort)
// plus the same record group metadata fields that appear on ADAMRecord and
// ADAMPileup. Every field is optional and defaults to null. The record
// carries no sample id or position of its own, so how it is joined to a
// genotype is not visible in this schema.
record ADAMGenotypeIdentification {
// identifiers for sample cohort and sample ethnicity
union { null, string } sampleEthnicity = null;
union { null, string } sampleCohort = null;
// record group identifier from sequencing run
union { null, string } recordGroupSequencingCenter = null;
union { null, string } recordGroupDescription = null;
// NOTE(review): epoch unit (seconds vs. milliseconds) is not stated - confirm
union { null, long } recordGroupRunDateEpoch = null;
union { null, string } recordGroupFlowOrder = null;
union { null, string } recordGroupKeySequence = null;
union { null, string } recordGroupLibrary = null;
union { null, int } recordGroupPredictedMedianInsertSize = null;
union { null, string } recordGroupPlatform = null;
union { null, string } recordGroupPlatformUnit = null;
union { null, string } recordGroupSample = null;
}
}