// adam-format/src/main/resources/avro/adam.avdl

@namespace("edu.berkeley.cs.amplab.adam.avro")
protocol ADAM {

// One read record with its alignment, mate, flag, attribute, and record group
// information flattened into a single level of fields.
//
// Every field is a union with null, so any value may be absent. Fields declared
// as { null, T } default to null; the read flags are declared as
// { boolean, null } and default to false.
record ADAMRecord {

    /**
     * These two fields, along with the two
     * reference{Length, Url} fields at the bottom
     * of the schema, collectively form the contents
     * of the Sequence Dictionary embedded in these
     * records from the BAM / SAM itself.
     */
    union { null, string } referenceName = null;
    union { null, int } referenceId = null;

    // 0-based reference position start
    union { null, long } start = null;

    // Per-read values.
    // NOTE(review): names mirror SAM columns (MAPQ, QNAME, SEQ, CIGAR, QUAL);
    // the exact encoding of `qual` is not stated in this file - confirm with the writer.
    union { null, int } mapq = null;
    union { null, string } readName = null;
    union { null, string } sequence = null;
    union { null, string } mateReference = null;
    union { null, long } mateAlignmentStart = null;
    union { null, string } cigar = null;
    union { null, string } qual = null;
    union { null, string } recordGroupName = null;
    union { null, int } recordGroupId = null;

    // Read flags (all default to false)
    // boolean is listed first in each union because an Avro union default must
    // match the first branch of the union; null is still an allowed value.
    union { boolean, null } readPaired = false;
    union { boolean, null } properPair = false;
    union { boolean, null } readMapped = false;
    union { boolean, null } mateMapped = false;
    union { boolean, null } readNegativeStrand = false;
    union { boolean, null } mateNegativeStrand = false;
    union { boolean, null } firstOfPair = false;
    union { boolean, null } secondOfPair = false;
    union { boolean, null } primaryAlignment = false;
    union { boolean, null } failedVendorQualityChecks = false;
    union { boolean, null } duplicateRead = false;

    // Commonly used optional attributes
    // NOTE(review): presumably the SAM "MD" tag - confirm.
    union { null, string } mismatchingPositions = null;

    // Remaining optional attributes flattened into a string
    // NOTE(review): the delimiter/format of this string is not defined here.
    union { null, string } attributes = null;

    // record group identifier from sequencing run
    // NOTE(review): the epoch unit of recordGroupRunDateEpoch (seconds vs.
    // milliseconds) is not stated in this file - confirm against the writer.
    union { null, string } recordGroupSequencingCenter = null;
    union { null, string } recordGroupDescription = null;
    union { null, long } recordGroupRunDateEpoch = null;
    union { null, string } recordGroupFlowOrder = null;
    union { null, string } recordGroupKeySequence = null;
    union { null, string } recordGroupLibrary = null;
    union { null, int } recordGroupPredictedMedianInsertSize = null;
    union { null, string } recordGroupPlatform = null;
    union { null, string } recordGroupPlatformUnit = null;
    union { null, string } recordGroupSample = null;

    // NOTE(review): presumably the mate's counterpart of referenceId - confirm.
    union { null, int } mateReferenceId = null;

    // See the note on referenceName / referenceId at the top of this record:
    // these two fields complete the sequence dictionary entry for the read.
    union { null, long }   referenceLength = null;
    union { null, string } referenceUrl = null;

    // NOTE(review): presumably the same sequence dictionary details for the
    // mate's reference - confirm.
    union { null, long } mateReferenceLength = null;
    union { null, string } mateReferenceUrl = null;
}

// Nucleotide symbols, including ambiguity codes; the trailing comment on each
// symbol gives its meaning.
//
// Symbol order matters: Avro binary-encodes an enum value as the zero-based
// position of its symbol in this list, so do not reorder existing symbols.
enum Base {
    A,
    C,
    T,
    G,
    U,
    N, // any
    X, // any
    K, // keto: G/T
    M, // aMino: A/C
    R, // puRine: A/G
    Y, // pYrimidine: C/T
    S, // Strong: C/G
    W, // Weak: A/T
    B, // not A
    V, // not T
    H, // not G
    D  // not C
}

// A named contig together with its bases.
//
// `sequence` is the only field that is not nullable: it is an array of Base
// symbols that defaults to the empty array. All other fields are nullable and
// default to null.
record ADAMNucleotideContig {
    union { null, string } contigName = null;
    union { null, int } contigId = null;
    union { null, string } description = null;
    array<Base> sequence = [];
    // NOTE(review): presumably the number of bases in the contig; whether it
    // must equal the length of `sequence` is not stated here - confirm.
    union { null, long } sequenceLength = null;
    union { null, string } url = null;
}

// One pileup entry: a reference position, the reference and read bases observed
// there, associated quality/count values, the originating read, and the read's
// record group details. All fields are nullable and default to null.
record ADAMPileup {
    // Reference coordinates of this entry.
    // NOTE(review): whether `position` is 0-based (like ADAMRecord.start) is
    // not stated here - confirm.
    union { null, string } referenceName = null;
    union { null, int } referenceId = null;
    union { null, long } position = null;
    // NOTE(review): rangeOffset / rangeLength semantics are not documented in
    // this file - confirm with the code that populates them.
    union { null, int } rangeOffset = null;
    union { null, int } rangeLength = null;
    union { null, Base } referenceBase = null;
    union { null, Base } readBase = null;
    union { null, int } sangerQuality = null;
    union { null, int } mapQuality = null;
    union { null, int } numSoftClipped = null;
    union { null, int } numReverseStrand = null;
    union { null, int } countAtPosition = null;

    // Read that contributed this entry.
    union { null, string } readName = null;
    union { null, long } readStart = null;
    union { null, long } readEnd = null;

    // record group identifier from sequencing run
    // (same set of record group fields as declared on ADAMRecord)
    union { null, string } recordGroupSequencingCenter = null;
    union { null, string } recordGroupDescription = null;
    union { null, long } recordGroupRunDateEpoch = null;
    union { null, string } recordGroupFlowOrder = null;
    union { null, string } recordGroupKeySequence = null;
    union { null, string } recordGroupLibrary = null;
    union { null, int } recordGroupPredictedMedianInsertSize = null;
    union { null, string } recordGroupPlatform = null;
    union { null, string } recordGroupPlatformUnit = null;
    union { null, string } recordGroupSample = null;
}

// Pairs a pileup entry with the full read records that support it.
// Neither field is a union with null or declares a default, so both must be
// supplied when a record is written.
record ADAMNestedPileup {
     // nested pileup data type - contains reference to list of overlapping reads
     // note: cannot be used with databases (e.g. hive/shark)
     ADAMPileup pileup;
     array<ADAMRecord> readEvidence;
}

// Kinds of structural variant; used by the svType fields of ADAMVariant and
// ADAMGenotype.
//
// Avro binary-encodes an enum value as the zero-based position of its symbol,
// so do not reorder existing symbols.
enum StructuralVariantType {
    Deletion,
    Insertion,
    Duplication,
    Inversion,
    CopyNumberVariation,
    TandemDuplication,
    MobileElementDeletion,
    MobileElementInsertion
}

// Top-level variant classification; used by ADAMVariant.variantType and
// ADAMGenotype.alleleVariantType.
//
// Insertion and Deletion also appear in StructuralVariantType; the two enums
// are distinct types.
// NOTE(review): SV presumably means "structural variant", with the detail
// carried in the svType field - confirm.
enum VariantType {
    SNP,
    MNP,
    Insertion,
    Deletion,
    Complex,
    SV
}

// One allele observed at a reference position, with site-level statistics and
// optional structural variant details.
//
// All fields are nullable. Fields declared { null, T } default to null;
// isReference and filtersRun are declared { boolean, null } and default to
// false (boolean is first because an Avro union default must match the first
// branch).
record ADAMVariant {
  // reference identifiers
  union { null, int } referenceId = null;
  union { null, string } referenceName = null;
  union { null, long } referenceLength = null;
  union { null, string } referenceUrl = null;

  // start position of variant against reference
  union { null, long } position = null;
  // base pair string describing reference allele at locus
  union { null, string } referenceAllele = null;
  // true if this variant is the reference allele
  // NOTE(review): defaults to false here, whereas ADAMGenotype.isReference is
  // declared { null, boolean } with a null default.
  union { boolean, null } isReference = false;
  // base pair string describing this allele
  union { null, string } variant = null;
  // enum to describe type of variant called
  union { null, VariantType } variantType = null;
  // variant identifier
  union { null, string } id = null;
  // phred scaled quality of variant call
  union { null, int } quality = null;
  // comma delimited string describing filters run on variant that have failed, null if filters were run and passed
  // (null is also the default, so use filtersRun to tell "passed" apart from "filters not run")
  union { null, string } filters = null;
  // indicates whether filters have been run or not
  union { boolean, null } filtersRun = false;

  // frequency with which this allele occurs/occurred
  union { null, double } alleleFrequency = null;
  // RMS base quality at variant site
  union { null, int } rmsBaseQuality = null;
  // RMS mapping quality for reads aligned to variant site
  union { null, int } siteRmsMappingQuality = null;
  // count of reads with map quality == 0 aligned to variant site
  union { null, int } siteMapQZeroCounts = null;
  // total number of reads mapped to variant site
  union { null, int } totalSiteMapCounts = null;
  // total number of samples which had at least 1 read aligned at variant site
  union { null, int } numberOfSamplesWithData = null;
  // total number of samples in variant calling run
  union { null, int } totalNumberOfSamplesCount = null;
  // ratio representing # forward strand reads / # reverse strand reads aligned at variant site
  union { null, double } strandBias = null;

  // further descriptor for structural variants
  // type of structural variant called
  union { null, StructuralVariantType } svType = null;

  // length of variant relative to reference allele - deletions have negative values, inserts have positive values
  union { null, long } svLength = null;
  // indicates whether start/end of SV is known precisely
  union { null, boolean } svIsPrecise = null;
  // end position of SV - for imprecise variants, best guess for end
  union { null, long } svEnd = null;
  
  // for imprecise variants, these fields represent the confidence interval lower/upper bounds for the SV start/end
  union { null, long } svConfidenceIntervalStartLow = null;
  union { null, long } svConfidenceIntervalStartHigh = null;
  union { null, long } svConfidenceIntervalEndLow = null;
  union { null, long } svConfidenceIntervalEndHigh = null;

}

// One allele call for one sample at a reference position, with call-level
// statistics, optional structural variant details, and phasing information.
//
// All fields are nullable. Fields declared { null, T } default to null;
// isPhased and isPhaseSwitch are declared { boolean, null } and default to
// false (boolean is first because an Avro union default must match the first
// branch).
record ADAMGenotype {
  // reference identifiers
  union { null, int } referenceId = null;
  union { null, string } referenceName = null;
  union { null, long } referenceLength = null;
  union { null, string } referenceUrl = null;

  // start position of genotype call against reference
  union { null, long } position = null;
  // identifier for sample in call set
  union { null, string } sampleId = null;

  // ploidy at call site - this is the number of chromosomes we have seen/called at this site in this specific sample
  // this is not the number of chromosomes expected by the reference genome at this site, e.g. if we genotype
  // an individual with trisomy 21, we expect the ploidy of all calls on chr 21 of this sample to be 3
  union { null, int } ploidy = null;
  // number of haplotype at a specific call site - this is important for variant callers that perform assembly or
  // other methods leading to phased calls. specifically, these methods function by assembling candidate haplotypes,
  // which represent the expected base sequence of an individual chromosome across multiple sites.
  // this number need not be provided for unphased calls, but if it is provided it is expected that all genotypes
  // of a specific sample at a specific site ...
  // NOTE(review): the sentence above is cut off in the original comment; the
  // intended constraint on haplotypeNumber is not stated - confirm with the author.
  union { null, int } haplotypeNumber = null;
  // allele variant type
  union { null, VariantType } alleleVariantType = null;
  // base pair string describing allele
  union { null, string } allele = null;
  // true if is reference
  // NOTE(review): defaults to null here, whereas ADAMVariant.isReference is
  // declared { boolean, null } with a false default.
  union { null, boolean } isReference = null;
  // base pair string describing reference allele
  union { null, string } referenceAllele = null;
  // expected value of number of chromosomes with this allele at this site. this is a function of the genotype
  // state likelihoods.
  union { null, double } expectedAlleleDosage = null;
  // phred scaled quality score describing likelihood genotype is correct
  union { null, int } genotypeQuality = null;
  // coverage at genotype site
  union { null, int } depth = null;
  // comma delimited list of phred scaled genotype likelihoods
  union { null, string } phredLikelihoods = null;
  // comma delimited list of phred scaled posteriors used when calculating genotype likelihoods
  union { null, string } phredPosteriorLikelihoods = null;
  // comma delimited list describing phred scaled likelihoods of different ploidy when unclear for site
  union { null, string } ploidyStateGenotypeLikelihoods = null;
  // phred scaled quality score for assembled haplotype
  union { null, int } haplotypeQuality = null;
  // RMS base quality score for reads aligned to genotyped site
  union { null, int } rmsBaseQuality = null;
  // RMS mapping quality for reads aligned to genotyped site
  union { null, int } rmsMappingQuality = null;
  // number of reads mapped at site on forward strand
  union { null, int } readsMappedForwardStrand = null;
  // number of reads mapped with mapq = 0
  union { null, int } readsMappedMapQ0 = null;

  // further descriptor for structural variants
  // type of structural variant called
  union { null, StructuralVariantType } svType = null;

  // length of variant relative to reference allele - deletions have negative values, inserts have positive values
  union { null, long } svLength = null;
  // indicates whether start/end of SV is known precisely
  union { null, boolean } svIsPrecise = null;
  // end position of SV - for imprecise variants, best guess for end
  union { null, long } svEnd = null;
  
  // for imprecise variants, these fields represent the confidence interval lower/upper bounds for the SV start/end
  union { null, long } svConfidenceIntervalStartLow = null;
  union { null, long } svConfidenceIntervalStartHigh = null;
  union { null, long } svConfidenceIntervalEndLow = null;
  union { null, long } svConfidenceIntervalEndHigh = null;

  // true if call is phased
  union { boolean, null } isPhased = false;
  // true if call is phased and phase has switched at site, null if not phased
  // NOTE(review): the declared default is false, not null, so a record that
  // never sets this field reads back as false; writers must set null
  // explicitly to signal "not phased" - confirm which is intended.
  union { boolean, null } isPhaseSwitch = false;
  // if phased, identifier for phasing set
  union { null, string } phaseSetId = null;
  // phred scaled quality for phasing
  union { null, int } phaseQuality = null;
}

/**
 * Begin variant/genotype annotation records.
 *
 * Any update that adds annotations should touch the following files:
 * - adam-format/.../adam.avdl (this file)
 * - rdd/AdamContext.scala --> adamVariantLoad function
 * - commands/VariantContextConverter.scala, if that data has a corresponding
 *   conversion between VCF and ADAM
 */

// Annotation record stating which known-variant collections contain the
// variant at a given site.
//
// The reference and position fields are nullable with a null default; the four
// membership flags are declared { boolean, null } and default to false
// (boolean is first because an Avro union default must match the first branch).
record ADAMVariantDomain {
  // reference identifiers
  union { null, int } referenceId = null;
  union { null, string } referenceName = null;
  union { null, long } referenceLength = null;
  union { null, string } referenceUrl = null;

  // position of variant site
  union { null, long } position = null;

  // fields denote whether variant has been seen in dbSNP, Hapmap2/3, or 1000 genomes
  union { boolean, null } inDbSNP = false;
  union { boolean, null } inHM2 = false;
  union { boolean, null } inHM3 = false;
  union { boolean, null } in1000G = false;
}

// Annotation record carrying sample-level descriptors and record group details.
// All fields are nullable and default to null.
// NOTE(review): this record declares no sample id, reference, or position
// field, so how it is associated with a particular ADAMGenotype is not visible
// in this file - confirm with the code that reads/writes it.
record ADAMGenotypeIdentification {

  // identifiers for sample cohort and sample ethnicity
  union { null, string } sampleEthnicity = null;
  union { null, string } sampleCohort = null;

  // record group identifier from sequencing run
  // (same set of record group fields as declared on ADAMRecord and ADAMPileup)
  union { null, string } recordGroupSequencingCenter = null;
  union { null, string } recordGroupDescription = null;
  union { null, long } recordGroupRunDateEpoch = null;
  union { null, string } recordGroupFlowOrder = null;
  union { null, string } recordGroupKeySequence = null;
  union { null, string } recordGroupLibrary = null;
  union { null, int } recordGroupPredictedMedianInsertSize = null;
  union { null, string } recordGroupPlatform = null;
  union { null, string } recordGroupPlatformUnit = null;
  union { null, string } recordGroupSample = null;

}

}