Feat/aliquot 2.0.0 (#48)

* feat(aliquot-2): UNIPROT_ISOFORM
* feat(aliquot-2): MANE, APPRIS, FLAGS
* fix(aliquot-2): shift RefSeq
* feat(aliquot-2): 1000G columns
* feat(aliquot-2): ESP columns
* feat(aliquot-2): gnomAD columns
* feat(aliquot-2): VEP max AF columns
* feat(aliquot-2): non-cancer gnomAD columns
* feat(aliquot-2): TF and miRNA columns
* feat(aliquot-2): drop exac
* feat(genie): genie 2.0.0

Co-authored-by: Charles Czysz <[email protected]>
NCI-GDC · Jun 25, 2021 · 04753bc · 04753bc
1 parent b203de9
commit 04753bc
Show file tree

Hide file tree

Showing 2 changed files with 227 additions and 0 deletions.
diff --git a/maflib/schemas/gdc-2.0.0-aliquot.json b/maflib/schemas/gdc-2.0.0-aliquot.json
@@ -0,0 +1,118 @@
+{
+	"version" : "gdc-1.0.0",
+	"annotation-spec" : "gdc-2.0.0-aliquot",
+	"extends" : "gdc-1.0.0",
+	"columns" : [
+            [ "HGVSc", "NullableStringColumn", "The coding sequence of the variant in HGVS recommended format" ],
+            [ "HGVSp", "NullableStringColumn", "The protein sequence of the variant in HGVS recommended format. \"p.=\" signifies no change in the protein" ],
+            [ "HGVSp_Short", "NullableStringColumn", "Same as the HGVSp column, but using 1-letter amino-acid codes" ],
+            [ "Transcript_ID", "NullableStringColumn", "Ensembl ID of the transcript affected by the variant" ],
+            [ "Exon_Number", "NullableStringColumn", "The exon number (out of total number)" ],
+            [ "t_depth", "ZeroBasedIntegerColumn", "Read depth across this locus in tumor BAM" ],
+            [ "t_ref_count", "ZeroBasedIntegerColumn", "Read depth supporting the reference allele in tumor BAM" ],
+            [ "t_alt_count", "ZeroBasedIntegerColumn", "Read depth supporting the variant allele in tumor BAM" ],
+            [ "n_depth", "NullableZeroBasedIntegerColumn", "Read depth across this locus in normal BAM" ],
+            [ "n_ref_count", "NullableZeroBasedIntegerColumn", "Read depth supporting the reference allele in normal BAM (cleared in somatic MAF)" ],
+            [ "n_alt_count", "NullableZeroBasedIntegerColumn", "Read depth supporting the variant allele in normal BAM (cleared in somatic MAF)" ],
+            [ "all_effects", "SequenceOfStrings", "A semicolon delimited list of all possible variant effects, sorted by priority ([Symbol,Consequence,HGVSp_Short,Transcript_ID,RefSeq,HGVSc,Impact,Canonical,Sift,PolyPhen,Strand])" ],
+            [ "Allele", "StringColumn", "The variant allele used to calculate the consequence" ],
+            [ "Gene", "NullableStringColumn", "Stable Ensembl ID of affected gene" ],
+            [ "Feature", "NullableStringColumn", "Stable Ensembl ID of feature (transcript, regulatory, motif)" ],
+            [ "Feature_type", "FeatureType", "Type of feature. Currently one of Transcript, RegulatoryFeature, MotifFeature (or blank)" ],
+            [ "One_Consequence", "StringColumn", "The single consequence of the canonical transcript in sequence ontology terms" ],
+            [ "Consequence", "SequenceOfStrings", "Consequence type of this variant" ],
+            [ "cDNA_position", "NullableStringColumn", "Relative position of base pair in the cDNA sequence as a fraction. A \"-\" symbol is displayed as the numerator if the variant does not appear in cDNA" ],
+            [ "CDS_position", "NullableStringColumn", "Relative position of base pair in coding sequence. A \"-\" symbol is displayed as the numerator if the variant does not appear in coding sequence" ],
+            [ "Protein_position", "NullableStringColumn", "Relative position of affected amino acid in protein. A \"-\" symbol is displayed as the numerator if the variant does not appear in coding sequence" ],
+            [ "Amino_acids", "NullableStringColumn", "Only given if the variation affects the protein-coding sequence" ],
+            [ "Codons", "NullableStringColumn", "The alternative codons with the variant base in upper case" ],
+            [ "Existing_variation", "SequenceOfStrings", "Known identifier of existing variation" ],
+            [ "ALLELE_NUM", "ZeroBasedIntegerColumn", "Allele number from input; 0 is reference, 1 is first alternate etc." ],
+            [ "DISTANCE", "NullableIntegerColumn", "Shortest distance from the variant to transcript" ],
+            [ "TRANSCRIPT_STRAND", "TranscriptStrand", "The DNA strand (1 or -1) on which the transcript/feature lies" ],
+            [ "SYMBOL", "NullableStringColumn", "The gene symbol" ],
+            [ "SYMBOL_SOURCE", "NullableStringColumn", "The source of the gene symbol" ],
+            [ "HGNC_ID", "NullableStringColumn", "Gene identifier from the HUGO Gene Nomenclature Committee if applicable" ],
+            [ "BIOTYPE", "NullableStringColumn", "Biotype of transcript" ],
+            [ "CANONICAL", "Canonical", "A flag (YES) indicating that the VEP-based canonical transcript, the longest translation, was used for this gene. If not, the value is null" ],
+            [ "CCDS", "NullableStringColumn", "The CCDS identifier for this transcript, where applicable" ],
+            [ "ENSP", "NullableStringColumn", "The Ensembl protein identifier of the affected transcript" ],
+            [ "SWISSPROT", "SequenceOfStrings", "UniProtKB/Swiss-Prot accession" ],
+            [ "TREMBL", "SequenceOfStrings", "UniProtKB/TrEMBL identifier of protein product" ],
+            [ "UNIPARC", "SequenceOfStrings", "UniParc identifier of protein product" ],
+            [ "UNIPROT_ISOFORM", "SequenceOfStrings", "Direct mappings to UniProtKB isoforms" ],
+            [ "RefSeq", "SequenceOfStrings", "RefSeq identifier for this transcript" ],
+            [ "MANE", "SequenceOfStrings", "MANE (Matched Annotation by NCBI and EMBL-EBI) Transcript" ],
+            [ "APPRIS", "NullableStringColumn", "Annotates alternatively spliced transcripts as primary or alternate based on a range of computational methods" ],
+            [ "FLAGS", "SequenceOfStrings", "Transcript quality flags" ],
+            [ "SIFT", "NullableStringColumn", "The SIFT prediction and/or score, with both given as prediction (score)" ],
+            [ "PolyPhen", "NullableStringColumn", "The PolyPhen prediction and/or score" ],
+            [ "EXON", "NullableStringColumn", "The exon number (out of total number)" ],
+            [ "INTRON", "NullableStringColumn", "The intron number (out of total number)" ],
+            [ "DOMAINS", "SequenceOfStrings", "The source and identifier of any overlapping protein domains" ],
+            [ "1000G_AF", "NullableFloatColumn", "Non-reference allele and frequency of existing variant in 1000 Genomes" ],
+            [ "1000G_AFR_AF", "NullableFloatColumn", "Non-reference allele and frequency of existing variant in 1000 Genomes combined African population" ],
+            [ "1000G_AMR_AF", "NullableFloatColumn", "Non-reference allele and frequency of existing variant in 1000 Genomes combined American population" ],
+            [ "1000G_EAS_AF", "NullableFloatColumn", "Non-reference allele and frequency of existing variant in 1000 Genomes combined East Asian population" ],
+            [ "1000G_EUR_AF", "NullableFloatColumn", "Non-reference allele and frequency of existing variant in 1000 Genomes combined European population" ],
+            [ "1000G_SAS_AF", "NullableFloatColumn", "Non-reference allele and frequency of existing variant in 1000 Genomes combined South Asian population" ],
+            [ "ESP_AA_AF", "NullableFloatColumn", "Non-reference allele and frequency of existing variant in NHLBI-ESP African American population" ],
+            [ "ESP_EA_AF", "NullableFloatColumn", "Non-reference allele and frequency of existing variant in NHLBI-ESP European American population" ],
+            [ "gnomAD_AF", "NullableFloatColumn", "Frequency of existing variant in gnomAD exomes combined population" ],
+            [ "gnomAD_AFR_AF", "NullableFloatColumn", "Frequency of existing variant in gnomAD exomes African/American population" ],
+            [ "gnomAD_AMR_AF", "NullableFloatColumn", "Frequency of existing variant in gnomAD exomes American population" ],
+            [ "gnomAD_ASJ_AF", "NullableFloatColumn", "Frequency of existing variant in gnomAD exomes Ashkenazi Jewish population" ],
+            [ "gnomAD_EAS_AF", "NullableFloatColumn", "Frequency of existing variant in gnomAD exomes East Asian population" ],
+            [ "gnomAD_FIN_AF", "NullableFloatColumn", "Frequency of existing variant in gnomAD exomes Finnish population" ],
+            [ "gnomAD_NFE_AF", "NullableFloatColumn", "Frequency of existing variant in gnomAD exomes Non-Finnish European population" ],
+            [ "gnomAD_OTH_AF", "NullableFloatColumn", "Frequency of existing variant in gnomAD exomes other combined population" ],
+            [ "gnomAD_SAS_AF", "NullableFloatColumn", "Frequency of existing variant in gnomAD exomes South Asian population" ],
+            [ "MAX_AF", "NullableFloatColumn", "Maximum observed allele frequency in 1000 Genomes, ESP and ExAC/gnomAD" ],
+            [ "MAX_AF_POPS", "SequenceOfStrings", "Populations in which maximum allele frequency was observed" ],
+            [ "gnomAD_non_cancer_AF", "NullableFloatColumn", "Frequency of existing variant in gnomAD genomes combined non-cancer population" ],
+            [ "gnomAD_non_cancer_AFR_AF", "NullableFloatColumn", "Frequency of existing variant in gnomAD genomes non-cancer African/American population" ],
+            [ "gnomAD_non_cancer_AMI_AF", "NullableFloatColumn", "Frequency of existing variant in gnomAD genomes non-cancer Amish population" ],
+            [ "gnomAD_non_cancer_AMR_AF", "NullableFloatColumn", "Frequency of existing variant in gnomAD genomes non-cancer Latino population" ],
+            [ "gnomAD_non_cancer_ASJ_AF", "NullableFloatColumn", "Frequency of existing variant in gnomAD genomes non-cancer Ashkenazi Jewish population" ],
+            [ "gnomAD_non_cancer_EAS_AF", "NullableFloatColumn", "Frequency of existing variant in gnomAD genomes non-cancer East Asian population" ],
+            [ "gnomAD_non_cancer_FIN_AF", "NullableFloatColumn", "Frequency of existing variant in gnomAD genomes non-cancer Finnish population" ],
+            [ "gnomAD_non_cancer_MID_AF", "NullableFloatColumn", "Frequency of existing variant in gnomAD genomes non-cancer Middle Eastern population" ],
+            [ "gnomAD_non_cancer_NFE_AF", "NullableFloatColumn", "Frequency of existing variant in gnomAD genomes non-cancer Non-Finnish European population" ],
+            [ "gnomAD_non_cancer_OTH_AF", "NullableFloatColumn", "Frequency of existing variant in gnomAD genomes non-cancer Other population" ],
+            [ "gnomAD_non_cancer_SAS_AF", "NullableFloatColumn", "Frequency of existing variant in gnomAD genomes non-cancer South Asian population" ],
+            [ "gnomAD_non_cancer_MAX_AF_adj", "NullableFloatColumn", "Maximum observed allele frequency in non-cancer gnomAD genomes populations after removing subpopulations with less than 2 allele counts" ],
+            [ "gnomAD_non_cancer_MAX_AF_POPS_adj", "SequenceOfStrings", "Non-cancer gnomAD genomes populations in which the maximum allele frequency was observed after removing those with less than 2 allele counts" ],
+            [ "CLIN_SIG", "SequenceOfStrings", "Clinical significance of variant from dbSNP" ],
+            [ "SOMATIC", "SequenceOfNullableYesOrNo", "Somatic status of each ID reported under Existing_variation (0, 1, or null)" ],
+            [ "PUBMED", "SequenceOfIntegers", "Pubmed ID(s) of publications that cite existing variant" ],
+            [ "TRANSCRIPTION_FACTORS", "SequenceOfStrings", "List of transcription factors which bind to the transcription factor binding profile" ],
+            [ "MOTIF_NAME", "NullableStringColumn", "The stable identifier of a transcription factor binding profile (TFBP) aligned at this position" ],
+            [ "MOTIF_POS", "NullableIntegerColumn", "The relative position of the variation in the aligned TFBP" ],
+            [ "HIGH_INF_POS", "NullableYOrN", "A flag indicating if the variant falls in a high information position of a transcription factor binding profile (TFBP) (Y, N, or null)" ],
+            [ "MOTIF_SCORE_CHANGE", "NullableFloatColumn", "The difference in motif score of the reference and variant sequences for the TFBP" ],
+            [ "miRNA", "SequenceOfStrings", "SO terms of overlapped miRNA secondary structure feature(s)" ],
+            [ "IMPACT", "Impact", "The impact modifier for the consequence type" ],
+            [ "PICK", "PickColumn", "Indicates if this block of consequence data was picked by VEP's pick feature (1 or null)" ],
+            [ "VARIANT_CLASS", "NullableStringColumn", "Sequence Ontology variant class" ],
+            [ "TSL", "NullableIntegerColumn", "Transcript support level, which is based on independent RNA analyses" ],
+            [ "HGVS_OFFSET", "NullableIntegerColumn", "Indicates by how many bases the HGVS notations for this variant have been shifted" ],
+            [ "PHENO", "SequenceOfNullableYesOrNo", "Indicates if existing variant is associated with a phenotype, disease or trait (0, 1, or null)" ],
+            [ "MINIMISED", "PickColumn", "Alleles in this variant have been converted to minimal representation before consequence calculation (1 or null)" ],
+            [ "GENE_PHENO", "NullableYesOrNo", "Indicates if gene that the variant maps to is associated with a phenotype, disease or trait (0, 1, or null)" ],
+            [ "FILTER", "SequenceOfStrings", "Copied from input VCF. This includes filters implemented directly by the variant caller and other external software used in the DNA-Seq pipeline. See below for additional details." ],
+            [ "CONTEXT", "StringColumn", "The reference allele per VCF specs, and its five flanking base pairs" ],
+            [ "src_vcf_id", "UUIDColumn", "GDC UUID for the input VCF file" ],
+            [ "tumor_bam_uuid", "UUIDColumn", "GDC UUID for the tumor bam file" ],
+            [ "normal_bam_uuid", "NullableUUIDColumn", "GDC UUID for the normal bam file" ],
+            [ "case_id", "UUIDColumn", "GDC UUID for the case" ],
+            [ "GDC_FILTER", "SequenceOfStrings", "GDC filters applied universally across all MAFs" ],
+            [ "COSMIC", "SequenceOfStrings", "Overlapping COSMIC variants" ],
+            [ "hotspot", "NullableYOrN", "A flag indicating if the variant is a known hotspot (Y, N, or null)" ],
+            [ "vcf_region", "StringColumn", "Colon separated string containing the CHROM, POS, ID, REF, and ALT columns from the VCF file (e.g., chrZ:20:rs1234:A:T) (not in somatic MAF)" ],
+            [ "vcf_info", "NullableStringColumn", "INFO column from VCF (not in somatic MAF)" ],
+            [ "vcf_format", "StringColumn", "FORMAT column from VCF (not in somatic MAF)" ],
+            [ "vcf_tumor_gt", "StringColumn", "Tumor sample genotype column from VCF (not in somatic MAF)" ],
+            [ "vcf_normal_gt", "NullableStringColumn", "Normal sample genotype column from VCF (not in somatic MAF)" ]
+	],
+	"filtered" : "None"
+}