diff --git a/INSTALL b/INSTALL index bcdd2f4e3..379c5aa7e 100644 --- a/INSTALL +++ b/INSTALL @@ -232,8 +232,10 @@ Alpine Linux Note: To install gsl-dev, it may be necessary to enable the "community" repository in /etc/apk/repositories. +Note: some older Alpine versions use libressl-dev rather than openssl-dev. + doas apk update # Ensure the package list is up to date -doas apk add autoconf automake make gcc musl-dev perl bash zlib-dev bzip2-dev xz-dev curl-dev libressl-dev gsl-dev perl-dev +doas apk add autoconf automake make gcc musl-dev perl bash zlib-dev bzip2-dev xz-dev curl-dev openssl-dev gsl-dev perl-dev OpenSUSE -------- diff --git a/LICENSE b/LICENSE index 6d40ae2d1..46dc0e0e3 100644 --- a/LICENSE +++ b/LICENSE @@ -723,11 +723,12 @@ Public License instead of this License. But first, please read ----------------------------------------------------------------------------- -LICENSE FOR VariantKey (https://github.com/Genomicsplc/variantkey) +LICENSE FOR VariantKey (https://github.com/tecnickcom/variantkey) The MIT License Copyright (c) 2017-2018 GENOMICS plc +Copyright (c) 2018-2023 Nicola Asuni - Tecnick.com Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/Makefile b/Makefile index b0cd99ead..7013cd594 100644 --- a/Makefile +++ b/Makefile @@ -42,7 +42,7 @@ OBJS = main.o vcfindex.o tabix.o \ regidx.o smpl_ilist.o csq.o vcfbuf.o \ mpileup.o bam2bcf.o bam2bcf_indel.o bam2bcf_iaux.o read_consensus.o bam_sample.o \ vcfsort.o cols.o extsort.o dist.o abuf.o \ - ccall.o em.o prob1.o kmin.o str_finder.o + ccall.o em.o prob1.o kmin.o str_finder.o gff.o PLUGIN_OBJS = vcfplugin.o prefix = /usr/local @@ -104,7 +104,7 @@ endif include config.mk -PACKAGE_VERSION = 1.17 +PACKAGE_VERSION = 1.18 # If building from a Git repository, replace $(PACKAGE_VERSION) with the Git # description of the working tree: either a release tag with the same value @@ -246,7 
+246,7 @@ vcfgtcheck.o: vcfgtcheck.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htsli vcfindex.o: vcfindex.c $(htslib_vcf_h) $(htslib_tbx_h) $(htslib_kstring_h) $(htslib_bgzf_h) $(bcftools_h) vcfisec.o: vcfisec.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcftools_h) $(filter_h) vcfmerge.o: vcfmerge.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) regidx.h $(bcftools_h) vcmp.h $(htslib_khash_h) -vcfnorm.o: vcfnorm.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_faidx_h) $(htslib_khash_str2int_h) $(bcftools_h) rbuf.h abuf.h +vcfnorm.o: vcfnorm.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_faidx_h) $(htslib_khash_str2int_h) $(bcftools_h) rbuf.h abuf.h gff.h vcfquery.o: vcfquery.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_str2int_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(convert_h) $(smpl_ilist_h) vcfroh.o: vcfroh.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(bcftools_h) HMM.h $(smpl_ilist_h) $(filter_h) vcfcnv.o: vcfcnv.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(htslib_khash_str2int_h) $(bcftools_h) HMM.h rbuf.h @@ -289,6 +289,7 @@ vcfbuf.o: vcfbuf.c $(htslib_vcf_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcf abuf.o: abuf.c $(htslib_vcf_h) $(bcftools_h) rbuf.h abuf.h extsort.o: extsort.c $(bcftools_h) extsort.h kheap.h smpl_ilist.o: smpl_ilist.c $(bcftools_h) $(smpl_ilist_h) +gff.o: gff.c gff.h regidx.h csq.o: csq.c $(htslib_hts_h) $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_h) $(htslib_khash_str2int_h) $(htslib_kseq_h) $(htslib_faidx_h) $(bcftools_h) $(filter_h) regidx.h kheap.h $(smpl_ilist_h) rbuf.h # test programs diff --git a/NEWS b/NEWS index 06c0593ca..62c4699ac 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,121 @@ +## Release 1.18 (25th July 2023) + + +Changes affecting the whole of bcftools, or multiple 
commands: + +* Support auto indexing during writing BCF and VCF.gz via new `--write-index` option + + +Changes affecting specific commands: + +* bcftools annotate + + - The `-m, --mark-sites` option can now be used to mark all sites without the + need to provide the `-a` file (#1861) + + - Fix a bug where the `-m` function did not respect the `--min-overlap` option (#1869) + + - Fix a bug when update of INFO/END results in assertion error (#1957) + +* bcftools concat + + - New option `--drop-genotypes` + +* bcftools consensus + + - Support higher-ploidy genotypes with `-H, --haplotype` (#1892) + + - Allow `--mark-ins` and `--mark-snv` with a character, similarly to `--mark-del` + +* bcftools convert + + - Support for conversion from tab-delimited files (CHROM,POS,REF,ALT) to sites-only VCFs + +* bcftools csq + + - New `--unify-chr-names` option to automatically unify different chromosome + naming conventions in the input GFF, fasta and VCF files (e.g. "chrX" vs "X") + + - More versatility in parsing various flavors of GFF + + - A new `--dump-gff` option to help with debugging and investigating the internals + of GFF parsing + + - When printing consequences in nonsense mediated decay transcripts, include 'NMD_transcript' + in the consequence part of the annotation. This is to make filtering easier and analogous to + VEP annotations. For example the consequence annotation + 3_prime_utr|PCGF3|ENST00000430644|NMD + is newly printed as + 3_prime_utr&NMD_transcript|PCGF3|ENST00000430644|NMD + +* bcftools gtcheck + + - Add stats for the number of sites matched in the GT-vs-GT, GT-vs-PL, etc modes. This + information is important for interpretation of the discordance score, as only the + GT-vs-GT matching can be interpreted as the number of mismatching genotypes. 
+ +* bcftools +mendelian2 + + - Fix in command line argument parsing, the `-p` and `-P` options were not + functioning (#1906) + +* bcftools merge + + - New `-M, --missing-rules` option to control the behavior of merging of vector tags + to prevent mixtures of known and missing values in tags when desired + + - Use values pertaining to the unknown allele (<*> or <NON_REF>) when available + to prevent mixtures of known and missing values (#1888) + + - Revamped line matching code to fix problems in gVCF merging where split gVCF blocks + would not update genotypes (#1891, #1164). + +* bcftools mpileup + + - Fix a bug in --indels-v2.0 which caused an endless loop when CIGAR operator 'H' or 'P' + was encountered + +* bcftools norm + + - The `-m, --multiallelics +` mode now preserves phasing (#1893) + + - Symbolic alleles are now normalized too (#1919) + + - New `-g, --gff-annot` option to right-align indels in forward transcripts to follow + HGVS 3'rule (#1929) + +* bcftools query + + - Force newline character in formatting expression when not given explicitly + + - Fix `-H` header output in formatting expressions containing newlines + +* bcftools reheader + + - Make `-f, --fai` aware of long contigs not representable by 32-bit integer (#1959) + +* bcftools +split-vep + + - Prevent a segfault when `-i/-e` use a VEP subfield not included in `-f` or `-c` (#1877) + + - New `-X, --keep-sites` option complementing the existing `-x, --drop-sites` options + + - Force newline character in formatting expression when not given explicitly + + - Fix a subtle ambiguity: identical rows must be returned when `-s` is applied regardless + of `-f` containing the `-a` VEP tag itself or not. 
+ +* bcftools stats + + - Collect new VAF (variant allele frequency) statistics from FORMAT/AD field + + - When counting transitions/transversions, consider also alternate het genotypes + +* plot-vcfstats + + - Add three new VAF plots + + ## Release 1.17 (21st February 2023) diff --git a/bcftools.h b/bcftools.h index c3f7ded16..bba71e3b6 100644 --- a/bcftools.h +++ b/bcftools.h @@ -1,6 +1,6 @@ /* bcftools.h -- utility function declarations. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. Author: Petr Danecek @@ -49,6 +49,9 @@ void error(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2 // newline will be added by the function. void error_errno(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2); +// For on the fly index creation with --write-index +int init_index(htsFile *fh, bcf_hdr_t *hdr, char *fname, char **idx_fname); + void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd); const char *hts_bcf_wmode(int file_type); const char *hts_bcf_wmode2(int file_type, const char *fname); diff --git a/cigar_state.h b/cigar_state.h index a12a70995..dacac14ac 100644 --- a/cigar_state.h +++ b/cigar_state.h @@ -107,6 +107,12 @@ static inline int cstate_seek_fwd(cigar_state_t *cs, hts_pos_t *pos_ptr, int tri cs->icig++; continue; } + if ( op==BAM_CHARD_CLIP || op==BAM_CPAD ) + { + cs->icig++; + continue; + } + error("FIXME: not ready for CIGAR operator %d\n",op); } // the read starts after pos if ( trim_left ) @@ -175,6 +181,12 @@ static inline int cstate_seek_op_fwd(cigar_state_t *cs, hts_pos_t pos, int seek_ cs->icig++; continue; } + if ( op==BAM_CHARD_CLIP || op==BAM_CPAD ) + { + cs->icig++; + continue; + } + error("FIXME: not ready for CIGAR operator %d\n",op); } return cs->icig < cs->ncig ? 
-1 : -2; } diff --git a/consensus.c b/consensus.c index 397d45f98..2b58670c7 100644 --- a/consensus.c +++ b/consensus.c @@ -54,8 +54,8 @@ #define PICK_SHORT 8 #define PICK_IUPAC 16 -#define TO_UPPER 0 -#define TO_LOWER 1 +#define TO_UPPER 1 +#define TO_LOWER 2 typedef struct { @@ -324,7 +324,7 @@ static void init_region(args_t *args, char *line) { char *ss, *se = line; while ( *se && !isspace(*se) && *se!=':' ) se++; - int from = 0, to = 0; + hts_pos_t from = 0, to = 0; char tmp = 0, *tmp_ptr = NULL; if ( *se ) { @@ -356,7 +356,14 @@ static void init_region(args_t *args, char *line) args->fa_frz_mod = -1; args->fa_case = -1; args->vcf_rbuf.n = 0; - bcf_sr_seek(args->files,line,args->fa_ori_pos); + + kstring_t str = {0,0,0}; + if ( from==0 ) from = 1; + if ( to==0 ) to = HTS_POS_MAX; + ksprintf(&str,"%s:%"PRIhts_pos"-%"PRIhts_pos,line,from,to); + bcf_sr_set_regions(args->files,line,0); + free(str.s); + if ( tmp_ptr ) *tmp_ptr = tmp; fprintf(args->fp_out,">%s%s\n",args->chr_prefix?args->chr_prefix:"",line); if ( args->chain_fname ) @@ -466,25 +473,37 @@ static char *mark_del(char *ref, int rlen, char *alt, int mark) static void mark_ins(char *ref, char *alt, char mark) { int i, nref = strlen(ref), nalt = strlen(alt); - if ( mark=='l' ) + if ( mark==TO_LOWER ) for (i=nref; imark_del = optarg[0]; break; case 2 : - if ( !strcasecmp(optarg,"uc") ) args->mark_ins = 'u'; - else if ( !strcasecmp(optarg,"lc") ) args->mark_ins = 'l'; + if ( !strcasecmp(optarg,"uc") ) args->mark_ins = TO_UPPER; + else if ( !strcasecmp(optarg,"lc") ) args->mark_ins = TO_LOWER; + else if ( !optarg[1] && optarg[0]>32 && optarg[0]<127 ) args->mark_ins = optarg[0]; else error("The argument is not recognised: --mark-ins %s\n",optarg); break; case 3 : - if ( !strcasecmp(optarg,"uc") ) args->mark_snv = 'u'; - else if ( !strcasecmp(optarg,"lc") ) args->mark_snv = 'l'; + if ( !strcasecmp(optarg,"uc") ) args->mark_snv = TO_UPPER; + else if ( !strcasecmp(optarg,"lc") ) args->mark_snv = TO_LOWER; + else if 
( !optarg[1] && optarg[0]>32 && optarg[0]<127 ) args->mark_snv = optarg[0]; else error("The argument is not recognised: --mark-snv %s\n",optarg); break; case 'p': args->chr_prefix = optarg; break; @@ -1211,7 +1231,8 @@ int main_consensus(int argc, char *argv[]) { char *tmp; args->haplotype = strtol(optarg, &tmp, 10); - if ( tmp==optarg || *tmp ) error("Error: Could not parse --haplotype %s, expected numeric argument\n", optarg); + if ( tmp==optarg || (*tmp && strcasecmp(tmp,"pIu")) ) error("Error: Could not parse \"--haplotype %s\", expected number of number followed with \"pIu\"\n", optarg); + if ( *tmp ) args->allele |= PICK_IUPAC; if ( args->haplotype <=0 ) error("Error: Expected positive integer with --haplotype\n"); } break; diff --git a/convert.c b/convert.c index 80e54747d..07ff01862 100644 --- a/convert.c +++ b/convert.c @@ -106,6 +106,7 @@ struct _convert_t char **used_tags_list; int nused_tags; int allow_undef_tags; + int force_newline; uint8_t **subset_samples; }; @@ -648,6 +649,7 @@ static void process_type(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp static void process_line(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { vcf_format1(convert->header, line, str); + if ( str->s[str->l-1]=='\n' ) str->l--; } static void process_chrom_pos_id(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { @@ -1560,7 +1562,6 @@ void convert_destroy(convert_t *convert) int convert_header(convert_t *convert, kstring_t *str) { int i, icol = 0, l_ori = str->l; - bcf_hdr_t *hdr = convert->header; // Supress the header output if LINE is present for (i=0; infmt; i++) @@ -1568,6 +1569,12 @@ int convert_header(convert_t *convert, kstring_t *str) if ( i!=convert->nfmt ) return str->l - l_ori; + // Header formatting becomes problematic when the formatting expression contains a newline. + // Simple cases like + // -f'[%CHROM %POS %SAMPLE\n]' + // can be handled quite easily with has_fmt_newline. 
Note this will not work if multiple newlines + // are present. + int has_fmt_newline = 0; kputc('#', str); for (i=0; infmt; i++) { @@ -1578,18 +1585,25 @@ int convert_header(convert_t *convert, kstring_t *str) while ( convert->fmt[j].is_gt_field ) j++; for (js=0; jsnsamples; js++) { - int ks = convert->samples[js]; for (k=i; kfmt[k].type == T_SEP ) { - if ( convert->fmt[k].key ) kputs(convert->fmt[k].key, str); + if ( convert->fmt[k].key ) + { + char *tmp = convert->fmt[k].key; + while ( *tmp ) + { + if ( *tmp=='\n' ) has_fmt_newline = 1; + else kputc(*tmp,str); + tmp++; + } + } } - else if ( convert->fmt[k].type == T_SAMPLE ) - ksprintf(str, "[%d]%s", ++icol, convert->fmt[k].key); else - ksprintf(str, "[%d]%s:%s", ++icol, hdr->samples[ks], convert->fmt[k].key); + ksprintf(str, "[%d]%s", ++icol, convert->fmt[k].key); } + if ( has_fmt_newline ) break; } i = j-1; continue; @@ -1602,6 +1616,7 @@ int convert_header(convert_t *convert, kstring_t *str) } ksprintf(str, "[%d]%s", ++icol, convert->fmt[i].key); } + if ( has_fmt_newline ) kputc('\n',str); return str->l - l_ori; } @@ -1678,6 +1693,47 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str) return str->l - l_ori; } +static void force_newline_(convert_t *convert) +{ + int i, has_newline = 0; + for (i=0; infmt; i++) + { + if ( !convert->fmt[i].key ) continue; + char *tmp = convert->fmt[i].key; + while (*tmp) + { + if ( *tmp=='\n' ) { has_newline = 1; break; } + tmp++; + } + if ( has_newline ) break; + } + if ( has_newline ) return; + + // A newline is not present, force it. But where to add it? 
+ // Consider + // -f'%CHROM[ %SAMPLE]\n' + // vs + // -f'[%CHROM %SAMPLE\n]' + for (i=0; infmt; i++) + if ( !convert->fmt[i].is_gt_field && convert->fmt[i].key ) break; + + if ( i < convert->nfmt ) + register_tag(convert, "\n", 0, T_SEP); // the first case + else + { + // the second case + i = convert->nfmt - 1; + if ( !convert->fmt[i].key ) + { + convert->fmt[i].key = strdup("\n"); + convert->fmt[i].is_gt_field = 1; + register_tag(convert, NULL, 0, T_SEP); + } + else + register_tag(convert, "\n", 1, T_SEP); + } +} + int convert_set_option(convert_t *convert, enum convert_option opt, ...) { int ret = 0; @@ -1692,6 +1748,10 @@ int convert_set_option(convert_t *convert, enum convert_option opt, ...) case subset_samples: convert->subset_samples = va_arg(args, uint8_t**); break; + case force_newline: + convert->force_newline = va_arg(args, int); + if ( convert->force_newline ) force_newline_(convert); + break; default: ret = -1; } diff --git a/convert.h b/convert.h index 5bbbc2cde..062607093 100644 --- a/convert.h +++ b/convert.h @@ -1,6 +1,6 @@ /* convert.h -- functions for converting between VCF/BCF and related formats. - Copyright (C) 2014-2021 Genome Research Ltd. + Copyright (C) 2014-2023 Genome Research Ltd. Author: Petr Danecek @@ -32,6 +32,7 @@ enum convert_option { allow_undef_tags, subset_samples, + force_newline, }; convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char *str); diff --git a/csq.c b/csq.c index 49812d4de..f619e061a 100644 --- a/csq.c +++ b/csq.c @@ -35,7 +35,7 @@ Read about transcript types here http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html http://www.ensembl.org/info/genome/variation/predicted_data.html - http://www.gencodegenes.org/gencode_biotypes.html + https://www.gencodegenes.org/pages/biotypes.html List of supported biotypes antisense @@ -45,6 +45,7 @@ IG_LV_gene IG_V_gene lincRNA + lncRNA .. 
generic term for 3prime_overlapping_ncRNA, antisense, bidirectional_promoter_lncRNA, lincRNA, macro_lncRNA, non_coding, processed_transcript, sense_intronic, sense_overlapping macro_lncRNA miRNA misc_RNA @@ -52,7 +53,7 @@ Mt_tRNA polymorphic_pseudogene processed_transcript - protein_coding + protein_coding, mRNA ribozyme rRNA sRNA @@ -144,6 +145,7 @@ #include #include #include +#include #include #include #include @@ -153,6 +155,7 @@ #include "kheap.h" #include "smpl_ilist.h" #include "rbuf.h" +#include "gff.h" #ifndef __FUNCTION__ # define __FUNCTION__ __func__ @@ -162,20 +165,8 @@ #define FLT_INCLUDE 1 #define FLT_EXCLUDE 2 -// Definition of splice_region, splice_acceptor and splice_donor -#define N_SPLICE_DONOR 2 -#define N_SPLICE_REGION_EXON 3 -#define N_SPLICE_REGION_INTRON 8 - #define N_REF_PAD 10 // number of bases to avoid boundary effects -#define STRAND_REV 0 -#define STRAND_FWD 1 - -#define TRIM_NONE 0 -#define TRIM_5PRIME 1 -#define TRIM_3PRIME 2 - // How to treat phased/unphased genotypes #define PHASE_REQUIRE 0 // --phase r #define PHASE_MERGE 1 // --phase m @@ -223,6 +214,7 @@ #define CSQ_PRN_STRAND(csq) ((csq)&CSQ_COMPOUND && !((csq)&(CSQ_SPLICE_ACCEPTOR|CSQ_SPLICE_DONOR|CSQ_SPLICE_REGION))) #define CSQ_PRN_TSCRIPT (~(CSQ_INTRON|CSQ_NON_CODING)) +#define CSQ_PRN_NMD (~(CSQ_INTRON|CSQ_NON_CODING)) #define CSQ_PRN_BIOTYPE CSQ_NON_CODING // see kput_vcsq() @@ -254,119 +246,6 @@ const char *csq_strings[] = "start_retained" }; - -// GFF line types -#define GFF_UNKN_LINE 0 -#define GFF_TSCRIPT_LINE 1 -#define GFF_GENE_LINE 2 - - -/* - Genomic features, for fast lookup by position to overlapping features -*/ -#define GF_coding_bit 6 -#define GF_is_coding(x) ((x) & (1<aux) +typedef struct { - uint32_t id; // transcript id - uint32_t beg,end; // transcript's beg and end coordinate (ref strand, 0-based, inclusive) - uint32_t strand:1, // STRAND_REV or STRAND_FWD - ncds:31, // number of exons - mcds; - gf_cds_t **cds; // ordered list of exons char *ref; // 
reference sequence, padded with N_REF_PAD bases on both ends char *sref; // spliced reference sequence, padded with N_REF_PAD bases on both ends hap_node_t *root; // root of the haplotype tree hap_node_t **hap; // pointer to haplotype leaves, two for each sample int nhap, nsref; // number of haplotypes and length of sref, including 2*N_REF_PAD - uint32_t trim:2, // complete, 5' or 3' trimmed, see TRIM_* types - type:30; // one of GF_* types - gf_gene_t *gene; -}; -static inline int cmp_tscript(tscript_t **a, tscript_t **b) +} +tscript_t; +static inline int cmp_tscript(gf_tscript_t **a, gf_tscript_t **b) { return ( (*a)->end < (*b)->end ) ? 1 : 0; } -KHEAP_INIT(trhp, tscript_t*, cmp_tscript) +KHEAP_INIT(trhp, gf_tscript_t*, cmp_tscript) typedef khp_trhp_t tr_heap_t; typedef struct { @@ -494,7 +366,7 @@ typedef struct { int mstack; hstack_t *stack; - tscript_t *tr; // tr->ref: spliced transcript on ref strand + gf_tscript_t *tr; // tr->ref: spliced transcript on ref strand kstring_t sseq; // spliced haplotype sequence on ref strand kstring_t tseq; // the variable part of translated haplotype transcript, coding strand kstring_t tref; // the variable part of translated reference transcript, coding strand @@ -503,77 +375,20 @@ typedef struct } hap_t; - -/* - Helper structures, only for initialization - - ftr_t - temporary list of all exons, CDS, UTRs -*/ -KHASH_MAP_INIT_INT(int2tscript, tscript_t*) -KHASH_MAP_INIT_INT(int2gene, gf_gene_t*) -typedef struct -{ - int type; // GF_CDS, GF_EXON, GF_5UTR, GF_3UTR - uint32_t beg; - uint32_t end; - uint32_t trid; - uint32_t strand:1; // STRAND_REV,STRAND_FWD - uint32_t phase:2; // 0, 1, 2, or 3 for unknown - uint32_t iseq:29; -} -ftr_t; -/* - Mapping from GFF ID string (such as ENST00000450305 or Zm00001d027230_P001) - to integer id. To keep the memory requirements low, the original version - relied on IDs in the form of a string prefix and a numerical id. 
However, - it turns out that this assumption is not valid for some ensembl GFFs, see - for example Zea_mays.AGPv4.36.gff3.gz - */ -typedef struct -{ - void *str2id; // khash_str2int - int nstr, mstr; - char **str; // numeric id to string -} -id_tbl_t; -typedef struct -{ - // all exons, CDS, UTRs - ftr_t *ftr; - int nftr, mftr; - - // mapping from gene id to gf_gene_t - kh_int2gene_t *gid2gene; - - // mapping from transcript id to tscript, for quick CDS anchoring - kh_int2tscript_t *id2tr; - - // sequences - void *seq2int; // str2int hash - char **seq; - int nseq, mseq; - - // ignored biotypes - void *ignored_biotypes; - - id_tbl_t gene_ids; // temporary table for mapping between gene id (eg. Zm00001d027245) and a numeric idx -} -aux_t; - typedef struct _args_t { // the main regidx lookups, from chr:beg-end to overlapping features and // index iterator + gff_t *gff; regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript; regitr_t *itr; - // temporary structures, deleted after initializtion - aux_t init; - // text tab-delimited output (out) or vcf/bcf output (out_fh) FILE *out; htsFile *out_fh; + char *index_fn; + int write_index; + char *dump_gff; // vcf bcf_srs_t *sr; @@ -597,6 +412,13 @@ typedef struct _args_t int ncsq2_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values) int ncsq2_small_warned; int brief_predictions; + int unify_chr_names; + char *chr_name; + int mchr_name; + struct { + int unknown_chr,unknown_tscript_biotype,unknown_strand,unknown_phase,duplicate_id; + int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds; + } warned; int rid; // current chromosome tr_heap_t *active_tr; // heap of active transcripts for quick flushing @@ -604,11 +426,10 @@ typedef struct _args_t vbuf_t **vcf_buf; // buffered VCF lines to annotate with CSQ and flush rbuf_t vcf_rbuf; // round buffer indexes to vcf_buf kh_pos2vbuf_t *pos2vbuf; // fast lookup of buffered lines by position 
- tscript_t **rm_tr; // buffer of transcripts to clean + gf_tscript_t **rm_tr; // buffer of transcripts to clean int nrm_tr, mrm_tr; csq_t *csq_buf; // pool of csq not managed by hap_node_t, i.e. non-CDS csqs int ncsq_buf, mcsq_buf; - id_tbl_t tscript_ids; // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx int force; // force run under various conditions. Currently only to skip out-of-phase transcripts int n_threads; // extra compression/decompression threads @@ -645,818 +466,6 @@ const uint8_t cnt4[] = #define dna2aa(x) gencode[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ] #define cdna2aa(x) gencode[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ] -static const char *gf_strings_noncoding[] = -{ - "MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript", - "antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping", - "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene", - "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene", - "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene", - "transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene", - "translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene", - "LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", "vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf" -}; -static const char *gf_strings_coding[] = { "protein_coding", "polymorphic_pseudogene", "IG_C", "IG_D", "IG_J", "IG_LV", "IG_V", "TR_C", "TR_D", "TR_J", "TR_V", "NMD", "non_stop_decay"}; -static const char *gf_strings_special[] = { "CDS", "exon", "3_prime_UTR", "5_prime_UTR" }; - -const char 
*gf_type2gff_string(int type) -{ - if ( !GF_is_coding(type) ) - { - if ( type < (1<init; - char c = chr_end[1]; - chr_end[1] = 0; - int iseq; - if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 ) - { - // check for possible mismatch in chromosome naming convention such as chrX vs X - char *new_chr = NULL; - if ( faidx_has_seq(args->fai,chr_beg) ) - new_chr = strdup(chr_beg); // valid chr name, the same in gff and faidx - else - { - int len = strlen(chr_beg); - if ( !strncmp("chr",chr_beg,3) && len>3 ) - new_chr = strdup(chr_beg+3); // gff has the prefix, faidx does not - else - { - new_chr = malloc(len+4); // gff does not have the prefix, faidx has - memcpy(new_chr,"chr",3); - memcpy(new_chr+3,chr_beg,len); - new_chr[len+3] = 0; - } - if ( !faidx_has_seq(args->fai,new_chr) ) // modification did not help, this sequence is not in fai - { - static int unkwn_chr_warned = 0; - if ( !unkwn_chr_warned && args->verbosity>0 ) - fprintf(stderr,"Warning: GFF chromosome \"%s\" not part of the reference genome\n",chr_beg); - unkwn_chr_warned = 1; - free(new_chr); - new_chr = strdup(chr_beg); // use the original sequence name - } - } - if ( khash_str2int_get(aux->seq2int, new_chr, &iseq)!=0 ) - { - hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq); - aux->seq[aux->nseq] = new_chr; - iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]); - aux->nseq++; - assert( aux->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq - } - else - free(new_chr); - } - chr_end[1] = c; - return iseq; -} -static inline char *gff_skip(const char *line, char *ss) -{ - while ( *ss && *ss!='\t' ) ss++; - if ( !*ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); - return ss+1; -} -static inline void gff_parse_chr(const char *line, char **chr_beg, char **chr_end) -{ - char *se = (char*) line; - while ( *se && *se!='\t' ) se++; - if ( !*se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); - *chr_beg = (char*) 
line; - *chr_end = se-1; -} -static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg, uint32_t *end) -{ - char *se = ss; - *beg = strtol(ss, &se, 10) - 1; - if ( ss==se ) error("[%s:%d %s] Could not parse the line:\n\t%s\n\t%s\n",__FILE__,__LINE__,__FUNCTION__,line,ss); - ss = se+1; - *end = strtol(ss, &se, 10) - 1; - if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); - return se+1; -} -static void gff_id_init(id_tbl_t *tbl) -{ - memset(tbl, 0, sizeof(*tbl)); - tbl->str2id = khash_str2int_init(); -} -static void gff_id_destroy(id_tbl_t *tbl) -{ - khash_str2int_destroy_free(tbl->str2id); - free(tbl->str); -} -// returns 0 on success, -1 on failure -static inline int gff_id_parse(id_tbl_t *tbl, const char *needle, char *ss, uint32_t *id_ptr) -{ - ss = strstr(ss,needle); // e.g. "ID=transcript:" - if ( !ss ) return -1; - ss += strlen(needle); - - char *se = ss; - while ( *se && *se!=';' && !isspace(*se) ) se++; - char tmp = *se; - *se = 0; - - int id; - if ( khash_str2int_get(tbl->str2id, ss, &id) < 0 ) - { - id = tbl->nstr++; - hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str); - tbl->str[id] = strdup(ss); - khash_str2int_set(tbl->str2id, tbl->str[id], id); - } - *se = tmp; - *id_ptr = id; - return 0; -} -static inline int gff_parse_type(char *line) -{ - line = strstr(line,"ID="); - if ( !line ) return -1; - line += 3; - if ( !strncmp(line,"transcript:",11) ) return GFF_TSCRIPT_LINE; - else if ( !strncmp(line,"gene:",5) ) return GFF_GENE_LINE; - return -1; -} -static inline int gff_parse_biotype(char *_line) -{ - char *line = strstr(_line,"biotype="); - if ( !line ) return -1; - - line += 8; - switch (*line) - { - case 'p': - if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING; - else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE; - else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT; - else if ( 
!strncmp(line,"processed_pseudogene",20) ) return GF_PROCESSED_PSEUDOGENE; - else if ( !strncmp(line,"polymorphic_pseudogene",22) ) return GF_POLYMORPHIC_PSEUDOGENE; - break; - case 'a': - if ( !strncmp(line,"artifact",8) ) return GF_ARTIFACT; - else if ( !strncmp(line,"antisense",9) ) return GF_ANTISENSE; - else if ( !strncmp(line,"ambiguous_orf",13) ) return GF_AMBIGUOUS_ORF; - break; - case 'I': - if ( !strncmp(line,"IG_C_gene",9) ) return GF_IG_C; - else if ( !strncmp(line,"IG_D_gene",9) ) return GF_IG_D; - else if ( !strncmp(line,"IG_J_gene",9) ) return GF_IG_J; - else if ( !strncmp(line,"IG_LV_gene",10) ) return GF_IG_LV; - else if ( !strncmp(line,"IG_V_gene",9) ) return GF_IG_V; - else if ( !strncmp(line,"IG_pseudogene",13) ) return GF_IG_PSEUDOGENE; - else if ( !strncmp(line,"IG_C_pseudogene",15) ) return GF_IG_C_PSEUDOGENE; - else if ( !strncmp(line,"IG_J_pseudogene",15) ) return GF_IG_J_PSEUDOGENE; - else if ( !strncmp(line,"IG_V_pseudogene",15) ) return GF_IG_V_PSEUDOGENE; - break; - case 'T': - if ( !strncmp(line,"TR_C_gene",9) ) return GF_TR_C; - else if ( !strncmp(line,"TR_D_gene",9) ) return GF_TR_D; - else if ( !strncmp(line,"TR_J_gene",9) ) return GF_TR_J; - else if ( !strncmp(line,"TR_V_gene",9) ) return GF_TR_V; - else if ( !strncmp(line,"TR_V_pseudogene",15) ) return GF_TR_V_PSEUDOGENE; - else if ( !strncmp(line,"TR_J_pseudogene",15) ) return GF_TR_J_PSEUDOGENE; - break; - case 'M': - if ( !strncmp(line,"Mt_tRNA_pseudogene",18) ) return GF_MT_tRNA_PSEUDOGENE; - else if ( !strncmp(line,"Mt_tRNA",7) ) return GF_MT_tRNA; - else if ( !strncmp(line,"Mt_rRNA",7) ) return GF_MT_tRNA; - break; - case 'l': - if ( !strncmp(line,"lincRNA",7) ) return GF_lincRNA; - break; - case 'm': - if ( !strncmp(line,"macro_lncRNA",12) ) return GF_macro_lncRNA; - else if ( !strncmp(line,"misc_RNA_pseudogene",19) ) return GF_misc_RNA_PSEUDOGENE; - else if ( !strncmp(line,"miRNA_pseudogene",16) ) return GF_miRNA_PSEUDOGENE; - else if ( !strncmp(line,"miRNA",5) ) return 
GF_miRNA; - else if ( !strncmp(line,"misc_RNA",8) ) return GF_MISC_RNA; - break; - case 'r': - if ( !strncmp(line,"rRNA",4) ) return GF_rRNA; - else if ( !strncmp(line,"ribozyme",8) ) return GF_RIBOZYME; - else if ( !strncmp(line,"retained_intron",15) ) return GF_RETAINED_INTRON; - else if ( !strncmp(line,"retrotransposed",15) ) return GF_RETROTRANSPOSED; - break; - case 's': - if ( !strncmp(line,"snRNA",5) ) return GF_snRNA; - else if ( !strncmp(line,"sRNA",4) ) return GF_sRNA; - else if ( !strncmp(line,"scRNA",5) ) return GF_scRNA; - else if ( !strncmp(line,"scaRNA",6) ) return GF_scaRNA; - else if ( !strncmp(line,"snoRNA",6) ) return GF_snoRNA; - else if ( !strncmp(line,"sense_intronic",14) ) return GF_SENSE_INTRONIC; - else if ( !strncmp(line,"sense_overlapping",17) ) return GF_SENSE_OVERLAPPING; - break; - case 't': - if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE; - else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE; - else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE; - else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE; - else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE; - else if ( !strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE; - break; - case 'n': - if ( !strncmp(line,"nonsense_mediated_decay",23) ) return GF_NMD; - else if ( !strncmp(line,"non_stop_decay",14) ) return GF_NON_STOP_DECAY; - break; - case 'k': - if ( !strncmp(line,"known_ncrna",11) ) return GF_KNOWN_NCRNA; - break; - case 'u': - if ( !strncmp(line,"unitary_pseudogene",18) ) return GF_UNITARY_PSEUDOGENE; - else if ( !strncmp(line,"unprocessed_pseudogene",22) ) return GF_UNPROCESSED_PSEUDOGENE; - break; - case 'L': - if ( !strncmp(line,"LRG_gene",8) ) return GF_LRG_GENE; - break; - case '3': - 
if ( !strncmp(line,"3prime_overlapping_ncRNA",24) ) return GF_3PRIME_OVERLAPPING_ncRNA; - break; - case 'd': - if ( !strncmp(line,"disrupted_domain",16) ) return GF_DISRUPTED_DOMAIN; - break; - case 'v': - if ( !strncmp(line,"vaultRNA",8) ) return GF_vaultRNA; - break; - case 'b': - if ( !strncmp(line,"bidirectional_promoter_lncRNA",29) ) return GF_BIDIRECTIONAL_PROMOTER_lncRNA; - break; - } - return 0; -} -static inline int gff_ignored_biotype(args_t *args, char *ss) -{ - ss = strstr(ss,"biotype="); - if ( !ss ) return 0; - - ss += 8; - char *se = ss, tmp; - while ( *se && *se!=';' ) se++; - tmp = *se; - *se = 0; - - char *key = ss; - int n = 0; - if ( khash_str2int_get(args->init.ignored_biotypes, ss, &n)!=0 ) key = strdup(ss); - khash_str2int_set(args->init.ignored_biotypes, key, n+1); - - *se = tmp; - return 1; -} -gf_gene_t *gene_init(aux_t *aux, uint32_t gene_id) -{ - khint_t k = kh_get(int2gene, aux->gid2gene, (int)gene_id); - gf_gene_t *gene = (k == kh_end(aux->gid2gene)) ? NULL : kh_val(aux->gid2gene, k); - if ( !gene ) - { - gene = (gf_gene_t*) calloc(1,sizeof(gf_gene_t)); - int ret; - k = kh_put(int2gene, aux->gid2gene, (int)gene_id, &ret); - kh_val(aux->gid2gene,k) = gene; - } - return gene; -} -void gff_parse_transcript(args_t *args, const char *line, char *ss, ftr_t *ftr) -{ - aux_t *aux = &args->init; - int biotype = gff_parse_biotype(ss); - if ( biotype <= 0 ) - { - if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(stderr,"ignored transcript, unknown biotype: %s\n",line); - return; - } - - // create a mapping from transcript_id to gene_id - uint32_t trid, gene_id; - if ( gff_id_parse(&args->tscript_ids, "ID=transcript:", ss, &trid) ) - { - if ( gff_id_parse(&args->tscript_ids, "ID=", ss, &trid) ) - error("[%s:%d %s] Could not parse the line, neither \"ID=transcript:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line); - static int warned = 0; - if ( !warned && args->verbosity > 0 ) - { - 
fprintf(stderr,"Warning: non-standard transcript ID notation in the GFF, expected \"ID=transcript:XXX\", found %s\n",line); - warned = 1; - } - } - if ( gff_id_parse(&args->init.gene_ids, "Parent=gene:", ss, &gene_id) ) - { - if ( gff_id_parse(&args->init.gene_ids, "Parent=", ss, &gene_id) ) - error("[%s:%d %s] Could not parse the line, neither \"Parent=gene:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line); - static int warned = 0; - if ( !warned && args->verbosity > 0 ) - { - fprintf(stderr,"Warning: non-standard transcript Parent notation in the GFF, expected \"Parent=gene:XXX\", found %s\n",line); - warned = 1; - } - } - - tscript_t *tr = (tscript_t*) calloc(1,sizeof(tscript_t)); - tr->id = trid; - tr->strand = ftr->strand; - tr->gene = gene_init(aux, gene_id); - tr->type = biotype; - tr->beg = ftr->beg; - tr->end = ftr->end; - - khint_t k; - int ret; - k = kh_put(int2tscript, aux->id2tr, (int)trid, &ret); - kh_val(aux->id2tr,k) = tr; -} -void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, char *chr_end, ftr_t *ftr) -{ - int biotype = gff_parse_biotype(ss); - if ( biotype <= 0 ) - { - if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(stderr,"ignored gene, unknown biotype: %s\n",line); - return; - } - - aux_t *aux = &args->init; - - // substring search for "ID=gene:ENSG00000437963" - uint32_t gene_id; - if ( gff_id_parse(&aux->gene_ids, "ID=gene:", ss, &gene_id) ) - { - if ( gff_id_parse(&aux->gene_ids, "ID=", ss, &gene_id) ) - error("[%s:%d %s] Could not parse the line, neither \"ID=gene:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line); - static int warned = 0; - if ( !warned && args->verbosity > 0 ) - { - fprintf(stderr,"Warning: non-standard gene ID notation in the GFF, expected \"ID=gene:XXX\", found %s\n",line); - warned = 1; - } - } - - gf_gene_t *gene = gene_init(aux, gene_id); - assert( !gene->name ); // the gene_id should be unique - - gene->iseq 
= feature_set_seq(args, chr_beg,chr_end); - - // substring search for "Name=OR4F5" - ss = strstr(chr_end+2,"Name="); - if ( ss ) - { - ss += 5; - char *se = ss; - while ( *se && *se!=';' && !isspace(*se) ) se++; - gene->name = (char*) malloc(se-ss+1); - memcpy(gene->name,ss,se-ss); - gene->name[se-ss] = 0; - } - else - gene->name = strdup(aux->gene_ids.str[gene_id]); // Name= field is not present, use the gene ID instead -} -int gff_parse(args_t *args, char *line, ftr_t *ftr) -{ - // - skip empty lines and commented lines - // - columns - // 1. chr - // 2. - // 3. CDS, transcript, gene, ... - // 4-5. beg,end - // 6. - // 7. strand - // 8. phase - // 9. Parent=transcript:ENST(\d+);ID=... etc - - char *ss = line; - if ( !*ss ) return -1; // skip blank lines - if ( *ss=='#' ) return -1; // skip comments - - char *chr_beg, *chr_end; - gff_parse_chr(line, &chr_beg, &chr_end); - ss = gff_skip(line, chr_end + 2); - - // 3. column: is this a CDS, transcript, gene, etc. - if ( !strncmp("exon\t",ss,5) ) { ftr->type = GF_EXON; ss += 5; } - else if ( !strncmp("CDS\t",ss,4) ) { ftr->type = GF_CDS; ss += 4; } - else if ( !strncmp("three_prime_UTR\t",ss,16) ) { ftr->type = GF_UTR3; ss += 16; } - else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; } - else - { - int type = GFF_UNKN_LINE; - if ( !strncmp("gene\t",ss,4) ) type = GFF_GENE_LINE; - else if ( !strncmp("transcript\t",ss,4) ) type = GFF_TSCRIPT_LINE; - ss = gff_skip(line, ss); - ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end); - ss = gff_skip(line, ss); - if ( type==GFF_UNKN_LINE ) type = gff_parse_type(ss); // determine type from ID=transcript: or ID=gene: - if ( type!=GFF_TSCRIPT_LINE && type!=GFF_GENE_LINE ) - { - // we ignore these, debug print to see new types: - ss = strstr(ss,"ID="); - if ( !ss ) return -1; // no ID, ignore the line - if ( !strncmp("chromosome",ss+3,10) ) return -1; - if ( !strncmp("supercontig",ss+3,11) ) return -1; - if ( args->verbosity > 0 ) 
fprintf(stderr,"ignored: %s\n", line); - return -1; - } - - // 7. column: strand - if ( *ss == '+' ) ftr->strand = STRAND_FWD; - else if ( *ss == '-' ) ftr->strand = STRAND_REV; - else error("Unknown strand: %c .. %s\n", *ss,ss); - - if ( type==GFF_TSCRIPT_LINE ) - gff_parse_transcript(args, line, ss, ftr); - else - gff_parse_gene(args, line, ss, chr_beg, chr_end, ftr); - - return -1; - } - ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end); - ss = gff_skip(line, ss); - - // 7. column: strand - if ( *ss == '+' ) ftr->strand = STRAND_FWD; - else if ( *ss == '-' ) ftr->strand = STRAND_REV; - else { if ( args->verbosity > 0 ) fprintf(stderr,"Skipping unknown strand: %c\n", *ss); return -1; } - ss += 2; - - // 8. column: phase (codon offset) - if ( *ss == '0' ) ftr->phase = 0; - else if ( *ss == '1' ) ftr->phase = 1; - else if ( *ss == '2' ) ftr->phase = 2; - else if ( *ss == '.' ) ftr->phase = CDS_PHASE_UNKN; // exons and even CDS in some GFFs do not have phase - else { if ( args->verbosity > 0 ) fprintf(stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; } - ss += 2; - - // substring search for "Parent=transcript:ENST00000437963" - if ( gff_id_parse(&args->tscript_ids, "Parent=transcript:", ss, &ftr->trid) ) - { - if ( gff_id_parse(&args->tscript_ids, "Parent=", ss, &ftr->trid) ) - error("[%s:%d %s] Could not parse the line, neither \"Parent=transcript:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line); - static int warned = 0; - if ( !warned && args->verbosity > 0 ) - { - fprintf(stderr,"Warning: non-standard gene Parent notation in the GFF, expected \"Parent=transcript:XXX\", found %s\n",line); - warned = 1; - } - } - - ftr->iseq = feature_set_seq(args, chr_beg,chr_end); - return 0; -} - -static int cmp_cds_ptr(const void *a, const void *b) -{ - // comparison function for qsort of transcripts's CDS - if ( (*((gf_cds_t**)a))->beg < (*((gf_cds_t**)b))->beg ) return -1; - if ( (*((gf_cds_t**)a))->beg > 
(*((gf_cds_t**)b))->beg ) return 1; - return 0; -} - -static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char **chr_end) -{ - *chr_beg = *chr_end = aux->seq[iseq]; - while ( (*chr_end)[1] ) (*chr_end)++; -} -tscript_t *tscript_init(aux_t *aux, uint32_t trid) -{ - khint_t k = kh_get(int2tscript, aux->id2tr, (int)trid); - tscript_t *tr = (k == kh_end(aux->id2tr)) ? NULL : kh_val(aux->id2tr, k); - assert( tr ); - return tr; -} -void register_cds(args_t *args, ftr_t *ftr) -{ - // Make the CDS searchable via idx_cds. Note we do not malloc tr->cds just yet. - // ftr is the result of parsing a gff CDS line - aux_t *aux = &args->init; - - tscript_t *tr = tscript_init(aux, ftr->trid); - if ( tr->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. %d vs %d\n",ftr->trid,tr->strand,ftr->strand); - - gf_cds_t *cds = (gf_cds_t*) malloc(sizeof(gf_cds_t)); - cds->tr = tr; - cds->beg = ftr->beg; - cds->len = ftr->end - ftr->beg + 1; - cds->icds = 0; // to keep valgrind on mac happy - cds->phase = ftr->phase; - - hts_expand(gf_cds_t*,tr->ncds+1,tr->mcds,tr->cds); - tr->cds[tr->ncds++] = cds; -} -void register_utr(args_t *args, ftr_t *ftr) -{ - aux_t *aux = &args->init; - gf_utr_t *utr = (gf_utr_t*) malloc(sizeof(gf_utr_t)); - utr->which = ftr->type==GF_UTR3 ? 
prime3 : prime5; - utr->beg = ftr->beg; - utr->end = ftr->end; - utr->tr = tscript_init(aux, ftr->trid); - - char *chr_beg, *chr_end; - chr_beg_end(&args->init, utr->tr->gene->iseq, &chr_beg, &chr_end); - regidx_push(args->idx_utr, chr_beg,chr_end, utr->beg,utr->end, &utr); -} -void register_exon(args_t *args, ftr_t *ftr) -{ - aux_t *aux = &args->init; - gf_exon_t *exon = (gf_exon_t*) malloc(sizeof(gf_exon_t)); - exon->beg = ftr->beg; - exon->end = ftr->end; - exon->tr = tscript_init(aux, ftr->trid); - - char *chr_beg, *chr_end; - chr_beg_end(&args->init, exon->tr->gene->iseq, &chr_beg, &chr_end); - regidx_push(args->idx_exon, chr_beg,chr_end, exon->beg - N_SPLICE_REGION_INTRON, exon->end + N_SPLICE_REGION_INTRON, &exon); -} - -void tscript_init_cds(args_t *args) -{ - aux_t *aux = &args->init; - - // Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds) - khint_t k; - int warn_phase_unkn = 0; - for (k=0; kid2tr); k++) - { - if ( !kh_exist(aux->id2tr, k) ) continue; - tscript_t *tr = (tscript_t*) kh_val(aux->id2tr, k); - - // position-to-tscript lookup - char *chr_beg, *chr_end; - chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end); - regidx_push(args->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr); - - if ( !tr->ncds ) continue; // transcript with no CDS - - // sort CDs - qsort(tr->cds, tr->ncds, sizeof(gf_cds_t*), cmp_cds_ptr); - - // trim non-coding start - int i, len = 0; - if ( tr->strand==STRAND_FWD ) - { - if ( tr->cds[0]->phase != CDS_PHASE_UNKN ) - { - if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME; - tr->cds[0]->beg += tr->cds[0]->phase; - tr->cds[0]->len -= tr->cds[0]->phase; - tr->cds[0]->phase = 0; - } - - // sanity check phase; the phase number in gff tells us how many bases to skip in this - // feature to reach the first base of the next codon - int tscript_ok = 1; - for (i=0; incds; i++) - { - if ( tr->cds[i]->phase == CDS_PHASE_UNKN ) - { - warn_phase_unkn = 1; - len += tr->cds[i]->len; - continue; - } 
- int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0; - if ( phase!=len%3 ) - { - if ( args->force ) - { - if ( args->verbosity > 0 ) - fprintf(stderr,"Warning: the GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n", - args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); - tscript_ok = 0; - break; - } - error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n", - args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); - } - len += tr->cds[i]->len; - } - if ( !tscript_ok ) continue; // skip this transcript - } - else - { - if ( tr->cds[tr->ncds-1]->phase != CDS_PHASE_UNKN ) - { - // Check that the phase is not bigger than CDS length. Curiously, this can really happen, - // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141 - // todo: the same for the fwd strand - i = tr->ncds - 1; - int phase = tr->cds[i]->phase; - if ( phase ) tr->trim |= TRIM_5PRIME; - while ( i>=0 && phase > tr->cds[i]->len ) - { - phase -= tr->cds[i]->len; - tr->cds[i]->phase = 0; - tr->cds[i]->len = 0; - i--; - } - tr->cds[i]->len -= tr->cds[i]->phase; - tr->cds[i]->phase = 0; - } - - // sanity check phase - int tscript_ok = 1; - for (i=tr->ncds-1; i>=0; i--) - { - if ( tr->cds[i]->phase == CDS_PHASE_UNKN ) - { - warn_phase_unkn = 1; - len += tr->cds[i]->len; - continue; - } - int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0; - if ( phase!=len%3) - { - if ( args->force ) - { - if ( args->verbosity > 0 ) - fprintf(stderr,"Warning: the GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n", - args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); - tscript_ok = 0; - break; - } - error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). 
Use the --force option to proceed anyway (at your own risk).\n", - args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); - } - len += tr->cds[i]->len; - } - if ( !tscript_ok ) continue; // skip this transcript - } - - // set len. At the same check that CDS within a transcript do not overlap - len = 0; - for (i=0; incds; i++) - { - tr->cds[i]->icds = i; - len += tr->cds[i]->len; - if ( !i ) continue; - - gf_cds_t *a = tr->cds[i-1]; - gf_cds_t *b = tr->cds[i]; - if ( a->beg + a->len - 1 >= b->beg ) - { - if ( args->force ) - { - fprintf(stderr,"Warning: GFF contains overlapping CDS %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32".\n", - args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len); - } - else - error("Error: CDS overlap in the transcript %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32", is this intended (e.g. ribosomal slippage)?\n" - " Use the --force option to override (at your own risk).\n", - args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len); - } - } - if ( len%3 != 0 ) - { - // There are 13k transcripts with incomplete 3' CDS. See for example ENST00000524289 - // http://sep2015.archive.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000155868;r=5:157138846-157159019;t=ENST00000524289 - // Also, the incomplete CDS can be too short (1 or 2bp), so it is not enough to trim the last one. - - tr->trim |= TRIM_3PRIME; - if ( tr->strand==STRAND_FWD ) - { - i = tr->ncds - 1; - while ( i>=0 && len%3 ) - { - int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len; - tr->cds[i]->len -= dlen; - len -= dlen; - i--; - } - } - else - { - i = 0; - while ( incds && len%3 ) - { - int dlen = tr->cds[i]->len >= len%3 ? 
len%3 : tr->cds[i]->len; - tr->cds[i]->len -= dlen; - tr->cds[i]->beg += dlen; - len -= dlen; - i++; - } - } - } - - // set CDS offsets and insert into regidx - len=0; - for (i=0; incds; i++) - { - tr->cds[i]->pos = len; - len += tr->cds[i]->len; - regidx_push(args->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]); - } - } - if ( warn_phase_unkn && args->verbosity > 0 ) - fprintf(stderr,"Warning: encountered CDS with phase column unset, could not verify reading frame\n"); -} - -void regidx_free_gf(void *payload) { free(*((gf_cds_t**)payload)); } -void regidx_free_tscript(void *payload) { tscript_t *tr = *((tscript_t**)payload); free(tr->cds); free(tr); } - -void init_gff(args_t *args) -{ - aux_t *aux = &args->init; - aux->seq2int = khash_str2int_init(); // chrom's numeric id - aux->gid2gene = kh_init(int2gene); // gene id to gf_gene_t, for idx_gene - aux->id2tr = kh_init(int2tscript); // transcript id to tscript_t - args->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(tscript_t*), NULL); - aux->ignored_biotypes = khash_str2int_init(); - gff_id_init(&aux->gene_ids); - gff_id_init(&args->tscript_ids); - - // parse gff - kstring_t str = {0,0,0}; - htsFile *fp = hts_open(args->gff_fname,"r"); - if ( !fp ) error("Failed to read %s\n", args->gff_fname); - while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 ) - { - hts_expand(ftr_t, aux->nftr+1, aux->mftr, aux->ftr); - int ret = gff_parse(args, str.s, aux->ftr + aux->nftr); - if ( !ret ) aux->nftr++; - } - free(str.s); - if ( hts_close(fp)!=0 ) error("Close failed: %s\n", args->gff_fname); - - - // process gff information: connect CDS and exons to transcripts - args->idx_cds = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_cds_t*), NULL); - args->idx_utr = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_utr_t*), NULL); - args->idx_exon = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_exon_t*), NULL); - args->itr = regitr_init(NULL); - - int i; - for (i=0; 
inftr; i++) - { - ftr_t *ftr = &aux->ftr[i]; - - // check whether to keep this feature: is there a mapping trid -> gene_id -> gene? - khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid); - if ( k==kh_end(aux->id2tr) ) continue; // no such transcript - - tscript_t *tr = kh_val(aux->id2tr,k); - if ( !tr->gene->name ) - { - // not a supported biotype (e.g. gene:pseudogene, transcript:processed_transcript) - regidx_free_tscript(&tr); - kh_del(int2tscript, aux->id2tr,k); - continue; - } - - // populate regidx by category: - // ftr->type .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5 - // gene->type .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ... - if ( ftr->type==GF_CDS ) register_cds(args, ftr); - else if ( ftr->type==GF_EXON ) register_exon(args, ftr); - else if ( ftr->type==GF_UTR5 ) register_utr(args, ftr); - else if ( ftr->type==GF_UTR3 ) register_utr(args, ftr); - else - error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,args->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type)); - } - tscript_init_cds(args); - - if ( args->verbosity > 0 ) - { - fprintf(stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n", - regidx_nregs(args->idx_tscript), - regidx_nregs(args->idx_exon), - regidx_nregs(args->idx_cds), - regidx_nregs(args->idx_utr)); - } - if ( !regidx_nregs(args->idx_tscript) ) - fprintf(stderr, - "Warning: No usable transcripts found, likely a failure to parse a non-standard GFF file. Please check if the misc/gff2gff\n" - " or misc/gff2gff.py script can fix the problem (both do different things). 
See also the man page for the description\n" - " of the expected format http://samtools.github.io/bcftools/bcftools-man.html#csq\n"); - - free(aux->ftr); - khash_str2int_destroy_free(aux->seq2int); - // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene); - kh_destroy(int2tscript,aux->id2tr); - free(aux->seq); - gff_id_destroy(&aux->gene_ids); - - if ( args->verbosity > 0 && khash_str2int_size(aux->ignored_biotypes) ) - { - khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes; - fprintf(stderr,"Ignored the following biotypes:\n"); - for (i = kh_begin(ign); i < kh_end(ign); i++) - { - if ( !kh_exist(ign,i)) continue; - const char *biotype = kh_key(ign,i); - if ( !strcmp(biotype,"TCE") ) biotype = "TCE (\"To be Experimentally Confirmed\")"; - fprintf(stderr,"\t%dx\t.. %s\n", kh_value(ign,i), biotype); - } - } - khash_str2int_destroy_free(aux->ignored_biotypes); -} - static inline int ncsq2_to_nfmt(int ncsq2) { return 1 + (ncsq2 - 1) / 30; @@ -1474,8 +483,17 @@ void init_data(args_t *args) args->fai = fai_load(args->fa_fname); if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname); - if ( args->verbosity > 0 ) fprintf(stderr,"Parsing %s ...\n", args->gff_fname); - init_gff(args); + args->gff = gff_init(args->gff_fname); + gff_set(args->gff,verbosity,args->verbosity); + gff_set(args->gff,strip_chr_names,args->unify_chr_names); + gff_set(args->gff,force_out_of_phase,args->force); + gff_set(args->gff,dump_fname,args->dump_gff); + gff_parse(args->gff); + args->idx_cds = gff_get(args->gff,idx_cds); + args->idx_utr = gff_get(args->gff,idx_utr); + args->idx_exon = gff_get(args->gff,idx_exon); + args->idx_tscript = gff_get(args->gff,idx_tscript); + args->itr = regitr_init(NULL); args->rid = -1; @@ -1536,6 +554,7 @@ void init_data(args_t *args) if ( args->hdr_nsmpl ) bcf_hdr_printf(args->hdr,"##FORMAT=",args->bcsq_tag); if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to 
%s\n", __func__,args->output_fname?args->output_fname:"standard output"); + if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); } if ( args->verbosity > 0 ) fprintf(stderr,"Calling...\n"); } @@ -1547,21 +566,8 @@ void destroy_data(args_t *args) "Note: Some samples had too many consequences to be represented in %d bytes. If you need to record them all,\n" " the limit can be increased by running with `--ncsq %d`.\n",ncsq2_to_nfmt(args->ncsq2_max)/8,1+args->ncsq2_small_warned/2); - regidx_destroy(args->idx_cds); - regidx_destroy(args->idx_utr); - regidx_destroy(args->idx_exon); - regidx_destroy(args->idx_tscript); regitr_destroy(args->itr); - - khint_t k,i,j; - for (k=0; kinit.gid2gene); k++) - { - if ( !kh_exist(args->init.gid2gene, k) ) continue; - gf_gene_t *gene = (gf_gene_t*) kh_val(args->init.gid2gene, k); - free(gene->name); - free(gene); - } - kh_destroy(int2gene,args->init.gid2gene); + gff_destroy(args->gff); if ( args->filter ) filter_destroy(args->filter); @@ -1569,9 +575,20 @@ void destroy_data(args_t *args) khp_destroy(trhp,args->active_tr); kh_destroy(pos2vbuf,args->pos2vbuf); if ( args->smpl ) smpl_ilist_destroy(args->smpl); - int ret; + int i,j,ret; if ( args->out_fh ) + { + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } ret = hts_close(args->out_fh); + } else ret = fclose(args->out); if ( ret ) error("Error: close failed .. 
%s\n", args->output_fname?args->output_fname:"stdout"); @@ -1602,7 +619,7 @@ void destroy_data(args_t *args) free(args->gt_arr); free(args->str.s); free(args->str2.s); - gff_id_destroy(&args->tscript_ids); + free(args->chr_name); } /* @@ -1614,7 +631,7 @@ void destroy_data(args_t *args) #define SPLICE_OVERLAP 3 // indel overlaps region boundary, csq set but could not determine csq typedef struct { - tscript_t *tr; + gf_tscript_t *tr; struct { int32_t pos, rlen, alen, ial; char *ref, *alt; @@ -1678,7 +695,7 @@ fprintf(stderr,"build_hap: rbeg=%d + %d abeg=%d \n",rbeg,rlen,abeg); if ( rbeg < splice->vcf.pos ) { assert( splice->tr->beg <= rbeg ); // this can be extended thanks to N_REF_PAD - kputsn(splice->tr->ref + N_REF_PAD + rbeg - splice->tr->beg, splice->vcf.pos - rbeg, &splice->kref); + kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + rbeg - splice->tr->beg, splice->vcf.pos - rbeg, &splice->kref); roff = 0; } else @@ -1703,7 +720,7 @@ fprintf(stderr,"r2: %s\n",splice->kref.s); if ( end + rlen - splice->kref.l - 1 > splice->tr->end ) // trim, the requested sequence is too long (could be extended, see N_REF_PAD) rlen -= end + rlen - splice->kref.l - 1 - splice->tr->end; if ( splice->kref.l < rlen ) - kputsn(splice->tr->ref + N_REF_PAD + end - splice->tr->beg, rlen - splice->kref.l, &splice->kref); + kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + end - splice->tr->beg, rlen - splice->kref.l, &splice->kref); } #if XDBG fprintf(stderr,"r3: %s\n",splice->kref.s); @@ -1714,7 +731,7 @@ fprintf(stderr,"r3: %s\n",splice->kref.s); if ( abeg < splice->vcf.pos ) { assert( splice->tr->beg <= abeg ); - kputsn(splice->tr->ref + N_REF_PAD + abeg - splice->tr->beg, splice->vcf.pos - abeg, &splice->kalt); + kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + abeg - splice->tr->beg, splice->vcf.pos - abeg, &splice->kalt); aoff = 0; } else @@ -1742,7 +759,7 @@ fprintf(stderr,"a2: %s aoff=%d\n",splice->kalt.s,aoff); if ( end + alen + aoff - splice->kalt.l - 1 > splice->tr->end ) // 
trim, the requested sequence is too long alen -= end + alen + aoff - splice->kalt.l - 1 - splice->tr->end; if ( alen > 0 && alen > splice->kalt.l ) - kputsn(splice->tr->ref + aoff + N_REF_PAD + end - splice->tr->beg, alen - splice->kalt.l, &splice->kalt); + kputsn(TSCRIPT_AUX(splice->tr)->ref + aoff + N_REF_PAD + end - splice->tr->beg, alen - splice->kalt.l, &splice->kalt); } #if XDBG fprintf(stderr,"a3: %s\n",splice->kalt.s); @@ -1755,7 +772,7 @@ static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32 while ( regitr_overlap(itr) ) { gf_utr_t *utr = regitr_payload(itr, gf_utr_t*); - tscript_t *tr = utr->tr; + gf_tscript_t *tr = utr->tr; if ( tr->id != trid ) continue; csq_t csq; memset(&csq, 0, sizeof(csq_t)); @@ -1771,7 +788,7 @@ static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32 } return 0; } -static inline void csq_stage_splice(args_t *args, bcf1_t *rec, tscript_t *tr, uint32_t type, int ial) +static inline void csq_stage_splice(args_t *args, bcf1_t *rec, gf_tscript_t *tr, uint32_t type, int ial) { #if XDBG fprintf(stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type); @@ -1788,6 +805,21 @@ fprintf(stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type); csq.type.gene = tr->gene->name; csq_stage(args, &csq, rec); } +static inline const char *drop_chr_prefix(args_t *args, const char *chr) +{ + if ( !args->unify_chr_names ) return chr; + if ( !strncasecmp("chr",chr,3) ) return chr+3; + return chr; +} +static inline const char *add_chr_prefix(args_t *args, const char *chr) +{ + if ( !args->unify_chr_names ) return chr; + int len = strlen(chr); + hts_expand(char,len+4,args->mchr_name,args->chr_name); + memcpy(args->chr_name,"chr",3); + memcpy(args->chr_name+3,chr,len+1); + return args->chr_name; +} static inline int splice_csq_ins(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) { // coordinates that matter for consequences, eg AC>ACG trimmed to C>CG, 1bp @@ -1813,7 +845,7 @@ 
fprintf(stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr { ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); @@ -1851,7 +883,7 @@ fprintf(stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) ) // adjacent utr { ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); @@ -1924,7 +956,7 @@ fprintf(stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) { static int small_ref_padding_warned = 0; - tscript_t *tr = splice->tr; + gf_tscript_t *tr = splice->tr; // We know the VCF record overlaps the exon, but does it overlap the start codon? 
if ( tr->strand==STRAND_REV && splice->vcf.pos + splice->vcf.rlen + 2 <= ex_end ) return 0; @@ -1956,7 +988,7 @@ int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint } char *ptr_vcf = splice->vcf.ref + alt_len; // the first deleted base in the VCF REF allele - char *ptr_ref = splice->tr->ref + N_REF_PAD + (vcf_ref_end + 1 - splice->tr->beg); // the first ref base after the ndel bases deleted + char *ptr_ref = TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + (vcf_ref_end + 1 - splice->tr->beg); // the first ref base after the ndel bases deleted #if XDBG fprintf(stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref); #endif @@ -1985,7 +1017,7 @@ int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint } char *ptr_vcf = splice->vcf.ref + alt_len; // the first deleted base in the VCF REF allele - char *ptr_ref = splice->tr->ref + N_REF_PAD + vcf_block_beg - splice->tr->beg; // the replacement ref block + char *ptr_ref = TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + vcf_block_beg - splice->tr->beg; // the replacement ref block #if XDBG fprintf(stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref); #endif @@ -2030,7 +1062,7 @@ fprintf(stderr,"splice_csq_del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,% if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); regitr_destroy(itr); @@ -2086,7 +1118,7 @@ fprintf(stderr,"splice_csq_del: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg,tend=%d,% if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); regitr_destroy(itr); @@ -2175,7 +1207,7 @@ fprintf(stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); regitr_destroy(itr); @@ -2205,7 +1237,7 @@ fprintf(stderr,"mnp: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); regitr_destroy(itr); @@ -2291,7 +1323,7 @@ int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds, { int i; kstring_t str = {0,0,0}; - tscript_t *tr = cds->tr; + gf_tscript_t *tr = cds->tr; child->icds = cds->icds; // index of cds in the tscript's list of exons child->vcf_ial = ial; @@ -2313,8 +1345,8 @@ int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds, } if ( splice.check_start ) // do not check starts in incomplete CDS, defined as not starting with M { - if ( tr->strand==STRAND_FWD ) { if ( dna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; } - else { if ( cdna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; } + if ( tr->strand==STRAND_FWD ) { if ( dna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; } + else { if ( cdna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; } } if ( child->icds!=0 ) splice.check_region_beg = 1; if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1; @@ -2373,12 +1405,12 @@ fprintf(stderr,"cds splice_csq: %d [%s][%s] .. 
beg,end=%d %d, ret=%d, csq=%d\n\n // the variant is on a new exon, finish up the previous int len = tr->cds[i]->len - parent->rbeg - parent->rlen + tr->cds[i]->beg; if ( len > 0 ) - kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str); + kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str); } // append any skipped non-variant exons while ( ++i < cds->icds ) - kputsn_(tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len, &str); + kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len, &str); if ( parent->icds==child->icds ) { @@ -2390,10 +1422,10 @@ fprintf(stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, csq=%d\n\n free(splice.kalt.s); return 1; } - kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str); + kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str); } else - kputsn_(tr->ref + N_REF_PAD + cds->beg - tr->beg, splice.ref_beg - cds->beg, &str); + kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + cds->beg - tr->beg, splice.ref_beg - cds->beg, &str); } kputs(splice.kalt.s + dbeg, &str); @@ -2645,28 +1677,28 @@ fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill, #endif } -void tscript_splice_ref(tscript_t *tr) +void tscript_splice_ref(gf_tscript_t *tr) { int i, len = 0; for (i=0; incds; i++) len += tr->cds[i]->len; - tr->nsref = len + 2*N_REF_PAD; - tr->sref = (char*) malloc(len + 1 + 2*N_REF_PAD); + TSCRIPT_AUX(tr)->nsref = len + 2*N_REF_PAD; + TSCRIPT_AUX(tr)->sref = (char*) malloc(len + 1 + 2*N_REF_PAD); len = 0; - memcpy(tr->sref, tr->ref + tr->cds[0]->beg - tr->beg, N_REF_PAD); + memcpy(TSCRIPT_AUX(tr)->sref, TSCRIPT_AUX(tr)->ref + tr->cds[0]->beg - tr->beg, N_REF_PAD); len += N_REF_PAD; for (i=0; incds; i++) { - memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len); + memcpy(TSCRIPT_AUX(tr)->sref + len, 
TSCRIPT_AUX(tr)->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len); len += tr->cds[i]->len; } - memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[tr->ncds-1]->beg - tr->beg, N_REF_PAD); + memcpy(TSCRIPT_AUX(tr)->sref + len, TSCRIPT_AUX(tr)->ref + N_REF_PAD + tr->cds[tr->ncds-1]->beg - tr->beg, N_REF_PAD); len += N_REF_PAD; - tr->sref[len] = 0; + TSCRIPT_AUX(tr)->sref[len] = 0; } // returns: 0 if consequence was added, 1 if it already exists or could not be added @@ -2800,18 +1832,25 @@ void kput_vcsq(args_t *args, vcsq_t *csq, kstring_t *str) if ( csq->type & CSQ_UPSTREAM_STOP ) kputc_('*',str); - int i, n = sizeof(csq_strings)/sizeof(char*); + int has_csq = 0, i, n = sizeof(csq_strings)/sizeof(char*); for (i=1; itype&(1<type&(1<type&(1<type&(1<biotype==GF_NMD) && (csq->type & CSQ_PRN_NMD) ) + { + if ( has_csq ) kputc_('&',str); // just in case, this should always be true + kputs("NMD_transcript",str); + } kputc_('|', str); if ( csq->gene ) kputs(csq->gene , str); kputc_('|', str); - if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(args->tscript_ids.str[csq->trid], str); +// if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(args->tscript_ids.str[csq->trid], str); + if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(gff_id2string(args->gff,transcript,csq->trid), str); kputc_('|', str); kputs(gf_type2gff_string(csq->biotype), str); @@ -2840,7 +1879,7 @@ void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *str) void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, int iend, int dlen, int indel) { int i; - tscript_t *tr = hap->tr; + gf_tscript_t *tr = hap->tr; int ref_node = tr->strand==STRAND_FWD ? ibeg : iend; int icsq = node->ncsq_list++; hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list); @@ -2954,7 +1993,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, str.l = 0; // create the aa variant string - int aa_rbeg = tr->strand==STRAND_FWD ? 
node2rbeg(ibeg)/3+1 : (hap->tr->nsref - 2*N_REF_PAD - node2rend(iend))/3+1; + int aa_rbeg = tr->strand==STRAND_FWD ? node2rbeg(ibeg)/3+1 : (TSCRIPT_AUX(hap->tr)->nsref - 2*N_REF_PAD - node2rend(iend))/3+1; int aa_sbeg = tr->strand==STRAND_FWD ? node2sbeg(ibeg)/3+1 : (tlen - node2send(iend))/3+1; kputc_('|', &str); kputw(aa_rbeg, &str); @@ -3020,13 +2059,13 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, void hap_finalize(args_t *args, hap_t *hap) { - tscript_t *tr = hap->tr; - if ( !tr->sref ) + gf_tscript_t *tr = hap->tr; + if ( !TSCRIPT_AUX(tr)->sref ) tscript_splice_ref(tr); kstring_t sref; - sref.s = tr->sref; - sref.l = tr->nsref; + sref.s = TSCRIPT_AUX(tr)->sref; + sref.l = TSCRIPT_AUX(tr)->nsref; sref.m = sref.l; int istack = 0; @@ -3034,7 +2073,7 @@ void hap_finalize(args_t *args, hap_t *hap) hap->sseq.l = 0; hap->tseq.l = 0; - hap->stack[0].node = tr->root; + hap->stack[0].node = TSCRIPT_AUX(tr)->root; hap->stack[0].ichild = -1; hap->stack[0].slen = 0; hap->stack[0].dlen = 0; @@ -3214,7 +2253,7 @@ static inline void csq_print_text(args_t *args, csq_t *csq, int ismpl, int ihap) kput_vcsq(args, &csq->type, &args->str); fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s); } -static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node) +static inline void hap_print_text(args_t *args, gf_tscript_t *tr, int ismpl, int ihap, hap_node_t *node) { if ( !node || !node->ncsq_list ) return; @@ -3240,7 +2279,7 @@ static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ih } } -static inline void hap_stage_vcf(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node) +static inline void hap_stage_vcf(args_t *args, gf_tscript_t *tr, int ismpl, int ihap, hap_node_t *node) { if ( !node || !node->ncsq_list || ismpl<0 ) return; @@ -3276,23 +2315,23 @@ void hap_flush(args_t *args, uint32_t pos) tr_heap_t *heap = args->active_tr; while ( heap->ndat && 
heap->dat[0]->end<=pos ) { - tscript_t *tr = heap->dat[0]; + gf_tscript_t *tr = heap->dat[0]; khp_delete(trhp, heap); args->hap->tr = tr; - if ( tr->root && tr->root->nchild ) // normal, non-localized calling + if ( TSCRIPT_AUX(tr)->root && TSCRIPT_AUX(tr)->root->nchild ) // normal, non-localized calling { hap_finalize(args, args->hap); if ( args->output_type==FT_TAB_TEXT ) // plain text output, not a vcf { if ( args->phase==PHASE_DROP_GT ) - hap_print_text(args, tr, -1,0, tr->hap[0]); + hap_print_text(args, tr, -1,0, TSCRIPT_AUX(tr)->hap[0]); else { for (i=0; ismpl->n; i++) { for (j=0; j<2; j++) - hap_print_text(args, tr, args->smpl->idx[i],j+1, tr->hap[i*2+j]); + hap_print_text(args, tr, args->smpl->idx[i],j+1, TSCRIPT_AUX(tr)->hap[i*2+j]); } } } @@ -3301,7 +2340,7 @@ void hap_flush(args_t *args, uint32_t pos) for (i=0; ismpl->n; i++) { for (j=0; j<2; j++) - hap_stage_vcf(args, tr, args->smpl->idx[i],j, tr->hap[i*2+j]); + hap_stage_vcf(args, tr, args->smpl->idx[i],j, TSCRIPT_AUX(tr)->hap[i*2+j]); } } } @@ -3309,7 +2348,7 @@ void hap_flush(args_t *args, uint32_t pos) // mark the transcript for deletion. 
Cannot delete it immediately because // by-position VCF output will need them when flushed by vcf_buf_push args->nrm_tr++; - hts_expand(tscript_t*,args->nrm_tr,args->mrm_tr,args->rm_tr); + hts_expand(gf_tscript_t*,args->nrm_tr,args->mrm_tr,args->rm_tr); args->rm_tr[args->nrm_tr-1] = tr; } } @@ -3424,24 +2463,33 @@ void vbuf_flush(args_t *args, uint32_t pos) for (i=0; inrm_tr; i++) { - tscript_t *tr = args->rm_tr[i]; - if ( tr->root ) hap_destroy(tr->root); - tr->root = NULL; - free(tr->hap); - free(tr->ref); - free(tr->sref); + gf_tscript_t *tr = args->rm_tr[i]; + tscript_t *aux = TSCRIPT_AUX(tr); + if ( aux->root ) hap_destroy(aux->root); + aux->root = NULL; + free(aux->hap); + free(aux->ref); + free(aux->sref); + free(aux); + tr->aux = NULL; } args->nrm_tr = 0; args->ncsq_buf = 0; } -void tscript_init_ref(args_t *args, tscript_t *tr, const char *chr) +void tscript_init_ref(args_t *args, gf_tscript_t *tr, const char *chr) { int i, len; int pad_beg = tr->beg >= N_REF_PAD ? N_REF_PAD : tr->beg; - tr->ref = faidx_fetch_seq(args->fai, chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len); - if ( !tr->ref ) + const char *tmp_chr = chr; + if ( !faidx_has_seq(args->fai,tmp_chr) ) + { + tmp_chr = drop_chr_prefix(args,chr); + if ( !faidx_has_seq(args->fai,tmp_chr) ) tmp_chr = add_chr_prefix(args,chr); + } + TSCRIPT_AUX(tr)->ref = faidx_fetch_seq(args->fai, tmp_chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len); + if ( !TSCRIPT_AUX(tr)->ref ) error("faidx_fetch_seq failed %s:%d-%d\n", chr,tr->beg+1,tr->end+1); int pad_end = len - (tr->end - tr->beg + 1 + pad_beg); @@ -3449,23 +2497,23 @@ void tscript_init_ref(args_t *args, tscript_t *tr, const char *chr) { char *ref = (char*) malloc(tr->end - tr->beg + 1 + 2*N_REF_PAD + 1); for (i=0; i < N_REF_PAD - pad_beg; i++) ref[i] = 'N'; - memcpy(ref+i, tr->ref, len); + memcpy(ref+i, TSCRIPT_AUX(tr)->ref, len); len += i; for (i=0; i < N_REF_PAD - pad_end; i++) ref[i+len] = 'N'; ref[i+len] = 0; - free(tr->ref); - tr->ref = ref; + 
free(TSCRIPT_AUX(tr)->ref); + TSCRIPT_AUX(tr)->ref = ref; } } -static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec) +static void sanity_check_ref(args_t *args, gf_tscript_t *tr, bcf1_t *rec) { int vbeg = 0; int rbeg = rec->pos - tr->beg + N_REF_PAD; if ( rbeg < 0 ) { vbeg += abs(rbeg); rbeg = 0; } - char *ref = tr->ref + rbeg; + char *ref = TSCRIPT_AUX(tr)->ref + rbeg; char *vcf = rec->d.allele[0] + vbeg; - assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) && ref - tr->ref < tr->end - tr->beg + 2*N_REF_PAD ); + assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) && ref - TSCRIPT_AUX(tr)->ref < tr->end - tr->beg + 2*N_REF_PAD ); int i = 0; while ( ref[i] && vcf[i] ) { @@ -3479,7 +2527,7 @@ static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec) int test_cds_local(args_t *args, bcf1_t *rec) { int i,j, ret = 0; - const char *chr = bcf_seqname(args->hdr,rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); // note that the off-by-one extension of rlen is deliberate to account for insertions if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; @@ -3491,12 +2539,13 @@ int test_cds_local(args_t *args, bcf1_t *rec) while ( regitr_overlap(args->itr) ) { gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*); - tscript_t *tr = cds->tr; + gf_tscript_t *tr = cds->tr; if ( !GF_is_coding(tr->type) ) continue; ret = 1; - if ( !tr->ref ) + if ( !TSCRIPT_AUX(tr) ) { + tr->aux = calloc(sizeof(tscript_t),1); tscript_init_ref(args, tr, chr); tscript_splice_ref(tr); khp_insert(trhp, args->active_tr, &tr); // only to clean the reference afterwards @@ -3505,8 +2554,8 @@ int test_cds_local(args_t *args, bcf1_t *rec) sanity_check_ref(args, tr, rec); kstring_t sref; - sref.s = tr->sref; - sref.l = tr->nsref; + sref.s = TSCRIPT_AUX(tr)->sref; + sref.l = TSCRIPT_AUX(tr)->nsref; sref.m = sref.l; for (i=1; in_allele; i++) @@ -3614,8 +2663,8 @@ int test_cds_local(args_t *args, bcf1_t *rec) { 
// create the aa variant string kstring_t str = {0,0,0}; - int aa_rbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD - node.sbeg - node.rlen)/3+1; - int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1; + int aa_rbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (TSCRIPT_AUX(tr)->nsref - 2*N_REF_PAD - node.sbeg - node.rlen)/3+1; + int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (TSCRIPT_AUX(tr)->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1; kputc_('|', &str); kputw(aa_rbeg, &str); kprint_aa_prediction(args,aa_rbeg,tref,&str); @@ -3633,11 +2682,11 @@ int test_cds_local(args_t *args, bcf1_t *rec) csq_stage(args, &csq, rec); // all this only to clean vstr when vrec is flushed - if ( !tr->root ) - tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t)); - tr->root->ncsq_list++; - hts_expand0(csq_t,tr->root->ncsq_list,tr->root->mcsq_list,tr->root->csq_list); - csq_t *rm_csq = tr->root->csq_list + tr->root->ncsq_list - 1; + if ( !TSCRIPT_AUX(tr)->root ) + TSCRIPT_AUX(tr)->root = (hap_node_t*) calloc(1,sizeof(hap_node_t)); + TSCRIPT_AUX(tr)->root->ncsq_list++; + hts_expand0(csq_t,TSCRIPT_AUX(tr)->root->ncsq_list,TSCRIPT_AUX(tr)->root->mcsq_list,TSCRIPT_AUX(tr)->root->csq_list); + csq_t *rm_csq = TSCRIPT_AUX(tr)->root->csq_list + TSCRIPT_AUX(tr)->root->ncsq_list - 1; rm_csq->type.vstr = str; } if ( csq_type & ~CSQ_COMPOUND ) @@ -3659,27 +2708,28 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) static int overlaps_warned = 0, multiploid_warned = 0; int i, ret = 0, hap_ret; - const char *chr = bcf_seqname(args->hdr,rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); // note that the off-by-one extension of rlen is deliberate to account for insertions if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; while ( regitr_overlap(args->itr) ) { gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*); - tscript_t *tr = 
cds->tr; + gf_tscript_t *tr = cds->tr; if ( !GF_is_coding(tr->type) ) continue; if ( vbuf->keep_until < tr->end ) vbuf->keep_until = tr->end; ret = 1; - if ( !tr->root ) + if ( !TSCRIPT_AUX(tr) ) { // initialize the transcript and its haplotype tree, fetch the reference sequence + tr->aux = calloc(sizeof(tscript_t),1); tscript_init_ref(args, tr, chr); - tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t)); - tr->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n; // maximum ploidy = diploid - tr->hap = (hap_node_t**) malloc(tr->nhap*sizeof(hap_node_t*)); - for (i=0; inhap; i++) tr->hap[i] = NULL; - tr->root->nend = tr->nhap; - tr->root->type = HAP_ROOT; + TSCRIPT_AUX(tr)->root = (hap_node_t*) calloc(1,sizeof(hap_node_t)); + TSCRIPT_AUX(tr)->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n; // maximum ploidy = diploid + TSCRIPT_AUX(tr)->hap = (hap_node_t**) malloc(TSCRIPT_AUX(tr)->nhap*sizeof(hap_node_t*)); + for (i=0; inhap; i++) TSCRIPT_AUX(tr)->hap[i] = NULL; + TSCRIPT_AUX(tr)->root->nend = TSCRIPT_AUX(tr)->nhap; + TSCRIPT_AUX(tr)->root->type = HAP_ROOT; khp_insert(trhp, args->active_tr, &tr); } @@ -3689,7 +2739,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) if ( args->phase==PHASE_DROP_GT ) { if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; } - hap_node_t *parent = tr->hap[0] ? tr->hap[0] : tr->root; + hap_node_t *parent = TSCRIPT_AUX(tr)->hap[0] ? 
TSCRIPT_AUX(tr)->hap[0] : TSCRIPT_AUX(tr)->root; hap_node_t *child = (hap_node_t*)calloc(1,sizeof(hap_node_t)); hap_ret = hap_init(args, parent, child, cds, rec, 1); if ( hap_ret!=0 ) @@ -3734,8 +2784,8 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) parent->mchild = 1; parent->child = (hap_node_t**) malloc(sizeof(hap_node_t*)); parent->child[0] = child; - tr->hap[0] = child; - tr->hap[0]->nend = 1; + TSCRIPT_AUX(tr)->hap[0] = child; + TSCRIPT_AUX(tr)->hap[0]->nend = 1; continue; } @@ -3793,12 +2843,12 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) assert( ial < rec->n_allele ); if ( rec->d.allele[ial][0]=='<' || rec->d.allele[ial][0]=='*' ) { continue; } - hap_node_t *parent = tr->hap[i] ? tr->hap[i] : tr->root; + hap_node_t *parent = TSCRIPT_AUX(tr)->hap[i] ? TSCRIPT_AUX(tr)->hap[i] : TSCRIPT_AUX(tr)->root; if ( parent->cur_rec==rec && parent->cur_child[ial]>=0 ) { // this haplotype has been seen in another sample - tr->hap[i] = parent->child[ parent->cur_child[ial] ]; - tr->hap[i]->nend++; + TSCRIPT_AUX(tr)->hap[i] = parent->child[ parent->cur_child[ial] ]; + TSCRIPT_AUX(tr)->hap[i]->nend++; parent->nend--; continue; } @@ -3852,8 +2902,8 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) hts_expand0(hap_node_t*,parent->nchild,parent->mchild,parent->child); parent->cur_child[ial] = j; parent->child[j] = child; - tr->hap[i] = child; - tr->hap[i]->nend++; + TSCRIPT_AUX(tr)->hap[i] = child; + TSCRIPT_AUX(tr)->hap[i]->nend++; parent->nend--; } } @@ -3933,7 +2983,7 @@ void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec) } int test_utr(args_t *args, bcf1_t *rec) { - const char *chr = bcf_seqname(args->hdr,rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); // note that the off-by-one extension of rlen is deliberate to account for insertions if ( !regidx_overlap(args->idx_utr,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; @@ -3944,7 +2994,7 @@ int test_utr(args_t *args, bcf1_t *rec) while ( 
regitr_overlap(args->itr) ) { gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*); - tscript_t *tr = splice.tr = utr->tr; + gf_tscript_t *tr = splice.tr = utr->tr; for (i=1; in_allele; i++) { if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; } @@ -3971,7 +3021,7 @@ int test_utr(args_t *args, bcf1_t *rec) } int test_splice(args_t *args, bcf1_t *rec) { - const char *chr = bcf_seqname(args->hdr,rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); if ( !regidx_overlap(args->idx_exon,chr,rec->pos,rec->pos + rec->rlen, args->itr) ) return 0; splice_t splice; @@ -4003,7 +3053,7 @@ int test_splice(args_t *args, bcf1_t *rec) } int test_tscript(args_t *args, bcf1_t *rec) { - const char *chr = bcf_seqname(args->hdr,rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); if ( !regidx_overlap(args->idx_tscript,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; splice_t splice; @@ -4012,7 +3062,7 @@ int test_tscript(args_t *args, bcf1_t *rec) int i, ret = 0; while ( regitr_overlap(args->itr) ) { - tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*); + gf_tscript_t *tr = splice.tr = regitr_payload(args->itr, gf_tscript_t*); for (i=1; in_allele; i++) { if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; } @@ -4046,7 +3096,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec) warned = 1; } - const char *chr = bcf_seqname(args->hdr,rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); // only insertions atm int beg = rec->pos + 1; @@ -4061,7 +3111,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec) csq_t csq; memset(&csq, 0, sizeof(csq_t)); gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*); - tscript_t *tr = cds->tr; + gf_tscript_t *tr = cds->tr; csq.type.type = (GF_is_coding(tr->type) ? 
CSQ_CODING_SEQUENCE : CSQ_NON_CODING) | csq_class; csq.pos = rec->pos; csq.type.biotype = tr->type; @@ -4079,7 +3129,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec) csq_t csq; memset(&csq, 0, sizeof(csq_t)); gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*); - tscript_t *tr = utr->tr; + gf_tscript_t *tr = utr->tr; csq.type.type = (utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3) | csq_class; csq.pos = rec->pos; csq.type.biotype = tr->type; @@ -4118,7 +3168,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec) { csq_t csq; memset(&csq, 0, sizeof(csq_t)); - tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*); + gf_tscript_t *tr = splice.tr = regitr_payload(args->itr, gf_tscript_t*); splice.vcf.alt = rec->d.allele[1]; splice.csq = csq_class; int splice_ret = splice_csq(args, &splice, tr->beg, tr->end); @@ -4179,7 +3229,10 @@ static void process(args_t *args, bcf1_t **rec_ptr) // Perform a simple sanity check (that does not catch much), the chromosome must be present in the // reference file if ( !faidx_has_seq(args->fai,bcf_seqname(args->hdr,rec)) ) - error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname); + { + if ( !faidx_has_seq(args->fai,drop_chr_prefix(args,bcf_seqname(args->hdr,rec))) && !faidx_has_seq(args->fai,add_chr_prefix(args,bcf_seqname(args->hdr,rec))) ) + error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname); + } } if ( prev_pos > rec->pos ) error("Error: The file is not sorted, %s:%d comes before %s:%"PRId64"\n",bcf_seqname(args->hdr,rec),prev_pos+1,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); @@ -4254,9 +3307,12 @@ static const char *usage(void) " r: require phased GTs, throw an error on unphased het GTs\n" " R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n" " s: skip unphased hets\n" - "Options:\n" - " -e, --exclude EXPR Exclude sites for which the expression is true\n" + "GFF options:\n" + " --dump-gff 
FILE.gz Dump the parsed GFF file (for debugging purposes)\n" " --force Run even if some sanity checks fail\n" + " --unify-chr-names 1|0 Automatically unify chromosome naming (e.g. chrX vs X) in GFF, fasta, and VCF [1]\n" + "General options:\n" + " -e, --exclude EXPR Exclude sites for which the expression is true\n" " -i, --include EXPR Select sites for which the expression is true\n" " --no-version Do not append version and command line to the header\n" " -o, --output FILE Write output to a file [standard output]\n" @@ -4272,6 +3328,7 @@ static const char *usage(void) " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n" " --threads INT Use multithreading with worker threads [0]\n" " -v, --verbose INT Verbosity level 0-2 [1]\n" + " --write-index Automatically index the output files [off]\n" "\n" "Example:\n" " bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n" @@ -4292,6 +3349,7 @@ int main_csq(int argc, char *argv[]) args->verbosity = 1; args->record_cmd_line = 1; args->clevel = -1; + args->unify_chr_names = 1; static struct option loptions[] = { @@ -4321,6 +3379,9 @@ int main_csq(int argc, char *argv[]) {"targets-file",1,0,'T'}, {"targets-overlap",required_argument,NULL,5}, {"no-version",no_argument,NULL,3}, + {"write-index",no_argument,NULL,6}, + {"dump-gff",required_argument,NULL,7}, + {"unify-chr-names",required_argument,NULL,8}, {0,0,0,0} }; int c, targets_is_file = 0, regions_is_file = 0; @@ -4339,7 +3400,7 @@ int main_csq(int argc, char *argv[]) case 3 : args->record_cmd_line = 0; break; case 'b': args->brief_predictions = 1; - fprintf(stderr,"Warning: the -b option will be removed in future versions. Please use -B 1 instead.\n"); + fprintf(stderr,"Warning: The -b option will be removed in future versions. 
Please use -B 1 instead.\n"); break; case 'B': args->brief_predictions = strtol(optarg,&tmp,10); @@ -4409,6 +3470,13 @@ int main_csq(int argc, char *argv[]) targets_overlap = parse_overlap_option(optarg); if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; + case 6 : args->write_index = 1; break; + case 7 : args->dump_gff = optarg; break; + case 8 : + if ( !strcmp(optarg,"0") ) args->unify_chr_names = 0; + else if ( !strcmp(optarg,"1") ) args->unify_chr_names = 1; + else error("Could not parse: --unify-chr-names %s\n",optarg); + break; case 'h': case '?': error("%s",usage()); default: error("The option not recognised: %s\n\n", optarg); break; diff --git a/doc/bcftools.1 b/doc/bcftools.1 index 0e3d5290e..c940065fb 100644 --- a/doc/bcftools.1 +++ b/doc/bcftools.1 @@ -2,12 +2,12 @@ .\" Title: bcftools .\" Author: [see the "AUTHOR(S)" section] .\" Generator: Asciidoctor 2.0.16.dev -.\" Date: 2023-02-21 +.\" Date: 2023-07-25 .\" Manual: \ \& .\" Source: \ \& .\" Language: English .\" -.TH "BCFTOOLS" "1" "2023-02-21" "\ \&" "\ \&" +.TH "BCFTOOLS" "1" "2023-07-25" "\ \&" "\ \&" .ie \n(.g .ds Aq \(aq .el .ds Aq ' .ss \n[.ss] 0 @@ -51,10 +51,10 @@ standard input (stdin) and outputs to the standard output (stdout). Several commands can thus be combined with Unix pipes. .SS "VERSION" .sp -This manual page was last updated \fB2023\-02\-21\fP and refers to bcftools git version \fB1.17\fP. +This manual page was last updated \fB2023\-07\-25\fP and refers to bcftools git version \fB1.18\fP. .SS "BCF1" .sp -The BCF1 format output by versions of samtools <= 0.1.19 is \fBnot\fP +The obsolete BCF1 format output by versions of samtools <= 0.1.19 is \fBnot\fP compatible with this version of bcftools. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions <= 0.1.19 to convert to VCF, which can then be read by @@ -75,6 +75,9 @@ done with \fIbcftools view\fP. 
Users are now required to choose between the old samtools calling model (\fI\-c/\-\-consensus\-caller\fP) and the new multiallelic calling model (\fI\-m/\-\-multiallelic\-caller\fP). The multiallelic calling model is recommended for most tasks. +.SS "FILTERING EXPRESSIONS" +.sp +See \fBEXPRESSIONS\fP .SH "LIST OF COMMANDS" .sp For a full list of available commands, run \fBbcftools\fP without arguments. For a full @@ -344,6 +347,17 @@ Some helper scripts are bundled with the bcftools code. . sp -1 . IP \(bu 2.3 .\} +\fBgff2gff\fP .. converts a GFF file to the format required by \fBcsq\fP +.RE +.sp +.RS 4 +.ie n \{\ +\h'-04'\(bu\h'+03'\c +.\} +.el \{\ +. sp -1 +. IP \(bu 2.3 +.\} \fBplot\-vcfstats\fP .. plots the output of \fBstats\fP .RE .SH "COMMANDS AND OPTIONS" @@ -597,6 +611,11 @@ Same as \fB\-\-regions\-overlap\fP but for \fB\-t/\-T\fP. Use multithreading with \fIINT\fP worker threads. The option is currently used only for the compression of the output stream, only when \fI\-\-output\-type\fP is \fIb\fP or \fIz\fP. Default: 0. .RE +.sp +\fB\-\-write\-index\fP +.RS 4 +Automatically index the output files. Can be used only for compressed BCF and VCF output. +.RE .SS "bcftools annotate \fI[OPTIONS]\fP \fIFILE\fP" .sp Add or remove annotations. @@ -881,6 +900,11 @@ except GT. To remove all INFO tags except "FOO" and "BAR", use "INFO" can be abbreviated to "INF" and "FORMAT" to "FMT". 
.RE .sp +\fB\-\-write\-index\fP +.RS 4 +Automatically index the output file +.RE +.sp \fBExamples:\fP .sp .if n .RS 4 @@ -1017,6 +1041,11 @@ see \fBCommon Options\fP .RS 4 see \fBCommon Options\fP .RE +.sp +\fB\-\-write\-index\fP +.RS 4 +Automatically index the output file +.RE .SS "Input/output options:" .sp \fB\-A, \-\-keep\-alts\fP @@ -1401,6 +1430,11 @@ see \fBCommon Options\fP .RS 4 see \fBCommon Options\fP .RE +.sp +\fB\-\-write\-index\fP +.RS 4 +Automatically index the output file +.RE .SS "bcftools consensus \fI[OPTIONS]\fP \fIFILE\fP" .sp Create consensus sequence by applying VCF variants to a reference fasta file. @@ -1432,18 +1466,13 @@ exclude sites for which \fIEXPRESSION\fP is true. For valid expressions see reference sequence in fasta format .RE .sp -\fB\-H, \-\-haplotype\fP \fI1\fP|\fI2\fP|\fIR\fP|\fIA\fP|\fII\fP|\fILR\fP|\fILA\fP|\fISR\fP|\fISA\fP|\fI1pIu\fP|\fI2pIu\fP +\fB\-H, \-\-haplotype\fP N|\fIR\fP|\fIA\fP|\fII\fP|\fILR\fP|\fILA\fP|\fISR\fP|\fISA\fP|\fINpIu\fP .RS 4 choose which allele from the FORMAT/GT field to use (the codes are case\-insensitive): .sp -\fI1\fP -.RS 4 -the first allele, regardless of phasing -.RE -.sp -\fI2\fP +\fIN\fP .RS 4 -the second allele, regardless of phasing +N={1,2,3,...}, the allele index within the genotype, regardless of phasing .RE .sp \fIR\fP @@ -1471,20 +1500,15 @@ the longer allele. If both have the same length, use the REF allele (LR), or the the shorter allele. If both have the same length, use the REF allele (SR), or the ALT allele (SA) .RE .sp -\fI1pIu, 2pIu\fP +\fINpIu\fP .RS 4 -first/second allele for phased genotypes and IUPAC code for unphased genotypes -.sp -.if n .RS 4 -.nf -.fam C -This option requires *\-s*, unless exactly one sample is present in the VCF -.fam -.fi -.if n .RE +N={1,2,3,...}, the allele index within genotype for phased genotypes and IUPAC code for unphased genotypes. 
+For example, \fI1pIu\fP or \fI2pIu\fP .RE .RE .sp +Note that the \fB\-H, \-\-haplotype\fP option requires the \fB\-s, \-\-samples\fP option, unless exactly one sample is present in the VCF +.sp \fB\-i, \-\-include\fP \fIEXPRESSION\fP .RS 4 include only sites for which \fIEXPRESSION\fP is true. For valid expressions see @@ -1494,24 +1518,24 @@ include only sites for which \fIEXPRESSION\fP is true. For valid expressions see \fB\-I, \-\-iupac\-codes\fP .RS 4 output variants in the form of IUPAC ambiguity codes determined from FORMAT/GT fields. By default all -samples are used and can be subset with \f(CR\-s, \-\-samples\fP and \f(CR\-S, \-\-samples\-file\fP. Use \f(CR\-s \-\fP to ignore +samples are used and can be subset with \fB\-s, \-\-samples\fP and \fB\-S, \-\-samples\-file\fP. Use \fB\-s \-\fP to ignore samples and use only the REF and ALT columns. NOTE: prior to version 1.17 the IUPAC codes were determined solely from REF,ALT columns and sample genotypes were not considered. .RE .sp \fB\-\-mark\-del\fP \fICHAR\fP .RS 4 -instead of removing sequence, insert CHAR for deletions +instead of removing sequence, insert character CHAR for deletions .RE .sp -\fB\-\-mark\-ins\fP \fIuc\fP|\fIlc\fP +\fB\-\-mark\-ins\fP \fIuc\fP|\fIlc\fP|\fICHAR\fP .RS 4 -highlight inserted sequence in uppercase (uc) or lowercase (lc), leaving the rest of the sequence as is +highlight inserted sequence in uppercase (uc), lowercase (lc), or a provided character CHAR, leaving the rest of the sequence as is .RE .sp \fB\-\-mark\-snv\fP \fIuc\fP|\fIlc\fP .RS 4 -highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest of the sequence as is +highlight substitutions in uppercase (uc), lowercase (lc), or a provided character CHAR, leaving the rest of the sequence as is .RE .sp \fB\-m, \-\-mask\fP \fIFILE\fP @@ -1539,12 +1563,12 @@ write output to a file .sp \fB\-s, \-\-samples\fP \fILIST\fP .RS 4 -apply variants of the listed samples. 
See also the option \f(CR\-I, \-\-iupac\-codes\fP +apply variants of the listed samples. See also the option \fB\-I, \-\-iupac\-codes\fP .RE .sp \fB\-S, \-\-samples\-file\fP \fIFILE\fP .RS 4 -apply variants of the samples listed in the file. See also the option \f(CR\-I, \-\-iupac\-codes\fP +apply variants of the samples listed in the file. See also the option \fB\-I, \-\-iupac\-codes\fP .RE .sp \fBExamples:\fP @@ -1563,6 +1587,44 @@ apply variants of the samples listed in the file. See also the option \f(CR\-I, .fam .fi .if n .RE +.sp +\fBNotes:\fP +.RS 4 +Masking options are applied in the following order +.sp +.RS 4 +.ie n \{\ +\h'-04' 1.\h'+01'\c +.\} +.el \{\ +. sp -1 +. IP " 1." 4.2 +.\} +mask regions with \fB\-\-mask\-with\fP character if \fB\-\-mask\fP is given. All overlapping VCF variants are ignored +.RE +.sp +.RS 4 +.ie n \{\ +\h'-04' 2.\h'+01'\c +.\} +.el \{\ +. sp -1 +. IP " 2." 4.2 +.\} +replace sequence not mentioned in the VCF with the requested character if \fB\-\-absent\fP is given +.RE +.sp +.RS 4 +.ie n \{\ +\h'-04' 3.\h'+01'\c +.\} +.el \{\ +. sp -1 +. IP " 3." 4.2 +.\} +finally apply \fB\-\-mark\-del\fP, \fB\-\-mark\-ins\fP, \fB\-\-mark\-snv\fP masks +.RE +.RE .SS "bcftools convert \fI[OPTIONS]\fP \fIFILE\fP" .SS "VCF input options:" .sp @@ -1617,6 +1679,11 @@ see \fBCommon Options\fP .RS 4 see \fBCommon Options\fP .RE +.sp +\fB\-\-write\-index\fP +.RS 4 +Automatically index the output file +.RE .SS "VCF output options:" .sp \fB\-\-no\-version\fP @@ -1887,13 +1954,13 @@ convert from TSV (tab\-separated values) format (such as generated by \fB\-c, \-\-columns\fP \fIlist\fP .RS 4 comma\-separated list of fields in the input file. In the current -version, the fields CHROM, POS, ID, and AA are expected and -can appear in arbitrary order, columns which should be ignored in the input +version, the fields CHROM, POS, ID, and AA or REF, ALT are expected and +can appear in arbitrary order. 
Columns which should be ignored in the input file can be indicated by "\-". The AA field lists alleles on the forward reference strand, for example "CC" or "CT" for diploid genotypes or "C" for haploid genotypes (sex chromosomes). Insertions and deletions -are not supported yet, missing data can be indicated with "\-\-". +are supported only with REF and ALT but not with AA. Missing data can be indicated with "\-\-" or ".". .RE .sp \fB\-f, \-\-fasta\-ref\fP \fIfile\fP @@ -1917,7 +1984,10 @@ file of sample names. See \fBCommon Options\fP .nf .fam C # Convert 23andme results into VCF -bcftools convert \-c ID,CHROM,POS,AA \-s SampleName \-f 23andme\-ref.fa \-\-tsv2vcf 23andme.txt \-Oz \-o out.vcf.gz +bcftools convert \-c ID,CHROM,POS,AA \-s SampleName \-f 23andme\-ref.fa \-\-tsv2vcf 23andme.txt \-o out.vcf.gz + +# Convert tab\-delimited file into a sites\-only VCF (no genotypes), in this example first column to be ignored +bcftools convert \-c \-,CHROM,POS,REF,ALT \-f ref.fa \-\-tsv2vcf calls.txt \-o out.bcf .fam .fi .if n .RE @@ -1966,6 +2036,12 @@ aminoacids, with \fB\-B 1\fP only an abbreviated version such as \fI25E..329>25G written. .RE .sp +\fB\-\-dump\-gff\fP \fIFILE\fP +.RS 4 +dump the parsed GFF into a gzipped FILE. Intended for debugging purposes, +shows how is the input GFF viewed by the program. +.RE +.sp \fB\-e, \-\-exclude\fP \fIEXPRESSION\fP .RS 4 exclude sites for which \fIEXPRESSION\fP is true. For valid expressions see @@ -1987,6 +2063,7 @@ transcripts in malformatted GFFs with incorrect phase .RS 4 GFF3 annotation file (required), such as \c .URL "ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens" "" "." +The script \fBgff2gff\fP can help with conversion from non\-standard GFF formats. An example of a minimal working GFF file: .RE .sp @@ -1998,6 +2075,17 @@ An example of a minimal working GFF file: # the gene (determined from the transcript\*(Aqs "Parent=gene:" attribute), and the biotype # (the most interesting is "protein_coding"). 
# + # Empty and commented lines are skipped, the following GFF columns are required + # 1. chromosome + # 2. IGNORED + # 3. type (CDS, exon, three_prime_UTR, five_prime_UTR, gene, transcript, etc.) + # 4. start of the feature (1\-based) + # 5. end of the feature (1\-based) + # 6. IGNORED + # 7. strand (+ or \-) + # 8. phase (0, 1, 2 or .) + # 9. semicolon\-separated attributes (see below) + # # Attributes required for # gene lines: # \- ID=gene: @@ -2137,6 +2225,18 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-unify\-chr\-names\fP \fI0\fP|\fI1\fP +.RS 4 +Automatically detect and unify chromosome naming conventions in the GFF, fasta +and VCF, such as "chrX" vs "X". The chromosome names in the output VCF will match +that of the input VCF. The default is to attempt the automatic translation. +.RE +.sp +\fB\-\-write\-index\fP +.RS 4 +Automatically index the output file +.RE +.sp \fBExamples:\fP .sp .if n .RS 4 @@ -2366,6 +2466,11 @@ see \fBCommon Options\fP .RS 4 see \fBCommon Options\fP .RE +.sp +\fB\-\-write\-index\fP +.RS 4 +Automatically index the output file +.RE .SS "bcftools gtcheck [\fIOPTIONS\fP] [\fB\-g\fP \fIgenotypes.vcf.gz\fP] \fIquery.vcf.gz\fP" .sp Checks sample identity. The program can operate in two modes. If the \fB\-g\fP @@ -2373,6 +2478,10 @@ option is given, the identity of samples from \fIquery.vcf.gz\fP is checked against the samples in the \fB\-g\fP file. Without the \fB\-g\fP option, multi\-sample cross\-check of samples in \fIquery.vcf.gz\fP is performed. .sp +Note that the interpretation of the discordance score depends on the options provided (specifically \fB\-e\fP and +\fB\-u\fP) and on the available annotations (FORMAT/PL vs FORMAT/GT). +The discordance score can be interpreted as the number of mismatching genotypes if only GT\-vs\-GT matching is performed. +.sp \fB\-\-distinctive\-sites\fP \fINUM[,MEM[,DIR]]\fP .RS 4 Find sites that can distinguish between at least NUM sample pairs. 
If the number is smaller or equal to 1, @@ -2391,11 +2500,18 @@ Stop after first record to estimate required time. Interpret genotypes and genotype likelihoods probabilistically. The value of \fIINT\fP represents genotype quality when GT tag is used (e.g. Q=30 represents one error in 1,000 genotypes and Q=40 one error in 10,000 genotypes) and is ignored when PL tag is used (in that case an arbitrary -non\-zero integer can be provided). See also the \fB\-u, \-\-use\fP option below. If set to 0, -the discordance equals to the number of mismatching genotypes when GT vs GT is compared. -Note that the values with and without \fB\-e\fP are not comparable, only values generated -with \fB\-e 0\fP correspond to mismatching genotypes. -If performance is an issue, set to 0 for faster run but less accurate results. +non\-zero integer can be provided). +\~ +.br +\~ +.br +If \fB\-e\fP is set to 0, the discordance score can be interpreted as the number of mismatching genotypes, +but only in the GT\-vs\-GT matching mode. See the \fB\-u, \-\-use\fP option below for additional notes and caveats. +\~ +.br +\~ +.br +If performance is an issue, set \fB\-e 0\fP for faster run times but less accurate results. .RE .sp \fB\-g, \-\-genotypes\fP \fIFILE\fP @@ -2476,8 +2592,15 @@ see \fBCommon Options\fP \fB\-u, \-\-use\fP \fITAG1\fP[,\fITAG2\fP] .RS 4 specifies which tag to use in the query file (\fITAG1\fP) and the \fB\-g\fP (\fITAG2\fP) file. -By default, the PL tag is used in the query file and GT in the \fB\-g\fP file when -available. +By default, the PL tag is used in the query file and, when available, the GT tags in the +\fB\-g\fP file. +\~ +.br +\~ +.br +Note that when the requested tag is not available, the program will attempt to use +the other tag. The output includes the number of sites that were matched by the four +possible mode (for example GT\-vs\-GT or GT\-vs\-PL). 
.RE .sp \fBExamples:\fP @@ -2676,6 +2799,11 @@ see \fBCommon Options\fP list of input files to output given as 1\-based indices. With \fB\-p\fP and no \fB\-w\fP, all files are written. .RE +.sp +\fB\-\-write\-index\fP +.RS 4 +Automatically index the output file. This is done automatically with the \fB\-p\fP option. +.RE .SS "Examples:" .sp Create intersection and complements of two sets saving the output in dir/* @@ -2785,7 +2913,8 @@ merge gVCF blocks, INFO/END tag is expected. If the reference fasta file \fIFILE\fP is not given and the dash (\fI\-\fP) is given, unknown reference bases generated at gVCF block splits will be substituted with N\(cqs. The \fB\-\-gvcf\fP option uses the following default INFO rules: -\fB\-i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\fP. +\fB\-i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\fP and the following missing +rules: \fB\-M PL:max,AD:0\fP. .RE .sp \fB\-i, \-\-info\-rules\fP \fI\-\fP|\fITAG:METHOD\fP[,...] @@ -2835,6 +2964,17 @@ The option controls what types of multiallelic records can be created: .fi .if n .RE .sp +\fB\-M, \-\-missing\-rules\fP \fI\-\fP|\fITAG:METHOD\fP[,...] +.RS 4 +Rules for merging vector tags at multiallelic sites. When input files have different alternate +alleles, vector fields pertaining to unobserved alleles are set to missing (\fI.\fP) by default. +The \fIMETHOD\fP is one of \fI.\fP (the default, use missing values), \fINUMBER\fP (use a constant value, e.g. 0), +\fImax\fP (the maximum value observed for other alleles in the sample). When \fB\-\-gvcf\fP option is set, +the rule \fB\-M PL:max,AD:0\fP is implied. This can be overriden with providing \fB\-M \-\fP or \fB\-M PL:.,AD:.\fP. +Note that if the unobserved allele is explicitly present as \fI<*>\fP or \fI\fP, then its corresponding +value will be used regardless of \fB\-M\fP settings. +.RE +.sp \fB\-\-no\-index\fP .RS 4 the option allows to merge files without indexing them first. 
In order for this @@ -2876,6 +3016,11 @@ see \fBCommon Options\fP .RS 4 see \fBCommon Options\fP .RE +.sp +\fB\-\-write\-index\fP +.RS 4 +Automatically index the output file +.RE .SS "bcftools mpileup [\fIOPTIONS\fP] \fB\-f\fP \fIref.fa\fP \fIin.bam\fP [\fIin2.bam\fP [...]]" .sp Generate VCF or BCF containing genotype likelihoods for one or multiple @@ -3209,6 +3354,11 @@ BQB. .fi .if n .RE .RE +.sp +\fB\-\-write\-index\fP +.RS 4 +Automatically index the output file +.RE .SS "Options for SNP/INDEL genotype likelihood computation" .sp \fB\-X, \-\-config\fP \fISTR\fP @@ -3431,6 +3581,13 @@ try to proceed with \fB\-m\-\fP even if malformed tags with incorrect number of are encountered, discarding such tags. (Experimental, use at your own risk.) .RE .sp +\fB\-g, \-\-gff\-annot\fP \fIFILE\fP +.RS 4 +when a GFF file is provided, follow HGVS 3\(cqrule and right\-align variants in transcripts on the forward +strand. In case of overlapping transcripts, the default mode is to left\-align the variant. For a +description of the supported GFF3 file format see \fBbcftools csq\fP. +.RE +.sp \fB\-\-keep\-sum\fP \fITAG\fP[,...] .RS 4 keep vector sum constant when splitting multiallelic sites. Only AD tag @@ -3528,6 +3685,11 @@ see \fBCommon Options\fP maximum distance between two records to consider when locally sorting variants which changed position during the realignment .RE +.sp +\fB\-\-write\-index\fP +.RS 4 +Automatically index the output file +.RE .SS "bcftools [plugin \fINAME\fP|+\fINAME\fP] \fI[OPTIONS]\fP \fIFILE\fP \(em \fI[PLUGIN OPTIONS]\fP" .sp A common framework for various utilities. 
The plugins can be used @@ -3601,6 +3763,11 @@ see \fBCommon Options\fP .RS 4 see \fBCommon Options\fP .RE +.sp +\fB\-\-write\-index\fP +.RS 4 +Automatically index the output file +.RE .SS "Plugin options:" .sp \fB\-h, \-\-help\fP @@ -4723,7 +4890,13 @@ see \fBCommon Options\fP .sp \fB\-T, \-\-temp\-dir\fP \fIDIR\fP .RS 4 -Use this directory to store temporary files +Use this directory to store temporary files. If the last six characters of the string DIR are XXXXXX, +then these are replaced with a string that makes the directory name unique. +.RE +.sp +\fB\-\-write\-index\fP +.RS 4 +Automatically index the output file .RE .SS "bcftools stats [\fIOPTIONS\fP] \fIA.vcf.gz\fP [\fIB.vcf.gz\fP]" .sp @@ -4943,6 +5116,11 @@ see \fBCommon Options\fP .RS 4 see \fBCommon Options\fP .RE +.sp +\fB\-\-write\-index\fP +.RS 4 +Automatically index the output file +.RE .SS "Subset options:" .sp \fB\-a, \-\-trim\-alt\-alleles\fP @@ -5137,7 +5315,7 @@ important libraries used by bcftools. .SS "bcftools [\fI\-\-version\-only\fP]" .sp Display the full bcftools version number in a machine\-readable format. -.SH "EXPRESSIONS" +.SH "FILTERING EXPRESSIONS" .sp These filtering expressions are accepted by most of the commands. .sp @@ -5919,7 +6097,18 @@ bcftools view \-i \*(Aq%ID!="." & MAF[0]<0.01\*(Aq .if n .RE .sp Please refer to the documentation of your shell for details. -.SH "SCRIPTS AND OPTIONS" +.SH "SCRIPTS" +.SS "gff2gff" +.sp +Attempts to fix a GFF file to be correctly parsed by \fBcsq\fP. +.sp +.if n .RS 4 +.nf +.fam C +zcat in.gff.gz | gff2gff | gzip \-c > out.gff.gz +.fam +.fi +.if n .RE .SS "plot\-vcfstats [\fIOPTIONS\fP] \fIfile.vchk\fP [...]" .sp Script for processing output of \fBbcftools stats\fP. It can merge @@ -6013,8 +6202,10 @@ Please report any bugs you encounter on the github website: \c .sp Heng Li from the Sanger Institute wrote the original C version of htslib, samtools and bcftools. Bob Handsaker from the Broad Institute implemented the -BGZF library. 
Petr Danecek, Shane McCarthy and John Marshall are maintaining -and further developing bcftools. Many other people contributed to the program +BGZF library. Petr Danecek is maintaining and further developing bcftools, together +with the rest of the \c +.URL "https://www.sanger.ac.uk/tool/samtools\-bcftools\-htslib" "samtools team" "." +Many other people contributed to the program and to the file format specifications, both directly and indirectly by providing patches, testing and reporting bugs. We thank them all. .SH "RESOURCES" diff --git a/doc/bcftools.html b/doc/bcftools.html index 5a4f5ae51..0b4baab9e 100644 --- a/doc/bcftools.html +++ b/doc/bcftools.html @@ -50,13 +50,13 @@

DESCRIPTION

VERSION

-

This manual page was last updated 2023-02-21 and refers to bcftools git version 1.17.

+

This manual page was last updated 2023-07-25 and refers to bcftools git version 1.18.

BCF1

-

The BCF1 format output by versions of samtools <= 0.1.19 is not +

The obsolete BCF1 format output by versions of samtools <= 0.1.19 is not compatible with this version of bcftools. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions <= 0.1.19 to convert to VCF, which can then be read by @@ -79,6 +79,12 @@

VARIANT CALLING

is recommended for most tasks.

+
+

FILTERING EXPRESSIONS

+
+

See EXPRESSIONS

+
+
@@ -172,6 +178,9 @@

LIST OF SCRIPTS

@@ -417,6 +426,10 @@

Common Options

Use multithreading with INT worker threads. The option is currently used only for the compression of the output stream, only when --output-type is b or z. Default: 0.

+
--write-index
+
+

Automatically index the output files. Can be used only for compressed BCF and VCF output.

+
@@ -668,6 +681,10 @@

bcftools annotate [OPTIONS] FILE

"^INFO/FOO,INFO/BAR" (and similarly for FORMAT and FILTER). "INFO" can be abbreviated to "INF" and "FORMAT" to "FMT".

+
--write-index
+
+

Automatically index the output file

+
@@ -797,6 +814,10 @@

File format options:

see Common Options

+
--write-index
+
+

Automatically index the output file

+
@@ -1161,6 +1182,10 @@

bcftools concat [OPTIONS] FILE1 FILE2

see Common Options

+
--write-index
+
+

Automatically index the output file

+
@@ -1194,18 +1219,14 @@

bcftools consensus [OPTIONS] FILE

reference sequence in fasta format

-
-H, --haplotype 1|2|R|A|I|LR|LA|SR|SA|1pIu|2pIu
+
-H, --haplotype N|R|A|I|LR|LA|SR|SA|NpIu

choose which allele from the FORMAT/GT field to use (the codes are case-insensitive):

-
1
-
-

the first allele, regardless of phasing

-
-
2
+
N
-

the second allele, regardless of phasing

+

N={1,2,3,…​}, the allele index within the genotype, regardless of phasing

R
@@ -1227,18 +1248,21 @@

bcftools consensus [OPTIONS] FILE

the shorter allele. If both have the same length, use the REF allele (SR), or the ALT allele (SA)

-
1pIu, 2pIu
+
NpIu
-

first/second allele for phased genotypes and IUPAC code for unphased genotypes

-
-
-
This option requires *-s*, unless exactly one sample is present in the VCF
-
-
+

N={1,2,3,…​}, the allele index within genotype for phased genotypes and IUPAC code for unphased genotypes. +For example, 1pIu or 2pIu

+ + +
+

Note that the -H, --haplotype option requires the -s, --samples option, unless exactly one sample is present in the VCF

+
+
+
-i, --include EXPRESSION

include only sites for which EXPRESSION is true. For valid expressions see @@ -1247,21 +1271,21 @@

bcftools consensus [OPTIONS] FILE

-I, --iupac-codes

output variants in the form of IUPAC ambiguity codes determined from FORMAT/GT fields. By default all -samples are used and can be subset with -s, --samples and -S, --samples-file. Use -s - to ignore +samples are used and can be subset with -s, --samples and -S, --samples-file. Use -s - to ignore samples and use only the REF and ALT columns. NOTE: prior to version 1.17 the IUPAC codes were determined solely from REF,ALT columns and sample genotypes were not considered.

--mark-del CHAR
-

instead of removing sequence, insert CHAR for deletions

+

instead of removing sequence, insert character CHAR for deletions

-
--mark-ins uc|lc
+
--mark-ins uc|lc|CHAR
-

highlight inserted sequence in uppercase (uc) or lowercase (lc), leaving the rest of the sequence as is

+

highlight inserted sequence in uppercase (uc), lowercase (lc), or a provided character CHAR, leaving the rest of the sequence as is

--mark-snv uc|lc
-

highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest of the sequence as is

+

highlight substitutions in uppercase (uc), lowercase (lc), or a provided character CHAR, leaving the rest of the sequence as is

-m, --mask FILE
@@ -1284,11 +1308,11 @@

bcftools consensus [OPTIONS] FILE

-s, --samples LIST
-

apply variants of the listed samples. See also the option -I, --iupac-codes

+

apply variants of the listed samples. See also the option -I, --iupac-codes

-S, --samples-file FILE
-

apply variants of the samples listed in the file. See also the option -I, --iupac-codes

+

apply variants of the samples listed in the file. See also the option -I, --iupac-codes

@@ -1307,6 +1331,27 @@

bcftools consensus [OPTIONS] FILE

# For more examples see http://samtools.github.io/bcftools/howtos/consensus-sequence.html +
+
+
Notes:
+
+

Masking options are applied in the following order

+
+
    +
  1. +

    mask regions with --mask-with character if --mask is given. All overlapping VCF variants are ignored

    +
  2. +
  3. +

    replace sequence not mentioned in the VCF with the requested character if --absent is given

    +
  4. +
  5. +

    finally apply --mark-del, --mark-ins, --mark-snv masks

    +
  6. +
+
+
+
+

bcftools convert [OPTIONS] FILE

@@ -1356,6 +1401,10 @@

VCF input options:

see Common Options

+
--write-index
+
+

Automatically index the output file

+
@@ -1637,13 +1686,13 @@

TSV conversion:

-c, --columns list

comma-separated list of fields in the input file. In the current -version, the fields CHROM, POS, ID, and AA are expected and -can appear in arbitrary order, columns which should be ignored in the input +version, the fields CHROM, POS, ID, and AA or REF, ALT are expected and +can appear in arbitrary order. Columns which should be ignored in the input file can be indicated by "-". The AA field lists alleles on the forward reference strand, for example "CC" or "CT" for diploid genotypes or "C" for haploid genotypes (sex chromosomes). Insertions and deletions -are not supported yet, missing data can be indicated with "--".

+are supported only with REF and ALT but not with AA. Missing data can be indicated with "--" or ".".

-f, --fasta-ref file
@@ -1665,7 +1714,10 @@

TSV conversion:

# Convert 23andme results into VCF
-bcftools convert -c ID,CHROM,POS,AA -s SampleName -f 23andme-ref.fa --tsv2vcf 23andme.txt -Oz -o out.vcf.gz
+bcftools convert -c ID,CHROM,POS,AA -s SampleName -f 23andme-ref.fa --tsv2vcf 23andme.txt -o out.vcf.gz + +# Convert tab-delimited file into a sites-only VCF (no genotypes), in this example first column to be ignored +bcftools convert -c -,CHROM,POS,REF,ALT -f ref.fa --tsv2vcf calls.txt -o out.bcf
@@ -1721,6 +1773,11 @@

bcftools csq [OPTIONS] FILE

aminoacids, with -B 1 only an abbreviated version such as 25E..329>25G..94 will be written.

+
--dump-gff FILE
+
+

dump the parsed GFF into a gzipped FILE. Intended for debugging purposes, +shows how the input GFF is viewed by the program.

+
-e, --exclude EXPRESSION

exclude sites for which EXPRESSION is true. For valid expressions see @@ -1738,6 +1795,7 @@

bcftools csq [OPTIONS] FILE

-g, --gff-annot FILE

GFF3 annotation file (required), such as ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens. +The script gff2gff can help with conversion from non-standard GFF formats. An example of a minimal working GFF file:

@@ -1749,6 +1807,17 @@

bcftools csq [OPTIONS] FILE

# the gene (determined from the transcript's "Parent=gene:" attribute), and the biotype # (the most interesting is "protein_coding"). # + # Empty and commented lines are skipped, the following GFF columns are required + # 1. chromosome + # 2. IGNORED + # 3. type (CDS, exon, three_prime_UTR, five_prime_UTR, gene, transcript, etc.) + # 4. start of the feature (1-based) + # 5. end of the feature (1-based) + # 6. IGNORED + # 7. strand (+ or -) + # 8. phase (0, 1, 2 or .) + # 9. semicolon-separated attributes (see below) + # # Attributes required for # gene lines: # - ID=gene:<gene_id> @@ -1871,6 +1940,16 @@

bcftools csq [OPTIONS] FILE

see Common Options

+
--unify-chr-names 0|1
+
+

Automatically detect and unify chromosome naming conventions in the GFF, fasta +and VCF, such as "chrX" vs "X". The chromosome names in the output VCF will match +those of the input VCF. The default is to attempt the automatic translation.

+
+
--write-index
+
+

Automatically index the output file

+
@@ -2084,6 +2163,10 @@

bcftools filter [OPTIONS] FILE

see Common Options

+
--write-index
+
+

Automatically index the output file

+
@@ -2095,6 +2178,11 @@

bcftools gtcheck [OPTIONS] [-g ge is checked against the samples in the -g file. Without the -g option, multi-sample cross-check of samples in query.vcf.gz is performed.

+
+

Note that the interpretation of the discordance score depends on the options provided (specifically -e and +-u) and on the available annotations (FORMAT/PL vs FORMAT/GT). +The discordance score can be interpreted as the number of mismatching genotypes if only GT-vs-GT matching is performed.

+
--distinctive-sites NUM[,MEM[,DIR]]
@@ -2113,11 +2201,14 @@

bcftools gtcheck [OPTIONS] [-g ge

Interpret genotypes and genotype likelihoods probabilistically. The value of INT represents genotype quality when GT tag is used (e.g. Q=30 represents one error in 1,000 genotypes and Q=40 one error in 10,000 genotypes) and is ignored when PL tag is used (in that case an arbitrary -non-zero integer can be provided). See also the -u, --use option below. If set to 0, -the discordance equals to the number of mismatching genotypes when GT vs GT is compared. -Note that the values with and without -e are not comparable, only values generated -with -e 0 correspond to mismatching genotypes. -If performance is an issue, set to 0 for faster run but less accurate results.

+non-zero integer can be provided). + 

+If -e is set to 0, the discordance score can be interpreted as the number of mismatching genotypes, +but only in the GT-vs-GT matching mode. See the -u, --use option below for additional notes and caveats. + 

+If performance is an issue, set -e 0 for faster run times but less accurate results.

-g, --genotypes FILE
@@ -2191,8 +2282,13 @@

bcftools gtcheck [OPTIONS] [-g ge
-u, --use TAG1[,TAG2]

specifies which tag to use in the query file (TAG1) and the -g (TAG2) file. -By default, the PL tag is used in the query file and GT in the -g file when -available.

+By default, the PL tag is used in the query file and, when available, the GT tags in the +-g file. + 

Note that when the requested tag is not available, the program will attempt to use +the other tag. The output includes the number of sites that were matched in each of the four +possible modes (for example GT-vs-GT or GT-vs-PL).

@@ -2394,6 +2490,10 @@

bcftools isec [OPTIONS] A.vcf.gz B.vcf.gzlist of input files to output given as 1-based indices. With -p and no -w, all files are written.

+
--write-index
+
+

Automatically index the output file. This is done automatically with the -p option.

+
@@ -2497,7 +2597,8 @@

bcftools merge [OPTIONS] A.vcf.gz B.vcf.gz< file FILE is not given and the dash (-) is given, unknown reference bases generated at gVCF block splits will be substituted with N’s. The --gvcf option uses the following default INFO rules: --i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max.

+-i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max and the following missing +rules: -M PL:max,AD:0.

-i, --info-rules -|TAG:METHOD[,…​]
@@ -2543,6 +2644,16 @@

bcftools merge [OPTIONS] A.vcf.gz B.vcf.gz<

+
-M, --missing-rules -|TAG:METHOD[,…​]
+
+

Rules for merging vector tags at multiallelic sites. When input files have different alternate +alleles, vector fields pertaining to unobserved alleles are set to missing (.) by default. +The METHOD is one of . (the default, use missing values), NUMBER (use a constant value, e.g. 0), +max (the maximum value observed for other alleles in the sample). When the --gvcf option is set, +the rule -M PL:max,AD:0 is implied. This can be overridden by providing -M - or -M PL:.,AD:.. +Note that if the unobserved allele is explicitly present as <*> or <NON_REF>, then its corresponding +value will be used regardless of -M settings.

+
--no-index

the option allows to merge files without indexing them first. In order for this @@ -2577,6 +2688,10 @@

bcftools merge [OPTIONS] A.vcf.gz B.vcf.gz<

see Common Options

+
--write-index
+
+

Automatically index the output file

+

@@ -2889,6 +3004,10 @@

Output options

+
--write-index
+
+

Automatically index the output file

+
@@ -3099,6 +3218,12 @@

bcftools norm [OPTIONS] file.vcf.gz

try to proceed with -m- even if malformed tags with incorrect number of fields are encountered, discarding such tags. (Experimental, use at your own risk.)

+
-g, --gff-annot FILE
+
+

when a GFF file is provided, follow the HGVS 3’ rule and right-align variants in transcripts on the forward +strand. In case of overlapping transcripts, the default mode is to left-align the variant. For a +description of the supported GFF3 file format see bcftools csq.

+
--keep-sum TAG[,…​]

keep vector sum constant when splitting multiallelic sites. Only AD tag @@ -3179,6 +3304,10 @@

bcftools norm [OPTIONS] file.vcf.gz

maximum distance between two records to consider when locally sorting variants which changed position during the realignment

+
--write-index
+
+

Automatically index the output file

+
@@ -3254,6 +3383,10 @@

VCF output options:

see Common Options

+
--write-index
+
+

Automatically index the output file

+
@@ -4134,7 +4267,12 @@

bcftools sort [OPTIONS] file.bcf

-T, --temp-dir DIR
-

Use this directory to store temporary files

+

Use this directory to store temporary files. If the last six characters of the string DIR are XXXXXX, +then these are replaced with a string that makes the directory name unique.

+
+
--write-index
+
+

Automatically index the output file

@@ -4339,6 +4477,10 @@

Output options

see Common Options

+
--write-index
+
+

Automatically index the output file

+
@@ -4538,7 +4680,7 @@

bcftools [--version-only]

-

EXPRESSIONS

+

FILTERING EXPRESSIONS

These filtering expressions are accepted by most of the commands.

@@ -4974,9 +5116,24 @@

EXPRESSIONS

-

SCRIPTS AND OPTIONS

+

SCRIPTS

+

gff2gff

+
+

Attempts to fix a GFF file to be correctly parsed by csq.

+
+
+
+
+
+
zcat in.gff.gz | gff2gff | gzip -c > out.gff.gz
+
+
+
+
+
+

plot-vcfstats [OPTIONS] file.vchk […​]

Script for processing output of bcftools stats. It can merge @@ -5077,8 +5234,9 @@

AUTHORS

Heng Li from the Sanger Institute wrote the original C version of htslib, samtools and bcftools. Bob Handsaker from the Broad Institute implemented the -BGZF library. Petr Danecek, Shane McCarthy and John Marshall are maintaining -and further developing bcftools. Many other people contributed to the program +BGZF library. Petr Danecek is maintaining and further developing bcftools, together +with the rest of the samtools team. +Many other people contributed to the program and to the file format specifications, both directly and indirectly by providing patches, testing and reporting bugs. We thank them all.

@@ -5119,7 +5277,7 @@

COPYING

diff --git a/doc/bcftools.txt b/doc/bcftools.txt index b1a5f07c4..b2dcaf6c2 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -52,7 +52,7 @@ commands can thus be combined with Unix pipes. This manual page was last updated *{date}* and refers to bcftools git version *{version}*. === BCF1 -The BCF1 format output by versions of samtools \<= 0.1.19 is *not* +The obsolete BCF1 format output by versions of samtools \<= 0.1.19 is *not* compatible with this version of bcftools. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions \<= 0.1.19 to convert to VCF, which can then be read by @@ -71,6 +71,10 @@ calling model ('-m/--multiallelic-caller'). The multiallelic calling model is recommended for most tasks. +=== FILTERING EXPRESSIONS +See *<>* + + LIST OF COMMANDS ---------------- For a full list of available commands, run *bcftools* without arguments. For a full @@ -105,6 +109,7 @@ LIST OF SCRIPTS --------------- Some helper scripts are bundled with the bcftools code. +- *<>* .. converts a GFF file to the format required by *<>* - *<>* .. plots the output of *<>* @@ -298,6 +303,9 @@ Such a file can be easily created from a VCF using: Use multithreading with 'INT' worker threads. The option is currently used only for the compression of the output stream, only when '--output-type' is 'b' or 'z'. Default: 0. +*--write-index*:: + Automatically index the output files. Can be used only for compressed BCF and VCF output. + [[annotate]] === bcftools annotate '[OPTIONS]' 'FILE' @@ -501,6 +509,9 @@ Add or remove annotations. "^INFO/FOO,INFO/BAR" (and similarly for FORMAT and FILTER). "INFO" can be abbreviated to "INF" and "FORMAT" to "FMT". +*--write-index*:: + Automatically index the output file + *Examples:* ---- # Remove three fields @@ -604,6 +615,9 @@ demand. The original calling model can be invoked with the *-c* option. 
*--threads* 'INT':: see *<>* +*--write-index*:: + Automatically index the output file + ==== Input/output options: *-A, --keep-alts*:: @@ -878,6 +892,9 @@ are concatenated without being recompressed, which is very fast.. *--threads* 'INT':: see *<>* +*--write-index*:: + Automatically index the output file + [[consensus]] === bcftools consensus '[OPTIONS]' 'FILE' @@ -902,14 +919,11 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the *-f, --fasta-ref* 'FILE':: reference sequence in fasta format -*-H, --haplotype* '1'|'2'|'R'|'A'|'I'|'LR'|'LA'|'SR'|'SA'|'1pIu'|'2pIu':: +*-H, --haplotype* N|'R'|'A'|'I'|'LR'|'LA'|'SR'|'SA'|'NpIu':: choose which allele from the FORMAT/GT field to use (the codes are case-insensitive): - '1';; - the first allele, regardless of phasing - - '2';; - the second allele, regardless of phasing + 'N';; + N={1,2,3,...}, the allele index within the genotype, regardless of phasing 'R';; the REF allele (in heterozygous genotypes) @@ -926,10 +940,11 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the 'SR, SA';; the shorter allele. If both have the same length, use the REF allele (SR), or the ALT allele (SA) - '1pIu, 2pIu';; - first/second allele for phased genotypes and IUPAC code for unphased genotypes + 'NpIu';; + N={1,2,3,...}, the allele index within genotype for phased genotypes and IUPAC code for unphased genotypes. + For example, '1pIu' or '2pIu' - This option requires *-s*, unless exactly one sample is present in the VCF +Note that the *-H, --haplotype* option requires the *-s, --samples* option, unless exactly one sample is present in the VCF *-i, --include* 'EXPRESSION':: include only sites for which 'EXPRESSION' is true. For valid expressions see @@ -937,18 +952,18 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the *-I, --iupac-codes*:: output variants in the form of IUPAC ambiguity codes determined from FORMAT/GT fields. 
By default all - samples are used and can be subset with `-s, --samples` and `-S, --samples-file`. Use `-s -` to ignore + samples are used and can be subset with *-s, --samples* and *-S, --samples-file*. Use *-s -* to ignore samples and use only the REF and ALT columns. NOTE: prior to version 1.17 the IUPAC codes were determined solely from REF,ALT columns and sample genotypes were not considered. *--mark-del* 'CHAR':: - instead of removing sequence, insert CHAR for deletions + instead of removing sequence, insert character CHAR for deletions -*--mark-ins* 'uc'|'lc':: - highlight inserted sequence in uppercase (uc) or lowercase (lc), leaving the rest of the sequence as is +*--mark-ins* 'uc'|'lc'|'CHAR':: + highlight inserted sequence in uppercase (uc), lowercase (lc), or a provided character CHAR, leaving the rest of the sequence as is *--mark-snv* 'uc'|'lc':: - highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest of the sequence as is + highlight substitutions in uppercase (uc), lowercase (lc), or a provided character CHAR, leaving the rest of the sequence as is *-m, --mask* 'FILE':: BED file or TAB file with regions to be replaced with N (the default) or as specified by @@ -966,10 +981,10 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the write output to a file *-s, --samples* 'LIST':: - apply variants of the listed samples. See also the option `-I, --iupac-codes` + apply variants of the listed samples. See also the option *-I, --iupac-codes* *-S, --samples-file* 'FILE':: - apply variants of the samples listed in the file. See also the option `-I, --iupac-codes` + apply variants of the samples listed in the file. See also the option *-I, --iupac-codes* *Examples:* ---- @@ -983,6 +998,14 @@ depth information, such as INFO/AD or FORMAT/AD. 
For that, consider using the # For more examples see http://samtools.github.io/bcftools/howtos/consensus-sequence.html ---- +*Notes:*:: + Masking options are applied in the following order + 1. mask regions with *--mask-with* character if *--mask* is given. All overlapping VCF variants are ignored + 2. replace sequence not mentioned in the VCF with the requested character if *--absent* is given + 3. finally apply *--mark-del*, *--mark-ins*, *--mark-snv* masks + + + [[convert]] === bcftools convert '[OPTIONS]' 'FILE' @@ -1021,6 +1044,9 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the *--targets-overlap* '0'|'1'|'2':: see *<>* +*--write-index*:: + Automatically index the output file + ==== VCF output options: *--no-version*:: @@ -1210,13 +1236,13 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the *-c, --columns* 'list':: comma-separated list of fields in the input file. In the current - version, the fields CHROM, POS, ID, and AA are expected and - can appear in arbitrary order, columns which should be ignored in the input + version, the fields CHROM, POS, ID, and AA or REF, ALT are expected and + can appear in arbitrary order. Columns which should be ignored in the input file can be indicated by "-". The AA field lists alleles on the forward reference strand, for example "CC" or "CT" for diploid genotypes or "C" for haploid genotypes (sex chromosomes). Insertions and deletions - are not supported yet, missing data can be indicated with "--". + are supported only with REF and ALT but not with AA. Missing data can be indicated with "--" or ".". *-f, --fasta-ref* 'file':: reference sequence in fasta format. Must be indexed with samtools faidx @@ -1230,7 +1256,10 @@ depth information, such as INFO/AD or FORMAT/AD. 
For that, consider using the *Example:* ---- # Convert 23andme results into VCF -bcftools convert -c ID,CHROM,POS,AA -s SampleName -f 23andme-ref.fa --tsv2vcf 23andme.txt -Oz -o out.vcf.gz +bcftools convert -c ID,CHROM,POS,AA -s SampleName -f 23andme-ref.fa --tsv2vcf 23andme.txt -o out.vcf.gz + +# Convert tab-delimited file into a sites-only VCF (no genotypes); in this example the first column is ignored +bcftools convert -c -,CHROM,POS,REF,ALT -f ref.fa --tsv2vcf calls.txt -o out.bcf ---- @@ -1277,6 +1306,10 @@ output VCF and are ignored for the prediction analysis. aminoacids, with *-B 1* only an abbreviated version such as '25E..329>25G..94' will be written. +*--dump-gff* 'FILE':: + dump the parsed GFF into a gzipped FILE. Intended for debugging purposes, + shows how the input GFF is viewed by the program. + *-e, --exclude* 'EXPRESSION':: exclude sites for which 'EXPRESSION' is true. For valid expressions see *<>*. @@ -1290,6 +1323,7 @@ output VCF and are ignored for the prediction analysis. *-g, --gff-annot* 'FILE':: GFF3 annotation file (required), such as ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens. + The script *<<gff2gff,gff2gff>>* can help with conversion from non-standard GFF formats. An example of a minimal working GFF file: ---- # The program looks for "CDS", "exon", "three_prime_UTR" and "five_prime_UTR" lines, @@ -1297,6 +1331,17 @@ output VCF and are ignored for the prediction analysis. # the gene (determined from the transcript's "Parent=gene:" attribute), and the biotype # (the most interesting is "protein_coding"). # + # Empty and commented lines are skipped, the following GFF columns are required + # 1. chromosome + # 2. IGNORED + # 3. type (CDS, exon, three_prime_UTR, five_prime_UTR, gene, transcript, etc.) + # 4. start of the feature (1-based) + # 5. end of the feature (1-based) + # 6. IGNORED + # 7. strand (+ or -) + # 8. phase (0, 1, 2 or .) + # 9.
semicolon-separated attributes (see below) + # # Attributes required for # gene lines: # - ID=gene: @@ -1392,6 +1437,14 @@ output VCF and are ignored for the prediction analysis. *--targets-overlap* '0'|'1'|'2':: see *<>* +*--unify-chr-names* '0'|'1':: + Automatically detect and unify chromosome naming conventions in the GFF, fasta + and VCF, such as "chrX" vs "X". The chromosome names in the output VCF will match + those of the input VCF. The default is to attempt the automatic translation. + +*--write-index*:: + Automatically index the output file + *Examples:* ---- # Basic usage @@ -1559,6 +1612,9 @@ And similarly here, the second is filtered: *--threads* 'INT':: see *<>* +*--write-index*:: + Automatically index the output file + [[gtcheck]] @@ -1568,6 +1624,10 @@ option is given, the identity of samples from 'query.vcf.gz' is checked against the samples in the *-g* file. Without the *-g* option, multi-sample cross-check of samples in 'query.vcf.gz' is performed. +Note that the interpretation of the discordance score depends on the options provided (specifically *-e* and +*-u*) and on the available annotations (FORMAT/PL vs FORMAT/GT). +The discordance score can be interpreted as the number of mismatching genotypes if only GT-vs-GT matching is performed. + *--distinctive-sites* 'NUM[,MEM[,DIR]]':: Find sites that can distinguish between at least NUM sample pairs. If the number is smaller or equal to 1, it is interpreted as the fraction of pairs. The optional MEM string sets the maximum memory used for @@ -1581,11 +1641,14 @@ Without the *-g* option, multi-sample cross-check of samples in 'query.vcf.gz' i Interpret genotypes and genotype likelihoods probabilistically. The value of 'INT' represents genotype quality when GT tag is used (e.g. Q=30 represents one error in 1,000 genotypes and Q=40 one error in 10,000 genotypes) and is ignored when PL tag is used (in that case an arbitrary - non-zero integer can be provided). See also the *-u, --use* option below.
If set to 0, - the discordance equals to the number of mismatching genotypes when GT vs GT is compared. - Note that the values with and without *-e* are not comparable, only values generated - with *-e 0* correspond to mismatching genotypes. - If performance is an issue, set to 0 for faster run but less accurate results. + non-zero integer can be provided). + {nbsp} + + {nbsp} + + If *-e* is set to 0, the discordance score can be interpreted as the number of mismatching genotypes, + but only in the GT-vs-GT matching mode. See the *-u, --use* option below for additional notes and caveats. + {nbsp} + + {nbsp} + + If performance is an issue, set *-e 0* for faster run times but less accurate results. *-g, --genotypes* 'FILE':: VCF/BCF file with reference genotypes to compare against @@ -1640,8 +1703,13 @@ Without the *-g* option, multi-sample cross-check of samples in 'query.vcf.gz' i *-u, --use* 'TAG1'[,'TAG2']:: specifies which tag to use in the query file ('TAG1') and the *-g* ('TAG2') file. - By default, the PL tag is used in the query file and GT in the *-g* file when - available. + By default, the PL tag is used in the query file and, when available, the GT tag in the + *-g* file. + {nbsp} + + {nbsp} + + Note that when the requested tag is not available, the program will attempt to use + the other tag. The output includes the number of sites that were matched by the four + possible modes (for example GT-vs-GT or GT-vs-PL). *Examples:* ---- @@ -1815,6 +1883,9 @@ in the other files. list of input files to output given as 1-based indices. With *-p* and no *-w*, all files are written. +*--write-index*:: + Automatically index the output file. This is done automatically with the *-p* option. + ==== Examples: Create intersection and complements of two sets saving the output in dir/* @@ -1889,7 +1960,8 @@ For "vertical" merge take a look at *<>* or *<>* or *<' or '', then its corresponding + value will be used regardless of *-M* settings.
+ *--no-index*:: the option allows to merge files without indexing them first. In order for this option to work, the user must ensure that the input files have chromosomes in @@ -1951,6 +2032,8 @@ For "vertical" merge take a look at *<>* or *<>* +*--write-index*:: + Automatically index the output file [[mpileup]] === bcftools mpileup ['OPTIONS'] *-f* 'ref.fa' 'in.bam' ['in2.bam' [...]] @@ -2199,6 +2282,9 @@ INFO/DPR .. Deprecated in favor of INFO/AD; Number of high-quality bases for used by the earlier Bcftools releases. For excample BQBZ becomes BQB. +*--write-index*:: + Automatically index the output file + ==== Options for SNP/INDEL genotype likelihood computation *-X, --config* 'STR':: @@ -2365,6 +2451,11 @@ the *<>* option is supplied. try to proceed with *-m-* even if malformed tags with incorrect number of fields are encountered, discarding such tags. (Experimental, use at your own risk.) +*-g, --gff-annot* 'FILE':: + when a GFF file is provided, follow HGVS 3'rule and right-align variants in transcripts on the forward + strand. In case of overlapping transcripts, the default mode is to left-align the variant. For a + description of the supported GFF3 file format see *<>*. + *--keep-sum* 'TAG'[,...]:: keep vector sum constant when splitting multiallelic sites. Only AD tag is currently supported. See also https://github.com/samtools/bcftools/issues/360 @@ -2428,6 +2519,8 @@ the *<>* option is supplied. maximum distance between two records to consider when locally sorting variants which changed position during the realignment +*--write-index*:: + Automatically index the output file [[plugin]] @@ -2485,6 +2578,9 @@ the usage examples that each plugin comes with. *--threads* 'INT':: see *<>* +*--write-index*:: + Automatically index the output file + ==== Plugin options: *-h, --help*:: @@ -3103,7 +3199,11 @@ Transition probabilities: see *<>* *-T, --temp-dir* 'DIR':: - Use this directory to store temporary files + Use this directory to store temporary files. 
If the last six characters of the string DIR are XXXXXX, + then these are replaced with a string that makes the directory name unique. + +*--write-index*:: + Automatically index the output file @@ -3252,6 +3352,9 @@ Convert between VCF and BCF. Former *bcftools subset*. *--threads* 'INT':: see *<>* +*--write-index*:: + Automatically index the output file + ==== Subset options: *-a, --trim-alt-alleles*:: @@ -3403,8 +3506,8 @@ Display the full bcftools version number in a machine-readable format. [[expressions]] -EXPRESSIONS ------------ +FILTERING EXPRESSIONS +--------------------- These filtering expressions are accepted by most of the commands. @@ -3662,8 +3765,17 @@ that the whole expression is passed to the program as intended: Please refer to the documentation of your shell for details. -SCRIPTS AND OPTIONS -------------------- +SCRIPTS +------- + +[[gff2gff]] +=== gff2gff +Attempts to fix a GFF file to be correctly parsed by *<>*. + +-- + zcat in.gff.gz | gff2gff | gzip -c > out.gff.gz +-- + [[plot-vcfstats]] === plot-vcfstats ['OPTIONS'] 'file.vchk' [...] @@ -3729,8 +3841,9 @@ AUTHORS ------- Heng Li from the Sanger Institute wrote the original C version of htslib, samtools and bcftools. Bob Handsaker from the Broad Institute implemented the -BGZF library. Petr Danecek, Shane McCarthy and John Marshall are maintaining -and further developing bcftools. Many other people contributed to the program +BGZF library. Petr Danecek is maintaining and further developing bcftools, together +with the rest of the https://www.sanger.ac.uk/tool/samtools-bcftools-htslib[samtools team]. +Many other people contributed to the program and to the file format specifications, both directly and indirectly by providing patches, testing and reporting bugs. We thank them all. 
diff --git a/filter.c b/filter.c index 3925475b7..b6547f81f 100644 --- a/filter.c +++ b/filter.c @@ -109,8 +109,8 @@ struct _filter_t #if ENABLE_PERL_FILTERS PerlInterpreter *perl; #endif - char **undef_tag; - int nundef_tag; + char **undef_tag, **used_tag; + int nundef_tag, nused_tag; int status, exit_on_error; }; @@ -328,6 +328,32 @@ const char **filter_list_undef_tags(filter_t *filter, int *ntags) *ntags = filter->nundef_tag; return (const char**)filter->undef_tag; } +static void filter_add_used_tag(filter_t *filter, const char *prefix, char *str) +{ + int i; + kstring_t tmp = {0,0,0}; + if ( prefix ) kputs(prefix,&tmp); + kputs(str,&tmp); + for (i=0; inused_tag; i++) + if ( !strcmp(tmp.s,filter->used_tag[i]) ) break; + if ( inused_tag ) + { + free(tmp.s); + return; + } + + filter->nused_tag++; + filter->used_tag = (char**)realloc(filter->used_tag,sizeof(*filter->used_tag)*filter->nused_tag); + if ( !filter->used_tag ) error("Could not allocate memory\n"); + filter->used_tag[filter->nused_tag-1] = tmp.s; + if ( !filter->used_tag[filter->nused_tag-1] ) error("Could not allocate memory\n"); +} +const char **filter_list_used_tags(filter_t *filter, int *ntags) +{ + *ntags = filter->nused_tag; + return (const char**)filter->used_tag; +} + /* @@ -2841,6 +2867,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) { tok->setter = filters_set_qual; tok->tag = strdup("QUAL"); + filter_add_used_tag(filter,NULL,tok->tag); return 0; } else if ( !strncasecmp(str,"TYPE",len) || !strncmp(str,"%TYPE",len) /* for backward compatibility */ ) @@ -2855,24 +2882,28 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) tok->tag = strdup("FILTER"); filter->max_unpack |= BCF_UN_FLT; tok->tag_type = BCF_HL_FLT; + filter_add_used_tag(filter,NULL,tok->tag); return 0; } else if ( !strncasecmp(str,"ID",len) || !strncasecmp(str,"%ID",len) /* for backward compatibility */ ) { tok->comparator = filters_cmp_id; tok->tag = strdup("ID"); + 
filter_add_used_tag(filter,NULL,tok->tag); return 0; } else if ( !strncasecmp(str,"CHROM",len) ) { tok->setter = &filters_set_chrom; tok->tag = strdup("CHROM"); + filter_add_used_tag(filter,NULL,tok->tag); return 0; } else if ( !strncasecmp(str,"POS",len) ) { tok->setter = &filters_set_pos; tok->tag = strdup("POS"); + filter_add_used_tag(filter,NULL,tok->tag); return 0; } else if ( !strncasecmp(str,"REF",len) ) @@ -2880,6 +2911,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) tok->setter = &filters_set_ref_string; tok->is_str = 1; tok->tag = strdup("REF"); + filter_add_used_tag(filter,NULL,tok->tag); return 0; } else if ( !strncasecmp(str,"ALT",len) ) @@ -2891,6 +2923,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) tok->idxs[0] = -1; tok->nidxs = 1; tok->idx = -2; + filter_add_used_tag(filter,NULL,tok->tag); return 0; } else if ( !strncasecmp(str,"N_ALT",len) ) @@ -3018,6 +3051,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) } tok->tag = strdup(tmp.s); if ( tmp.s ) free(tmp.s); + filter_add_used_tag(filter,is_fmt ? 
"FORMAT/" : "INFO/",tok->tag); return 0; } else if ( !strcasecmp(tmp.s,"ALT") ) @@ -3026,6 +3060,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) tok->is_str = 1; tok->tag = strdup(tmp.s); free(tmp.s); + filter_add_used_tag(filter,NULL,tok->tag); return 0; } else if ( !strcasecmp(tmp.s,"AN") ) @@ -3669,7 +3704,9 @@ void filter_destroy(filter_t *filter) } } for (i=0; inundef_tag; i++) free(filter->undef_tag[i]); + for (i=0; inused_tag; i++) free(filter->used_tag[i]); free(filter->undef_tag); + free(filter->used_tag); free(filter->cached_GT.buf); free(filter->cached_GT.mask); free(filter->filters); diff --git a/filter.h b/filter.h index 7be842a3a..cc60d6b96 100644 --- a/filter.h +++ b/filter.h @@ -79,5 +79,6 @@ filter_t *filter_parse(bcf_hdr_t *hdr, const char *str); */ int filter_status(filter_t *filter); const char **filter_list_undef_tags(filter_t *filter, int *nundef); +const char **filter_list_used_tags(filter_t *filter, int *nused); #endif diff --git a/gff.c b/gff.c new file mode 100644 index 000000000..90da84ba9 --- /dev/null +++ b/gff.c @@ -0,0 +1,1098 @@ +/* The MIT License + + Copyright (c) 2023 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#include "gff.h" + +/* + Helper structures, only for initialization + + ftr_t + temporary list of all exons, CDS, UTRs +*/ +KHASH_MAP_INIT_INT(int2tscript, gf_tscript_t*) +KHASH_MAP_INIT_INT(int2gene, gf_gene_t*) +typedef struct +{ + int type; // GF_CDS, GF_EXON, GF_5UTR, GF_3UTR + uint32_t beg; + uint32_t end; + uint32_t trid; + uint32_t strand:1; // STRAND_REV,STRAND_FWD + uint32_t phase:2; // 0, 1, 2, or 3 for unknown + uint32_t iseq:29; +} +ftr_t; + +/* + Mapping from GFF ID string (such as ENST00000450305 or Zm00001d027230_P001) + to integer id. To keep the memory requirements low, the original version + relied on IDs in the form of a string prefix and a numerical id. However, + it turns out that this assumption is not valid for some ensembl GFFs, see + for example Zea_mays.AGPv4.36.gff3.gz + */ +typedef struct +{ + void *str2id; // khash_str2int + int nstr, mstr; + char **str; // numeric id to string +} +id_tbl_t; + +typedef struct +{ + // all exons, CDS, UTRs + ftr_t *ftr; + int nftr, mftr; + + // mapping from gene id to gf_gene_t + kh_int2gene_t *gid2gene; + + // mapping from transcript id to tscript, for quick CDS anchoring + kh_int2tscript_t *id2tr; + + // sequences + void *seq2int; // str2int hash + char **seq; + int nseq, mseq; + + // ignored biotypes + void *ignored_biotypes; + + id_tbl_t gene_ids; // temporary table for mapping between gene id (eg. 
Zm00001d027245) and a numeric idx + + // pointers to the current partially processed line + char *id, *id_end, *parent, *parent_end, *biotype, *biotype_end, + *chr, *chr_end, *name, *name_end, *type, *type_end; +} +aux_t; + +struct gff_t_ +{ + const char *fname, *dump_fname; + + // the main regidx lookups, from chr:beg-end to overlapping features and + // index iterator + regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript; + + // temporary structures, deleted after initializtion + aux_t init; + + // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx + id_tbl_t tscript_ids; + + int strip_chr_names, verbosity; + int force; // force run under various conditions. Currently only to skip out-of-phase transcripts + + struct { + int unknown_chr,unknown_tscript_biotype,unknown_strand,unknown_phase,duplicate_id; + int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds; + } warned; +}; + +static const char *gf_strings_noncoding[] = +{ + "MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript", + "antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping", + "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene", + "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene", + "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene", + "transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene", + "translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene", + "LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", "vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf", + "lncRNA" +}; +static const char *gf_strings_coding[] = { "protein_coding", "polymorphic_pseudogene", "IG_C", "IG_D", "IG_J", "IG_LV", 
"IG_V", "TR_C", "TR_D", "TR_J", "TR_V", "NMD", "non_stop_decay"}; +static const char *gf_strings_special[] = { "CDS", "exon", "3_prime_UTR", "5_prime_UTR" }; + +int gff_set(gff_t *gff, gff_opt_t key, ...) +{ + va_list args; + switch (key) + { + case dump_fname: + va_start(args, key); + gff->dump_fname = va_arg(args,char*); + va_end(args); + return 0; + + case force_out_of_phase: + va_start(args, key); + gff->force = va_arg(args,int); + va_end(args); + return 0; + + case strip_chr_names: + va_start(args, key); + gff->strip_chr_names = va_arg(args,int); + va_end(args); + return 0; + + case verbosity: + va_start(args, key); + gff->verbosity = va_arg(args,int); + va_end(args); + return 0; + + default: + error("The key %d is not supported with gff_set\n",key); + } + return 0; +} + +void *gff_get(gff_t *gff, gff_opt_t key) +{ + switch (key) + { + case idx_cds: return gff->idx_cds; + case idx_utr: return gff->idx_utr; + case idx_exon: return gff->idx_exon; + case idx_tscript: return gff->idx_tscript; + default: + error("The key %d is not supported with gff_get\n",key); + } + return NULL; +} + +const char *gff_id2string(gff_t *gff, id_type_t type, int id) // currently only transcript ids +{ + return gff->tscript_ids.str[id]; +} + +const char *gf_type2gff_string(int type) +{ + if ( !GF_is_coding(type) ) + { + if ( type < (1<init; + char tmp = chr_end[1]; + chr_end[1] = 0; + int iseq; + if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 ) + { + char *new_chr = strdup(chr_beg); + hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq); + aux->seq[aux->nseq] = new_chr; + iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]); + aux->nseq++; + assert( aux->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq + } + chr_end[1] = tmp; + return iseq; +} +static inline char *gff_skip(const char *line, char *ss) +{ + while ( *ss && *ss!='\t' ) ss++; + if ( !*ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + return ss+1; +} +static 
inline void gff_parse_chr(gff_t *gff, const char *line, char **chr_beg, char **chr_end) +{ + char *se = (char*) line; + while ( *se && *se!='\t' ) se++; + if ( !*se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + if ( gff->strip_chr_names && !strncasecmp("chr",line,3) ) line += 3; + *chr_beg = (char*) line; + *chr_end = se-1; +} +static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg, uint32_t *end) +{ + char *se = ss; + *beg = strtol(ss, &se, 10) - 1; + if ( ss==se ) error("[%s:%d %s] Could not parse the line:\n\t%s\n\t%s\n",__FILE__,__LINE__,__FUNCTION__,line,ss); + ss = se+1; + *end = strtol(ss, &se, 10) - 1; + if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + return se+1; +} +static void gff_id_init(id_tbl_t *tbl) +{ + memset(tbl, 0, sizeof(*tbl)); + tbl->str2id = khash_str2int_init(); +} +static void gff_id_destroy(id_tbl_t *tbl) +{ + khash_str2int_destroy_free(tbl->str2id); + free(tbl->str); +} +static inline int gff_id_register(id_tbl_t *tbl, char *beg, char *end, uint32_t *id_ptr) +{ + char tmp = end[1]; + end[1] = 0; + int id; + if ( khash_str2int_get(tbl->str2id, beg, &id) < 0 ) + { + id = tbl->nstr++; + hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str); + tbl->str[id] = strdup(beg); + khash_str2int_set(tbl->str2id, tbl->str[id], id); + } + end[1] = tmp; + *id_ptr = id; + return 0; +} +static inline int gff_parse_biotype(char *line) +{ + if ( !line ) return -1; + switch (*line) + { + case 'p': + if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING; + else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE; + else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT; + else if ( !strncmp(line,"processed_pseudogene",20) ) return GF_PROCESSED_PSEUDOGENE; + else if ( !strncmp(line,"polymorphic_pseudogene",22) ) return GF_POLYMORPHIC_PSEUDOGENE; + break; + case 'a': + if ( 
!strncmp(line,"artifact",8) ) return GF_ARTIFACT; + else if ( !strncmp(line,"antisense",9) ) return GF_ANTISENSE; + else if ( !strncmp(line,"ambiguous_orf",13) ) return GF_AMBIGUOUS_ORF; + break; + case 'I': + if ( !strncmp(line,"IG_pseudogene",13) ) return GF_IG_PSEUDOGENE; + else if ( !strncmp(line,"IG_C_pseudogene",15) ) return GF_IG_C_PSEUDOGENE; + else if ( !strncmp(line,"IG_J_pseudogene",15) ) return GF_IG_J_PSEUDOGENE; + else if ( !strncmp(line,"IG_V_pseudogene",15) ) return GF_IG_V_PSEUDOGENE; + else if ( !strncmp(line,"IG_C",4) ) return GF_IG_C; + else if ( !strncmp(line,"IG_D",4) ) return GF_IG_D; + else if ( !strncmp(line,"IG_J",4) ) return GF_IG_J; + else if ( !strncmp(line,"IG_V",4) ) return GF_IG_V; + else if ( !strncmp(line,"IG_LV",5) ) return GF_IG_LV; + break; + case 'T': + if ( !strncmp(line,"TR_V_pseudogene",15) ) return GF_TR_V_PSEUDOGENE; + else if ( !strncmp(line,"TR_J_pseudogene",15) ) return GF_TR_J_PSEUDOGENE; + else if ( !strncmp(line,"TR_C",4) ) return GF_TR_C; + else if ( !strncmp(line,"TR_D",4) ) return GF_TR_D; + else if ( !strncmp(line,"TR_J",4) ) return GF_TR_J; + else if ( !strncmp(line,"TR_V",4) ) return GF_TR_V; + break; + case 'M': + if ( !strncmp(line,"Mt_tRNA_pseudogene",18) ) return GF_MT_tRNA_PSEUDOGENE; + else if ( !strncasecmp(line,"Mt_tRNA",7) ) return GF_MT_tRNA; + else if ( !strncasecmp(line,"Mt_rRNA",7) ) return GF_MT_tRNA; + else if ( !strncasecmp(line,"MRNA",4) ) return GF_PROTEIN_CODING; + break; + case 'l': + if ( !strncmp(line,"lincRNA",7) ) return GF_lincRNA; + if ( !strncmp(line,"lncRNA",7) ) return GF_lncRNA; + break; + case 'm': + if ( !strncmp(line,"macro_lncRNA",12) ) return GF_macro_lncRNA; + else if ( !strncmp(line,"misc_RNA_pseudogene",19) ) return GF_misc_RNA_PSEUDOGENE; + else if ( !strncmp(line,"miRNA_pseudogene",16) ) return GF_miRNA_PSEUDOGENE; + else if ( !strncmp(line,"miRNA",5) ) return GF_miRNA; + else if ( !strncmp(line,"misc_RNA",8) ) return GF_MISC_RNA; + else if ( !strncasecmp(line,"mRNA",4) ) 
return GF_PROTEIN_CODING; + break; + case 'r': + if ( !strncmp(line,"rRNA",4) ) return GF_rRNA; + else if ( !strncmp(line,"ribozyme",8) ) return GF_RIBOZYME; + else if ( !strncmp(line,"retained_intron",15) ) return GF_RETAINED_INTRON; + else if ( !strncmp(line,"retrotransposed",15) ) return GF_RETROTRANSPOSED; + break; + case 's': + if ( !strncmp(line,"snRNA",5) ) return GF_snRNA; + else if ( !strncmp(line,"sRNA",4) ) return GF_sRNA; + else if ( !strncmp(line,"scRNA",5) ) return GF_scRNA; + else if ( !strncmp(line,"scaRNA",6) ) return GF_scaRNA; + else if ( !strncmp(line,"snoRNA",6) ) return GF_snoRNA; + else if ( !strncmp(line,"sense_intronic",14) ) return GF_SENSE_INTRONIC; + else if ( !strncmp(line,"sense_overlapping",17) ) return GF_SENSE_OVERLAPPING; + break; + case 't': + if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE; + else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE; + else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE; + else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE; + else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE; + else if ( !strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE; + break; + case 'n': + if ( !strncmp(line,"nonsense_mediated_decay",23) ) return GF_NMD; + else if ( !strncmp(line,"non_stop_decay",14) ) return GF_NON_STOP_DECAY; + break; + case 'N': + if ( !strncmp(line,"NMD",3) ) return GF_NMD; + break; + case 'k': + if ( !strncmp(line,"known_ncrna",11) ) return GF_KNOWN_NCRNA; + break; + case 'u': + if ( !strncmp(line,"unitary_pseudogene",18) ) return GF_UNITARY_PSEUDOGENE; + else if ( !strncmp(line,"unprocessed_pseudogene",22) ) return GF_UNPROCESSED_PSEUDOGENE; + break; + case 'L': + if ( !strncmp(line,"LRG_gene",8) ) return GF_LRG_GENE; 
+ break; + case '3': + if ( !strncasecmp(line,"3prime_overlapping_ncRNA",24) ) return GF_3PRIME_OVERLAPPING_ncRNA; + else if ( !strncasecmp(line,"3_prime_overlapping_ncRNA",25) ) return GF_3PRIME_OVERLAPPING_ncRNA; + break; + case 'd': + if ( !strncmp(line,"disrupted_domain",16) ) return GF_DISRUPTED_DOMAIN; + break; + case 'v': + if ( !strncmp(line,"vaultRNA",8) ) return GF_vaultRNA; + break; + case 'b': + if ( !strncmp(line,"bidirectional_promoter_lncRNA",29) ) return GF_BIDIRECTIONAL_PROMOTER_lncRNA; + break; + } + return 0; +} +static inline int gff_ignored_biotype(gff_t *gff, char *ss, char *se) +{ + if ( !ss ) return 0; + + char tmp = se[1]; + se[1] = 0; + + char *key = ss; + int n = 0; + if ( khash_str2int_get(gff->init.ignored_biotypes, ss, &n)!=0 ) key = strdup(ss); + khash_str2int_set(gff->init.ignored_biotypes, key, n+1); + + se[1] = tmp; + return 1; +} +static gf_gene_t *gene_init(aux_t *aux, uint32_t gene_id) +{ + khint_t k = kh_get(int2gene, aux->gid2gene, (int)gene_id); + gf_gene_t *gene = (k == kh_end(aux->gid2gene)) ? NULL : kh_val(aux->gid2gene, k); + if ( !gene ) + { + gene = (gf_gene_t*) calloc(1,sizeof(gf_gene_t)); + int ret; + k = kh_put(int2gene, aux->gid2gene, (int)gene_id, &ret); + kh_val(aux->gid2gene,k) = gene; + } + return gene; +} +static void gff_parse_transcript(gff_t *gff, const char *line, ftr_t *ftr) +{ + aux_t *aux = &gff->init; + + ftr->type = gff_parse_biotype(aux->biotype); + if ( ftr->type <= 0 ) + { + char tmp = aux->type_end[1]; + aux->type_end[1] = 0; + ftr->type = gff_parse_biotype(aux->type); + aux->type_end[1] = tmp; + } + if ( ftr->type <= 0 ) + { + if ( !gff_ignored_biotype(gff,aux->biotype,aux->biotype_end) ) + { + if ( gff->verbosity > 0 ) + { + if ( !gff->warned.unknown_tscript_biotype || gff->verbosity > 1 ) + fprintf(stderr,"Warning: Ignoring transcript with unknown biotype .. 
%s\n", line); + gff->warned.unknown_tscript_biotype++; + } + } + return; + } + + if ( !aux->id ) + error("[%s:%d %s] Could not parse the line, neither \"ID=transcript:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + if ( !aux->parent ) + error("[%s:%d %s] Could not parse the line, neither \"Parent=gene:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + + uint32_t trid,gene_id; + gff_id_register(&gff->tscript_ids, aux->id, aux->id_end, &trid); + gff_id_register(&aux->gene_ids, aux->parent, aux->parent_end, &gene_id); + + gf_tscript_t *tr = (gf_tscript_t*) calloc(1,sizeof(gf_tscript_t)); + tr->id = trid; + tr->strand = ftr->strand; + tr->gene = gene_init(aux, gene_id); + tr->type = ftr->type; + tr->beg = ftr->beg; + tr->end = ftr->end; + + khint_t k; + int ret; + k = kh_put(int2tscript, aux->id2tr, (int)trid, &ret); + kh_val(aux->id2tr,k) = tr; +} +// register exon, CDS, UTR +static void gff_parse_exon(gff_t *gff, const char *line, ftr_t *ftr) +{ + aux_t *aux = &gff->init; + if ( !aux->parent ) + error("[%s:%d %s] Could not parse the line, neither \"Parent=transcript:\" nor \"Parent=\" substring found: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + + // associate with transcript id + gff_id_register(&gff->tscript_ids, aux->parent, aux->parent_end, &ftr->trid); + + if ( ftr->strand==-1 && gff->verbosity > 0 ) + { + if ( !gff->warned.unknown_strand || gff->verbosity > 1 ) + fprintf(stderr,"Warning: Ignoring GFF feature with unknown strand .. %s\n",line); + gff->warned.unknown_strand++; + } + if ( ftr->phase==-1 && gff->verbosity > 0 ) + { + if ( !gff->warned.unknown_phase|| gff->verbosity > 1 ) + fprintf(stderr,"Warning: Ignoring GFF feature with unknown phase .. 
%s\n",line); + gff->warned.unknown_phase++; + } + ftr->iseq = feature_set_seq(gff, aux->chr,aux->chr_end); +} +static void gff_parse_gene(gff_t *gff, const char *line, ftr_t *ftr) +{ + aux_t *aux = &gff->init; + if ( !aux->id ) return; + + uint32_t gene_id; + gff_id_register(&aux->gene_ids, aux->id, aux->id_end, &gene_id); + + gf_gene_t *gene = gene_init(aux, gene_id); + if ( gene->name ) + { + if ( !gff->warned.duplicate_id || gff->verbosity > 1 ) + fprintf(stderr,"Warning: The GFF contains features with duplicate id .. %s\n",line); + gff->warned.duplicate_id++; + return; + } + + gene->iseq = feature_set_seq(gff, aux->chr,aux->chr_end); + gene->beg = ftr->beg; + gene->end = ftr->end; + gene->strand = ftr->strand; + gene->id = gene_id; + + if ( aux->name ) + { + gene->name = (char*) malloc(aux->name_end - aux->name + 2); + memcpy(gene->name,aux->name,aux->name_end - aux->name + 1); + gene->name[aux->name_end - aux->name + 1] = 0; + } + else + gene->name = strdup(aux->gene_ids.str[gene_id]); // Name= field is not present, use the gene ID instead +} + +// Returns 0 for exons,CDS,UTRs to indiciate these need to be pruned later and regidx built on them, +// or -1 to indiciate the structure needs not be saved (either because of an error or because saved +// as transcript or gene.) +static int gff_parse_line(gff_t *gff, char *line, ftr_t *ftr) +{ + // - skip empty lines and commented lines + // - columns + // 1. chr + // 2. + // 3. CDS, transcript, gene, ... + // 4-5. beg,end + // 6. + // 7. strand + // 8. phase + // 9. Parent=transcript:ENST(\d+);ID=...;biotype=... etc + + char *ss = line; + if ( !*ss ) return -1; // skip blank lines + if ( *ss=='#' ) return -1; // skip comments + + aux_t *aux = &gff->init; + gff_parse_chr(gff, line, &aux->chr, &aux->chr_end); + ss = gff_skip(line, aux->chr_end + 2); + + // 3rd column: is this a CDS, transcript, gene, etc.. 
The parsing order by frequency in Homo_sapiens.GRCh37.87.gff3 + int is_gene_line = 0; + ftr->type = 0; + aux->type = ss; + if ( !strncmp("exon\t",ss,5) ) { ftr->type = GF_EXON; ss += 5; } + else if ( !strncmp("CDS\t",ss,4) ) { ftr->type = GF_CDS; ss += 4; } + else if ( !strncmp("three_prime_UTR\t",ss,16) ) { ftr->type = GF_UTR3; ss += 16; } + else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; } + else if ( !strncmp("biological_region\t",ss,18) ) { return -1; } // skip + else if ( !strncmp("gene\t",ss,5) ) { is_gene_line = 1; ss += 5; } + else ss = gff_skip(line, ss); + aux->type_end = ss - 1; + + // 4-5th columns: beg,end + ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end); + + // 6th column: skip + ss = gff_skip(line, ss); + + // 7th column: strand + ftr->strand = -1; + if ( *ss == '+' ) ftr->strand = STRAND_FWD; + else if ( *ss == '-' ) ftr->strand = STRAND_REV; + ss += 2; + + // 8th column: phase (codon offset) + ftr->phase = -1; + if ( *ss == '0' ) ftr->phase = 0; + else if ( *ss == '1' ) ftr->phase = 1; + else if ( *ss == '2' ) ftr->phase = 2; + else if ( *ss == '.' 
) ftr->phase = CDS_PHASE_UNKN; // exons and even CDS in some GFFs do not have phase + ss += 2; + + // 9th column: id, parent, name, biotype + aux->name = NULL, aux->id = NULL, aux->parent = NULL, aux->biotype = NULL; + while ( *ss ) + { + char *es = ss; + while ( *es && *es!=';' ) es++; + if ( !strncmp(ss,"ID=",3) ) + { + ss += 3; + aux->id_end = es - 1; + aux->id = ss; + if ( !strncmp(ss,"gene:",5) ) { aux->id += 5; is_gene_line = 1; } + else if ( !strncmp(ss,"transcript:",11) ) aux->id += 11; + } + else if ( !strncmp(ss,"Name=",5) ) { aux->name = ss + 5; aux->name_end = es - 1; } + else if ( !strncmp(ss,"Parent=",7) ) + { + ss += 7; + aux->parent_end = es - 1; + aux->parent = ss; + if ( !strncmp(ss,"gene:",5) ) aux->parent += 5; + else if ( !strncmp(ss,"transcript:",11) ) aux->parent += 11; + } + else if ( !strncmp(ss,"biotype=",8) ) { aux->biotype = ss + 8; aux->biotype_end = es - 1; } + else if ( !strncmp(ss,"gene_biotype=",13) ) { aux->biotype = ss + 13; aux->biotype_end = es - 1; } + if ( !*es ) break; + ss = es + 1; + } + + if ( is_gene_line || !aux->parent ) + { + gff_parse_gene(gff, line, ftr); + return -1; + } + + if ( ftr->type ) + { + gff_parse_exon(gff, line, ftr); + return 0; + } + + gff_parse_transcript(gff, line, ftr); + return -1; +} + +static int cmp_cds_ptr(const void *a, const void *b) +{ + // comparison function for qsort of transcripts's CDS + if ( (*((gf_cds_t**)a))->beg < (*((gf_cds_t**)b))->beg ) return -1; + if ( (*((gf_cds_t**)a))->beg > (*((gf_cds_t**)b))->beg ) return 1; + return 0; +} + +static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char **chr_end) +{ + *chr_beg = *chr_end = aux->seq[iseq]; + while ( (*chr_end)[1] ) (*chr_end)++; +} +static gf_tscript_t *tscript_init(aux_t *aux, uint32_t trid) +{ + khint_t k = kh_get(int2tscript, aux->id2tr, (int)trid); + gf_tscript_t *tr = (k == kh_end(aux->id2tr)) ? 
NULL : kh_val(aux->id2tr, k); + assert( tr ); + return tr; +} +static void register_cds(gff_t *gff, ftr_t *ftr) +{ + // Make the CDS searchable via idx_cds. Note we do not malloc tr->cds just yet. + // ftr is the result of parsing a gff CDS line + aux_t *aux = &gff->init; + + gf_tscript_t *tr = tscript_init(aux, ftr->trid); + if ( tr->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. %d vs %d\n",ftr->trid,tr->strand,ftr->strand); + + gf_cds_t *cds = (gf_cds_t*) malloc(sizeof(gf_cds_t)); + cds->tr = tr; + cds->beg = ftr->beg; + cds->len = ftr->end - ftr->beg + 1; + cds->icds = 0; // to keep valgrind on mac happy + cds->phase = ftr->phase; + + hts_expand(gf_cds_t*,tr->ncds+1,tr->mcds,tr->cds); + tr->cds[tr->ncds++] = cds; +} +static void register_utr(gff_t *gff, ftr_t *ftr) +{ + aux_t *aux = &gff->init; + gf_utr_t *utr = (gf_utr_t*) malloc(sizeof(gf_utr_t)); + utr->which = ftr->type==GF_UTR3 ? prime3 : prime5; + utr->beg = ftr->beg; + utr->end = ftr->end; + utr->tr = tscript_init(aux, ftr->trid); + + char *chr_beg, *chr_end; + chr_beg_end(&gff->init, utr->tr->gene->iseq, &chr_beg, &chr_end); + regidx_push(gff->idx_utr, chr_beg,chr_end, utr->beg,utr->end, &utr); +} +static void register_exon(gff_t *gff, ftr_t *ftr) +{ + aux_t *aux = &gff->init; + gf_exon_t *exon = (gf_exon_t*) malloc(sizeof(gf_exon_t)); + exon->beg = ftr->beg; + exon->end = ftr->end; + exon->tr = tscript_init(aux, ftr->trid); + + char *chr_beg, *chr_end; + chr_beg_end(&gff->init, exon->tr->gene->iseq, &chr_beg, &chr_end); + regidx_push(gff->idx_exon, chr_beg,chr_end, exon->beg - N_SPLICE_REGION_INTRON, exon->end + N_SPLICE_REGION_INTRON, &exon); +} + +static void tscript_init_cds(gff_t *gff) +{ + aux_t *aux = &gff->init; + + // Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds) + khint_t k; + for (k=0; kid2tr); k++) + { + if ( !kh_exist(aux->id2tr, k) ) continue; + gf_tscript_t *tr = (gf_tscript_t*) kh_val(aux->id2tr, k); + + // 
position-to-tscript lookup + char *chr_beg, *chr_end; + chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end); + regidx_push(gff->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr); + + if ( !tr->ncds ) continue; // transcript with no CDS + + // sort CDs + qsort(tr->cds, tr->ncds, sizeof(gf_cds_t*), cmp_cds_ptr); + + // trim non-coding start + int i, len = 0; + if ( tr->strand==STRAND_FWD ) + { + if ( tr->cds[0]->phase != CDS_PHASE_UNKN ) + { + if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME; + tr->cds[0]->beg += tr->cds[0]->phase; + tr->cds[0]->len -= tr->cds[0]->phase; + tr->cds[0]->phase = 0; + } + + // sanity check phase; the phase number in gff tells us how many bases to skip in this + // feature to reach the first base of the next codon + int tscript_ok = 1; + for (i=0; incds; i++) + { + if ( tr->cds[i]->phase == CDS_PHASE_UNKN ) + { + if ( gff->verbosity > 0 ) + { + if ( !gff->warned.unknown_cds_phase || gff->verbosity > 1 ) + fprintf(stderr,"Warning: CDS with unknown phase, could not verify reading frame in transcript %s\n",gff->tscript_ids.str[tr->id]); + gff->warned.unknown_cds_phase++; + } + len += tr->cds[i]->len; + continue; + } + int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0; + if ( phase!=len%3 ) + { + if ( !gff->force ) + error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n", + gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); + if ( gff->verbosity > 0 ) + { + if ( !gff->warned.wrong_phase || gff->verbosity > 1 ) + fprintf(stderr,"Warning: The GFF has inconsistent phase column in transcript %s, skipping. 
CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n", + gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); + gff->warned.wrong_phase++; + } + tscript_ok = 0; + break; + } + len += tr->cds[i]->len; + } + if ( !tscript_ok ) continue; // skip this transcript + } + else + { + if ( tr->cds[tr->ncds-1]->phase != CDS_PHASE_UNKN ) + { + // Check that the phase is not bigger than CDS length. Curiously, this can really happen, + // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141. + // This also fixes phase of 5' incomplete CDS, see test/csq/ENST00000520868/ENST00000520868.gff + // todo: the same for the fwd strand + i = tr->ncds - 1; + int phase = tr->cds[i]->phase; + if ( phase ) tr->trim |= TRIM_5PRIME; + while ( i>=0 && phase > tr->cds[i]->len ) + { + phase -= tr->cds[i]->len; + tr->cds[i]->phase = 0; + tr->cds[i]->len = 0; + i--; + } + if ( gff->verbosity > 0 && tr->cds[i]->phase ) + { + if ( !gff->warned.incomplete_cds || gff->verbosity > 1 ) + fprintf(stderr,"Note: truncated transcript %s with incomplete CDS (this is very common)\n",gff->tscript_ids.str[tr->id]); + gff->warned.incomplete_cds++; + } + tr->cds[i]->len -= tr->cds[i]->phase; + tr->cds[i]->phase = 0; + } + + // sanity check phase + int tscript_ok = 1; + for (i=tr->ncds-1; i>=0; i--) + { + if ( tr->cds[i]->phase == CDS_PHASE_UNKN ) + { + if ( gff->verbosity > 0 ) + { + if ( !gff->warned.unknown_cds_phase || gff->verbosity > 1 ) + fprintf(stderr,"Warning: CDS with unknown phase, could not verify reading frame in transcript %s\n",gff->tscript_ids.str[tr->id]); + gff->warned.unknown_cds_phase++; + } + len += tr->cds[i]->len; + continue; + } + int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0; + if ( phase!=len%3 ) + { + if ( !gff->force ) + error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). 
Use the --force option to proceed anyway (at your own risk).\n", + gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); + if ( gff->verbosity > 0 ) + { + if ( !gff->warned.wrong_phase || gff->verbosity > 1 ) + fprintf(stderr,"Warning: The GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n", + gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); + gff->warned.wrong_phase++; + } + tscript_ok = 0; + break; + } + len += tr->cds[i]->len; + } + if ( !tscript_ok ) continue; // skip this transcript + } + + // set len. At the same check that CDS within a transcript do not overlap + len = 0; + for (i=0; incds; i++) + { + tr->cds[i]->icds = i; + len += tr->cds[i]->len; + if ( !i ) continue; + + gf_cds_t *a = tr->cds[i-1]; + gf_cds_t *b = tr->cds[i]; + if ( a->beg + a->len - 1 >= b->beg ) + { + if ( gff->verbosity > 0 ) + { + if ( !gff->warned.overlapping_cds || gff->verbosity > 1 ) + fprintf(stderr,"Warning: GFF contains overlapping CDS %s, %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32" (ribosomal slippage?)\n", + gff->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len); + gff->warned.overlapping_cds++; + } + } + } + + if ( len%3 != 0 ) + { + // There are 13k transcripts with incomplete 3' CDS. See for example ENST00000524289 + // http://sep2015.archive.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000155868;r=5:157138846-157159019;t=ENST00000524289 + // Also, the incomplete CDS can be too short (1 or 2bp), so it is not enough to trim the last one. + + if ( gff->verbosity > 0 ) + { + if ( !gff->warned.incomplete_cds || gff->verbosity > 1 ) + fprintf(stderr,"Note: truncated transcript %s with incomplete CDS (this is very common)\n",gff->tscript_ids.str[tr->id]); + gff->warned.incomplete_cds++; + } + + tr->trim |= TRIM_3PRIME; + if ( tr->strand==STRAND_FWD ) + { + i = tr->ncds - 1; + while ( i>=0 && len%3 ) + { + int dlen = tr->cds[i]->len >= len%3 ? 
len%3 : tr->cds[i]->len; + tr->cds[i]->len -= dlen; + len -= dlen; + i--; + } + } + else + { + i = 0; + while ( incds && len%3 ) + { + int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len; + tr->cds[i]->len -= dlen; + tr->cds[i]->beg += dlen; + len -= dlen; + i++; + } + } + } + + // set CDS offsets and insert into regidx + len=0; + for (i=0; incds; i++) + { + tr->cds[i]->pos = len; + len += tr->cds[i]->len; + regidx_push(gff->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]); + } + } +} + +static void regidx_free_gf(void *payload) { free(*((gf_cds_t**)payload)); } +static void regidx_free_tscript(void *payload) { gf_tscript_t *tr = *((gf_tscript_t**)payload); free(tr->cds); free(tr); } + +static int gff_dump(gff_t *gff, const char *fname) +{ + BGZF *out = bgzf_open(fname,"wg"); + if ( !out ) error("Failed to open %s: %s\n", fname, strerror(errno)); + + kstring_t str = {0,0,0}; + + khint_t k; + for (k=0; kinit.gid2gene); k++) + { + if ( !kh_exist(gff->init.gid2gene, k) ) continue; + gf_gene_t *gene = (gf_gene_t*) kh_val(gff->init.gid2gene, k); + char *gene_id = gff->init.gene_ids.str[gene->id]; + str.l = 0; + ksprintf(&str,"%s\t.\tgene\t%d\t%d\t.\t%c\t.\tID=%s;Name=%s;used=%d\n",gff->init.seq[gene->iseq],gene->beg+1,gene->end+1,gene->strand==STRAND_FWD?'+':'-',gene_id,gene->name,gene->used); + if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno)); + } + + regitr_t *itr = regitr_init(gff->idx_tscript); + while ( regitr_loop(itr) ) + { + gf_tscript_t *tr = regitr_payload(itr, gf_tscript_t*); + char *gene_id = gff->init.gene_ids.str[tr->gene->id]; + const char *type = tr->type==GF_PROTEIN_CODING ? 
"mRNA" : gf_type2gff_string(tr->type); + str.l = 0; + ksprintf(&str,"%s\t.\t%s\t%d\t%d\t.\t%c\t.\tID=%s;Parent=%s;biotype=%s;used=%d\n",itr->seq,type,itr->beg+1,itr->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id],gene_id,gf_type2gff_string(tr->type),tr->used); + if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno)); + } + regitr_destroy(itr); + + itr = regitr_init(gff->idx_cds); + while ( regitr_loop(itr) ) + { + gf_cds_t *cds = regitr_payload(itr,gf_cds_t*); + gf_tscript_t *tr = cds->tr; + str.l = 0; + ksprintf(&str,"%s\t.\tCDS\t%d\t%d\t.\t%c\t%c\tParent=%s\n",itr->seq,cds->beg+1,cds->beg+cds->len,tr->strand==STRAND_FWD?'+':'-',cds->phase==3?'.':cds->phase+(int)'0',gff->tscript_ids.str[tr->id]); + if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno)); + } + regitr_destroy(itr); + + itr = regitr_init(gff->idx_utr); + while ( regitr_loop(itr) ) + { + gf_utr_t *utr = regitr_payload(itr,gf_utr_t*); + gf_tscript_t *tr = utr->tr; + str.l = 0; + ksprintf(&str,"%s\t.\t%s_prime_UTR\t%d\t%d\t.\t%c\t.\tParent=%s\n",itr->seq,utr->which==prime3?"three":"five",utr->beg+1,utr->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id]); + if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno)); + } + regitr_destroy(itr); + + itr = regitr_init(gff->idx_exon); + while ( regitr_loop(itr) ) + { + gf_exon_t *exon = regitr_payload(itr,gf_exon_t*); + gf_tscript_t *tr = exon->tr; + str.l = 0; + ksprintf(&str,"%s\t.\texon\t%d\t%d\t.\t%c\t.\tParent=%s\n",itr->seq,exon->beg+1,exon->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id]); + if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno)); + } + regitr_destroy(itr); + + if ( bgzf_close(out)!=0 ) error("Error: close failed .. 
%s\n", fname); + free(str.s); + + return 0; +} + +int gff_parse(gff_t *gff) +{ + if ( gff->verbosity > 0 ) fprintf(stderr,"Parsing %s ...\n", gff->fname); + + aux_t *aux = &gff->init; + aux->seq2int = khash_str2int_init(); // chrom's numeric id + aux->gid2gene = kh_init(int2gene); // gene id to gf_gene_t, for idx_gene + aux->id2tr = kh_init(int2tscript); // transcript id to tscript_t + gff->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(gf_tscript_t*), NULL); + aux->ignored_biotypes = khash_str2int_init(); + gff_id_init(&aux->gene_ids); + gff_id_init(&gff->tscript_ids); + + // parse gff + kstring_t str = {0,0,0}; + htsFile *fp = hts_open(gff->fname,"r"); + if ( !fp ) error("Failed to read %s\n", gff->fname); + while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 ) + { + hts_expand(ftr_t, aux->nftr+1, aux->mftr, aux->ftr); + int ret = gff_parse_line(gff, str.s, aux->ftr + aux->nftr); + if ( !ret ) aux->nftr++; + } + free(str.s); + if ( hts_close(fp)!=0 ) error("Close failed: %s\n", gff->fname); + + + // process gff information: connect CDS and exons to transcripts + gff->idx_cds = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_cds_t*), NULL); + gff->idx_utr = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_utr_t*), NULL); + gff->idx_exon = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_exon_t*), NULL); + + int i; + for (i=0; inftr; i++) + { + ftr_t *ftr = &aux->ftr[i]; + + // check whether to keep this feature: is there a mapping trid -> gene_id -> gene? + khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid); + if ( k==kh_end(aux->id2tr) ) continue; // no corresponding transcript registered, must be an unsupported biotype + + gf_tscript_t *tr = kh_val(aux->id2tr,k); + tr->used = 1; + tr->gene->used = 1; + + // populate regidx by category: + // ftr->type .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5 + // gene->type .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ... 
+ if ( ftr->type==GF_CDS ) register_cds(gff, ftr); + else if ( ftr->type==GF_EXON ) register_exon(gff, ftr); + else if ( ftr->type==GF_UTR5 ) register_utr(gff, ftr); + else if ( ftr->type==GF_UTR3 ) register_utr(gff, ftr); + else + error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,gff->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type)); + } + tscript_init_cds(gff); + + if ( gff->verbosity > 0 ) + { + fprintf(stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n", + regidx_nregs(gff->idx_tscript), + regidx_nregs(gff->idx_exon), + regidx_nregs(gff->idx_cds), + regidx_nregs(gff->idx_utr)); + } + + if ( gff->verbosity > 0 && khash_str2int_size(aux->ignored_biotypes) ) + { + khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes; + fprintf(stderr,"Ignored the following biotypes:\n"); + for (i = kh_begin(ign); i < kh_end(ign); i++) + { + if ( !kh_exist(ign,i)) continue; + const char *biotype = kh_key(ign,i); + if ( !strcmp(biotype,"TCE") ) biotype = "TCE (\"To be Experimentally Confirmed\")"; + fprintf(stderr,"\t%dx\t.. %s\n", kh_value(ign,i), biotype); + } + } + khash_str2int_destroy_free(aux->ignored_biotypes); + + // warned about unprinted warnings + if ( gff->verbosity > 0 ) + { + int nwarn = 0; + #define INC_NWARN(X) if (gff->warned.X) nwarn += gff->verbosity > 1 ? 0 : gff->warned.X - 1; + INC_NWARN(unknown_chr); + INC_NWARN(unknown_tscript_biotype); + INC_NWARN(unknown_strand); + INC_NWARN(unknown_phase); + INC_NWARN(duplicate_id); + INC_NWARN(unknown_cds_phase); + INC_NWARN(incomplete_cds); + INC_NWARN(wrong_phase); + INC_NWARN(overlapping_cds); + if ( nwarn > 0 ) + fprintf(stderr,"Warning: %d warnings were supressed, run with `--verbose 2` to see them all\n",nwarn); + } + + if ( gff->dump_fname ) gff_dump(gff, gff->dump_fname); + + if ( !regidx_nregs(gff->idx_tscript) ) + error("Error: No usable transcripts found, likely a failure to parse a non-standard GFF file. 
Please check if the misc/gff2gff\n" + " or misc/gff2gff.py script can fix the problem (both do different things). See also the man page for the description\n" + " of the expected format http://samtools.github.io/bcftools/bcftools-man.html#csq\n"); + + free(aux->seq); + free(aux->ftr); + khash_str2int_destroy_free(aux->seq2int); + // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene); + kh_destroy(int2tscript,aux->id2tr); + gff_id_destroy(&aux->gene_ids); + + return 0; +} + +gff_t *gff_init(const char *fname) +{ + gff_t *gff = calloc(sizeof(gff_t),1); + gff->fname = fname; + return gff; +} +void gff_destroy(gff_t *gff) +{ + khint_t k; + if ( gff->init.gid2gene ) + { + for (k=0; kinit.gid2gene); k++) + { + if ( !kh_exist(gff->init.gid2gene, k) ) continue; + gf_gene_t *gene = (gf_gene_t*) kh_val(gff->init.gid2gene, k); + free(gene->name); + free(gene); + } + kh_destroy(int2gene,gff->init.gid2gene); + } + + regidx_destroy(gff->idx_cds); + regidx_destroy(gff->idx_utr); + regidx_destroy(gff->idx_exon); + regidx_destroy(gff->idx_tscript); + + gff_id_destroy(&gff->tscript_ids); + free(gff); +} + diff --git a/gff.h b/gff.h new file mode 100644 index 000000000..ebb64634a --- /dev/null +++ b/gff.h @@ -0,0 +1,332 @@ +/* The MIT License + + Copyright (c) 2023 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + + */ +/* + GFF parsing code refactored from csq.c + + Things that would be nice to have + - dynamic N_REF_PAD + - for stop-lost events (also in frameshifts) report the number of truncated aa's + - memory could be greatly reduced by indexing gff (but it is quite compact already) + - deletions that go beyond transcript boundaries are not checked at sequence level + - alloc tscript->ref in hap_finalize, introduce fa_off_beg:16,fa_off_end:16 + - see test/csq/ENST00000573314/insertion-overlap.vcf #1476288882 + + Read about transcript types here + http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html + http://www.ensembl.org/info/genome/variation/predicted_data.html + https://www.gencodegenes.org/pages/biotypes.html + + List of supported biotypes + antisense + IG_C_gene + IG_D_gene + IG_J_gene + IG_LV_gene + IG_V_gene + lincRNA + lncRNA .. generic term for 3prime_overlapping_ncRNA, antisense, bidirectional_promoter_lncRNA, lincRNA, macro_lncRNA, non_coding, processed_transcript, sense_intronic, sense_overlapping + macro_lncRNA + miRNA + misc_RNA + Mt_rRNA + Mt_tRNA + polymorphic_pseudogene + processed_transcript + protein_coding, mRNA + ribozyme + rRNA + sRNA + scRNA + scaRNA + sense_intronic + sense_overlapping + snRNA + snoRNA + TR_C_gene + TR_D_gene + TR_J_gene + TR_V_gene + + The gff parsing logic + We collect features by combining gff lines A,B,C as follows: + A .. gene line with a supported biotype + A.ID=~/^gene:/ + + B .. 
transcript line referencing A with supported biotype + B.ID=~/^transcript:/ && B.Parent=~/^gene:A.ID/ + + C .. corresponding CDS, exon, and UTR lines: + C[3] in {"CDS","exon","three_prime_UTR","five_prime_UTR"} && C.Parent=~/^transcript:B.ID/ + + For coding biotypes ("protein_coding" or "polymorphic_pseudogene") the + complete chain link C -> B -> A is required. For the rest, link B -> A suffices. + + + The supported consequence types, sorted by impact: + splice_acceptor_variant .. end region of an intron changed (2bp at the 3' end of an intron) + splice_donor_variant .. start region of an intron changed (2bp at the 5' end of an intron) + stop_gained .. DNA sequence variant resulting in a stop codon + frameshift_variant .. number of inserted/deleted bases not a multiple of three, disrupted translational frame + stop_lost .. elongated transcript, stop codon changed + start_lost .. the first codon changed + inframe_altering .. combination of indels leading to unchanged reading frame and length + inframe_insertion .. inserted coding sequence, unchanged reading frame + inframe_deletion .. deleted coding sequence, unchanged reading frame + missense_variant .. amino acid (aa) change, unchanged length + splice_region_variant .. change within 1-3 bases of the exon or 3-8 bases of the intron + synonymous_variant .. DNA sequence variant resulting in no amino acid change + stop_retained_variant .. different stop codon + start_retained_variant .. start codon retained by indel realignment + non_coding_variant .. variant in non-coding sequence, such as RNA gene + 5_prime_UTR_variant + 3_prime_UTR_variant + intron_variant .. reported only if none of the above + intergenic_variant .. reported only if none of the above + + + The annotation algorithm. + The algorithm checks if the variant falls in a region of a supported type. The + search is performed in the following order, until a match is found: + 1. 
idx_cds(gf_cds_t) - lookup CDS by position, create haplotypes, call consequences + 2. idx_utr(gf_utr_t) - check UTR hits + 3. idx_exon(gf_exon_t) - check for splice variants + 4. idx_tscript(tscript_t) - check for intronic variants, RNAs, etc. + + These regidx indexes are created by parsing a gff3 file as follows: + 1. create the array "ftr" of all UTR, CDS, exons. This will be + processed later and pruned based on transcript types we want to keep. + In the same go, create the hash "id2tr" of transcripts to keep + (based on biotype) which maps from transcript_id to a transcript. At + the same time also build the hash "gid2gene" which maps from gene_id to + gf_gene_t pointer. + + 2. build "idx_cds", "idx_tscript", "idx_utr" and "idx_exon" indexes. + Use only features from "ftr" which are present in "id2tr". + + 3. clean data that won't be needed anymore: ftr, id2tr, gid2gene. + + Data structures. + idx_cds, idx_utr, idx_exon, idx_tscript: + as described above, regidx structures for fast lookup of exons/transcripts + overlapping a region, the payload is a pointer to tscript.cds +*/ + +#ifndef GFF_H__ +#define GFF_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bcftools.h" +#include "regidx.h" + +#ifndef __FUNCTION__ +# define __FUNCTION__ __func__ +#endif + +// Definition of splice_region, splice_acceptor and splice_donor +#define N_SPLICE_DONOR 2 +#define N_SPLICE_REGION_EXON 3 +#define N_SPLICE_REGION_INTRON 8 + +#define STRAND_REV 0 +#define STRAND_FWD 1 + +#define TRIM_NONE 0 +#define TRIM_5PRIME 1 +#define TRIM_3PRIME 2 + + +// GFF line types +#define GFF_UNKN_LINE 0 +#define GFF_TSCRIPT_LINE 1 +#define GFF_GENE_LINE 2 + + +/* + Genomic features, for fast lookup by position to overlapping features +*/ +#define GF_coding_bit 6 +#define GF_is_coding(x) ((x) & (1< +// @author Nicola Asuni +// @link https://github.com/tecnickcom/variantkey +// 
@license MIT [LICENSE](https://raw.githubusercontent.com/tecnickcom/variantkey/main/LICENSE) // @copyright 2017-2018 GENOMICS plc -// @license MIT (see LICENSE) -// @link https://github.com/genomicsplc/variantkey // // LICENSE // diff --git a/misc/gff2gff b/misc/gff2gff index 27485fad8..bfcda412d 100755 --- a/misc/gff2gff +++ b/misc/gff2gff @@ -39,11 +39,12 @@ sub error my (@msg) = @_; if ( scalar @msg ) { confess @msg; } print - "About: Attempt to fix a GFF file to be correctly parse by bcftools/csq, see\n", + "About: Attempt to fix a GFF file to be correctly parsed by bcftools/csq, see\n", " the man page for the description of the expected format\n", " http://samtools.github.io/bcftools/bcftools-man.html#csq\n", "Usage: gff2gff [OPTIONS]\n", "Options:\n", + " -v, --verbose Increase verbosity\n", " -h, -?, --help This help message\n", "Example:\n", " zcat in.gff.gz | gff2gff | gzip -c > out.gff.gz\n", @@ -52,10 +53,11 @@ sub error } sub parse_params { - my $opts = {}; + my $opts = { verbose=>0, warned=>{}, fixed=>{} }; if ( -t STDIN && !@ARGV ) { error(); } while (defined(my $arg=shift(@ARGV))) { + if ( $arg eq '-v' or $arg eq '--verbose' ) { $$opts{verbose}++; next; } if ( $arg eq '-?' or $arg eq '-h' or $arg eq '--help' ) { error(); } error("Unknown parameter \"$arg\". 
Run -h for help.\n"); } @@ -72,8 +74,29 @@ sub gff2gff chomp($row[-1]); if ( $row[2] eq 'gene' ) { fix_gene($opts,$line,\@row); } elsif ( $row[2] eq 'transcript' ) { fix_transcript($opts,$line,\@row); } + elsif ( $row[8]=~/biotype=/ && !($row[8]=~/Parent=/) ) { fix_gene($opts,$line,\@row); } + elsif ( $row[8]=~/Parent=/ ) { fix_transcript($opts,$line,\@row); } print join("\t",@row)."\n"; } + if ( !$$opts{verbose} ) + { + my $nwarn = 0; + for my $key (keys %{$$opts{warned}}) + { + if ( $$opts{warned}{$key} > 1 ) { $nwarn += $$opts{warned}{$key} - 1; } + } + if ( $nwarn ) { print STDERR "Suppressed $nwarn warnings, run with -v to see them all\n"; } + } + my $nfixed = 0; + for my $key (keys %{$$opts{fixed}}) + { + $nfixed += $$opts{fixed}{$key}; + } + print STDERR "Fixed $nfixed records\n"; + for my $key (sort {$$opts{fixed}{$b}<=>$$opts{fixed}{$a}} keys %{$$opts{fixed}}) + { + print STDERR "\t$$opts{fixed}{$key}x .. $key\n"; + } } sub fix_gene { @@ -88,35 +111,28 @@ sub fix_gene if ( $$row[8] =~ /biotype=([^;]+)/ ) { $biotype = $1; $biotype_ok = 1; } if ( !$biotype_ok && $$row[8] =~ /gene_type=([^;]+)/i ) { $biotype = $1; } if ( $$row[8] =~ /Name=([^;]+)/ ) { $name = $1; $name_ok = 1; } - if ( !$biotype_ok && $$row[8] =~ /gene_name=([^;]+)/i ) { $name = $1; } + if ( !$name_ok && $$row[8] =~ /gene_name=([^;]+)/i ) { $name = $1; } if ( !$id_ok ) { - if ( defined $id ) { $$row[8] .= ";ID=$id"; } - elsif ( !$$opts{gene_id_warned} ) + if ( defined $id ) { $$row[8] .= ";ID=$id"; $$opts{fixed}{gene_id}++; } + else { - print STDERR "Unable to determine gene ID, see e.g. 
$line\n"; - $$opts{gene_id_warned} = 1; + if ( $$opts{verbose}>0 or !$$opts{warned}{gene_id} ) { print STDERR "Unable to determine gene ID: $line"; } + $$opts{warned}{gene_id}++; } } - if ( !$biotype_ok ) + if ( !$biotype_ok && defined $biotype ) { - if ( defined $biotype ) { $$row[8] .= ";biotype=$biotype"; } - elsif ( !$$opts{gene_biotype_warned} ) - { - print STDERR "Unable to determine gene biotype/type, see e.g. $line\n"; - $$opts{gene_biotype_warned} = 1; - } + $$row[8] .= ";biotype=$biotype"; + $$opts{fixed}{gene_biotype}++; } - if ( !$name_ok ) + if ( !$name_ok && defined $name ) { - if ( defined $name ) { $$row[8] .= ";Name=$name"; } - elsif ( !$$opts{gene_name_warned} ) - { - print STDERR "Unable to determine gene name, see e.g. $line\n"; - $$opts{gene_name_warned} = 1; - } + $$row[8] .= ";Name=$name"; + $$opts{fixed}{gene_name}++; } + if ( defined $biotype ) { $$opts{gene_id2biotype}{$id} = $biotype; } } sub fix_transcript { @@ -134,29 +150,30 @@ sub fix_transcript if ( !$id_ok ) { - if ( defined $id ) { $$row[8] .= ";ID=$id"; } - elsif ( !$$opts{tscript_id_warned} ) + if ( defined $id ) { $$row[8] .= ";ID=$id"; $$opts{fixed}{transcript_id}++; } + else { - print STDERR "Unable to determine transcript ID, see e.g. $line\n"; - $$opts{tscript_id_warned} = 1; + if ( $$opts{verbose}>0 or !$$opts{warned}{tscript_id} ) { print STDERR "Unable to determine transcript ID: $line"; } + $$opts{warned}{tscript_id}++; } } if ( !$biotype_ok ) { - if ( defined $biotype ) { $$row[8] .= ";biotype=$biotype"; } - elsif ( !$$opts{tscript_biotype_warned} ) + if ( defined $biotype ) { $$row[8] .= ";biotype=$biotype"; $$opts{fixed}{transcript_biotype}++; } + elsif ( defined $parent && exists($$opts{gene_id2biotype}{$parent}) ) { $$row[8] .= ";biotype=$$opts{gene_id2biotype}{$parent}"; $$opts{fixed}{transcript_biotype}++; } + else { - print STDERR "Unable to determine transcript biotype/type, see e.g. 
$line\n"; - $$opts{tscript_biotype_warned} = 1; + if ( $$opts{verbose}>0 or !$$opts{warned}{tscript_biotype} ) { print STDERR "Unable to determine transcript biotype/type: $line"; } + $$opts{warned}{tscript_biotype}++; } } if ( !$parent_ok ) { - if ( defined $parent ) { $$row[8] .= ";Parent=$parent"; } # currently cannot happen - elsif ( !$$opts{tscript_parent_warned} ) + if ( defined $parent ) { $$row[8] .= ";Parent=$parent"; $$opts{fixed}{transcript_parent}++; } # currently cannot happen + else { - print STDERR "Unable to determine transcript Parent, see e.g. $line\n"; - $$opts{tscript_parent_warned} = 1; + if ( $$opts{verbose}>0 or !$$opts{warned}{tscript_parent} ) { print STDERR "Unable to determine transcript Parent: $line"; } + $$opts{warned}{tscript_parent}++; } } } diff --git a/misc/plot-vcfstats b/misc/plot-vcfstats index 58e8a3bd2..990a56ffd 100755 --- a/misc/plot-vcfstats +++ b/misc/plot-vcfstats @@ -64,9 +64,13 @@ if ( $$opts{make_plots} ) plot_indel_distribution($opts,$id); plot_indel_vaf_distribution($opts,$id); plot_substitutions($opts,$id); + plot_vaf($opts,$id,'snv'); + plot_vaf($opts,$id,'indel'); plot_per_sample_stats($opts,$id); plot_DP($opts,$id); plot_hwe($opts,$id); + plot_vaf25_by_sample($opts,$id,'snv'); + plot_vaf25_by_sample($opts,$id,'indel'); } plot($opts); } @@ -245,6 +249,11 @@ sub parse_params header=>'HWE', exp=>"# HWE\t[2]id\t[3]1st ALT allele frequency\t[4]Number of observations\t[5]25th percentile\t[6]median\t[7]75th percentile", }, + { + id=>'VAF', + header=>'VAF', + exp=>"# VAF\t[2]id\t[3]sample\t[4]SNV VAF distribution\t[5]indel VAF distribution", + }, ], SN_keys=>[ 'number of samples:', @@ -750,7 +759,7 @@ sub init_plots plot_hethom_by_sample = 1 plot_snps_by_sample = 1 plot_indels_by_sample = 1 - plot_singletons_by_sample = 1 + plot_singletons_by_sample = 1 plot_depth_by_sample = 1 plot_SNP_count_by_af = 1 plot_Indel_count_by_af = 1 @@ -767,6 +776,10 @@ sub init_plots plot_tstv_by_qual = 1 plot_tstv_by_usr = 1 
plot_substitutions = 1 + plot_vaf_snv = 1 + plot_vaf_indel = 1 + plot_vaf25_snv = 1 + plot_vaf25_indel = 1 # Set to 1 to use sample names for xticks instead of numeric sequential IDs @@ -1166,7 +1179,7 @@ sub plot_tstv_by_AF my $img = "tstv_by_af.$id"; my $vals = rebin_values(\@vals,8,0); - + open(my $tfh,'>',"$img.dat") or error("$img.dat: $!"); print $tfh "# [1]Allele frequency\t[2]Number of sites\t[3]ts/tv\n"; for (my $i=0; $i<@$vals; $i++) @@ -1554,7 +1567,7 @@ sub plot_counts_by_AF_col id = int(row[0]) if id not in dat: dat[id] = [] dat[id].append([float(row[1]),float(row[2])]) - + if plot_${title}_count_by_af: fig = plt.figure(figsize=(2*$$opts{img_width},$$opts{img_height}*0.7)) ax1 = fig.add_subplot(111) @@ -1792,15 +1805,33 @@ sub plot_substitutions if plot_substitutions: fig = plt.figure(figsize=($$opts{img_width},$$opts{img_height})) - cm = mpl.cm.get_cmap('autumn') + ts = [ 'A>G','G>A','C>T','T>C' ] + nts = 0 + ntv = 0 + for x in dat: + if x[1] in ts: nts += 1 + else: ntv += 1 n = 12 - col = [] - for i in list(range(n)): col.append(cm(1.*i/n)) + col = list(range(n)) + ecol = list(range(n)) + for i in range(n): + col[i] = '#ffce84' + ecol[i] = '#f5c781' + col[1] = col[5] = col[6] = col[10] = '#ff9900' + ecol[1] = ecol[5] = ecol[6] = ecol[10] = '#ef8f00' ax1 = fig.add_subplot(111) - ax1.bar([row[0] for row in dat], [row[2] for row in dat], color=col) + ax1.bar([row[0] for row in dat], [row[2] for row in dat], color=col, edgecolor=ecol) ax1.set_ylabel('Count') ax1.ticklabel_format(style='sci', scilimits=(0,0), axis='y') - ax1.set_xlim(-0.5,n+0.5) + ax1.spines['top'].set_visible(False) + ax1.spines['right'].set_visible(False) + ax1.get_xaxis().tick_bottom() + ax1.get_yaxis().tick_left() + ax1.spines['bottom'].set_color('grey') + ax1.spines['left'].set_color('grey') + mpl.rcParams['text.color'] = '555555' + ax1.patch.set_visible(False) + ax1.set_xlim(-0.5,n-0.5) plt.xticks([row[0] for row in dat],[row[1] for row in dat],rotation=45) 
plt.title('$$opts{title}{$id}') plt.savefig('$img.png') @@ -1810,6 +1841,126 @@ sub plot_substitutions "; } +sub plot_vaf +{ + my ($opts,$id,$type) = @_; + + my @vals = get_values($opts,$id,'VAF'); + if ( !@vals ) { return; } + + my @sum = (); + for my $row (@vals) + { + my @cnt = split(/,/,$type eq 'snv' ? $$row[1] : $$row[2]); + for (my $i=0; $i<@cnt; $i++) + { + $sum[$i] += $cnt[$i]; + } + } + + my $fh = $$opts{plt_fh}; + my $img = "vaf.$type.$id"; + + tprint $fh, " + dat = [ + "; + for (my $i=0; $i<@sum; $i++) { tprint $fh, "\t[".1.0*$i/(scalar @sum).",$sum[$i]],\n"; } + tprint $fh, "]\n"; + tprint $fh, " + + if plot_vaf_$type: + fig = plt.figure(figsize=($$opts{img_width},$$opts{img_height})) + ax1 = fig.add_subplot(111) + wd = 0.7 # fraction of dx distance + min_dx = None + for i in range(len(dat)-1): + if min_dx==None or min_dx > abs(dat[i+1][0]-dat[i][0]): min_dx = abs(dat[i+1][0]-dat[i][0]) + if min_dx==None: min_dx = 1 + wd = min_dx*wd + ax1.bar([x[0] for x in dat],[x[1] for x in dat],wd) #,**plt_args) + + ax1.set_ylabel('Count') + ax1.set_xlabel('Variant Allele Frequency') + ax1.ticklabel_format(style='sci', scilimits=(-2,2), axis='y') + + ax1.spines['top'].set_visible(False) + ax1.spines['right'].set_visible(False) + ax1.get_xaxis().tick_bottom() + ax1.get_yaxis().tick_left() + ax1.spines['bottom'].set_color('grey') + ax1.spines['left'].set_color('grey') + mpl.rcParams['text.color'] = '555555' + ax1.patch.set_visible(False) + + plt.subplots_adjust(right=0.95,bottom=0.15) + plt.title('$$opts{title}{$id}') + plt.savefig('$img.png') + if img_fmt != 'png': plt.savefig('$img.' + img_fmt) + plt.close() + "; +} + +sub plot_vaf25_by_sample +{ + my ($opts,$id,$type) = @_; + + my @vals = get_values($opts,$id,'VAF'); + if ( !@vals ) { return; } + + my @vaf = (); + for my $row (@vals) + { + my @cnt = split(/,/,$type eq 'snv' ? 
$$row[1] : $$row[2]); + my $sum = 0; + my $sum25 = 0; + for (my $i=0; $i<@cnt; $i++) + { + if ( 1.0*$i/(scalar @cnt) < 0.25 ) { $sum25 += $cnt[$i] } + $sum += $cnt[$i]; + } + push @vaf, $sum ? $sum25/$sum : 0; + } + + my $fh = $$opts{plt_fh}; + my $img = "vaf25.$type.$id"; + + tprint $fh, " + dat = [ + "; + for (my $i=0; $i<@vaf; $i++) { tprint $fh, "\t[$i,$vaf[$i]],\n"; } + tprint $fh, "]\n"; + tprint $fh, " + + if plot_vaf25_$type: + fig = plt.figure(figsize=(2*$$opts{img_width},$$opts{img_height}*0.7)) + ax1 = fig.add_subplot(111) + ax1.plot([row[0] for row in dat], [row[1] for row in dat], 'o', color='$$opts{id2col}[$id]',mec='$$opts{id2col}[$id]') + ax1.set_ylabel('nVAF<0.25') + ax1.set_ylim(-0.1,1.1) + if sample_names: + plt.xticks([int(row[0]) for row in dat],[row[7] for row in dat],**sample_font) + plt.subplots_adjust(**sample_margins) + else: + plt.subplots_adjust(right=0.98,left=0.07,bottom=0.17) + ax1.set_xlabel('Sample ID') + + ax1.spines['top'].set_visible(False) + ax1.spines['right'].set_visible(False) + ax1.get_xaxis().tick_bottom() + ax1.get_yaxis().tick_left() + ax1.spines['bottom'].set_color('grey') + ax1.spines['left'].set_color('grey') + mpl.rcParams['text.color'] = '555555' + ax1.patch.set_visible(False) + + plt.title('$$opts{title}{$id}') + plt.savefig('$img.png') + if img_fmt != 'png': plt.savefig('$img.' 
+ img_fmt) + plt.close() + "; +} + + sub singletons { my ($opts,$id) = @_; @@ -2118,6 +2269,8 @@ sub create_pdf } tprint $tex, fmt_slide3v($opts, "tstv_by_sample", 'Ts/Tv by sample'); + tprint $tex, fmt_slide3v($opts, "vaf25.snv", 'Fraction of SNVs with VAF$<$25\% by sample'); + tprint $tex, fmt_slide3v($opts, "vaf25.indel", 'Fraction of indels with VAF$<$25\% by sample'); tprint $tex, fmt_slide3v($opts, "hets_by_sample", 'Hets vs non-ref Homs by sample'); tprint $tex, fmt_slide3v($opts, "singletons_by_sample", 'Singletons by sample {\normalsize(hets and homs)}'); tprint $tex, fmt_slide3v($opts, "dp_by_sample", 'Average depth by sample'); @@ -2193,6 +2346,8 @@ sub create_pdf tprint $tex, fmt_slide3h($opts, "depth", 'Depth distribution'); tprint $tex, fmt_slide3h($opts, "hwe", 'Number of HETs by AF'); tprint $tex, fmt_slide3h($opts, "substitutions", 'Substitution types'); + tprint $tex, fmt_slide3h($opts, "vaf.snv", 'SNV Variant Allele Frequency'); + tprint $tex, fmt_slide3h($opts, "vaf.indel", 'Indel Variant Allele Frequency'); #tprint $tex, fmt_slide3h($opts, "irc_by_af", 'Indel Repeat Consistency by AF'); #tprint $tex, fmt_slide3h($opts, "irc_by_rlen", 'Indel Consistency by Repeat Type'); @@ -2203,7 +2358,7 @@ sub create_pdf my $cmd = "$engine $tex_file >$$opts{logfile} 2>&1"; print STDERR "Creating PDF: $cmd\n" unless !$$opts{verbose}; system($cmd); - if ( $? ) { error("The command exited with non-zero status, please consult the output of $engine: $$opts{dir}$$opts{logfile}\n\n"); } + if ( $? ) { error("The command exited with non-zero status, please consult the output of $engine: $$opts{dir}/$$opts{logfile}\n\n"); } print STDERR "Finished: $$opts{dir}/$pdf_file\n" unless !$$opts{verbose}; } diff --git a/mpileup.c b/mpileup.c index 9b21b1873..d42a6a360 100644 --- a/mpileup.c +++ b/mpileup.c @@ -1,6 +1,6 @@ /* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools - Copyright (C) 2008-2022 Genome Research Ltd. 
+ Copyright (C) 2008-2023 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. Author: Heng Li @@ -101,6 +101,8 @@ typedef struct { int indels_v20; int argc; char **argv; + int write_index; + char *index_fn; } mplp_conf_t; typedef struct { @@ -489,37 +491,43 @@ static void mplp_realn(int n, int *n_plp, const bam_pileup1_t **plp, if ((flag & MPLP_REALN_PARTIAL) && nt > 15 && ncig > 1) { // Left & right cigar op match. int lr = b->core.l_qseq > 500; - int lm = 0, rm = 0, k; + int lm = 0, rm = 0, k, nm = 0; for (k = 0; k < ncig; k++) { int cop = bam_cigar_op(cig[k]); if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP)) continue; if (cop == BAM_CMATCH || cop == BAM_CDIFF || - cop == BAM_CEQUAL) + cop == BAM_CEQUAL) { lm += bam_cigar_oplen(cig[k]); - else + nm++; + } else { break; + } } - for (k = ncig-1; k >= 0; k--) { - int cop = bam_cigar_op(cig[k]); - if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP)) + // if everything is a match (or sequence (mis)match) then move on + // because we don't have an indel in the middle + if (nm != ncig) { + for (k = ncig-1; k >= 0; k--) { + int cop = bam_cigar_op(cig[k]); + if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP)) + continue; + + if (cop == BAM_CMATCH || cop == BAM_CDIFF || + cop == BAM_CEQUAL) + rm += bam_cigar_oplen(cig[k]); + else + break; + } + + if (lm >= REALN_DIST*4 && rm >= REALN_DIST*4) continue; - if (cop == BAM_CMATCH || cop == BAM_CDIFF || - cop == BAM_CEQUAL) - rm += bam_cigar_oplen(cig[k]); - else - break; + if (lm >= REALN_DIST && rm >= REALN_DIST && + has_clip < (0.15+0.05*(nt>20))*nt) + continue; } - - if (lm >= REALN_DIST*4 && rm >= REALN_DIST*4) - continue; - - if (lm >= REALN_DIST && rm >= REALN_DIST && - has_clip < (0.15+0.05*(nt>20))*nt) - continue; } if (b->core.l_qseq > 500) { @@ -849,6 +857,7 @@ static int mpileup(mplp_conf_t *conf) for (i=0; ibcf_hdr, smpl[i]); if ( bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr)!=0 ) error("[%s] Error: failed to write the 
header to %s\n",__func__,conf->output_fname?conf->output_fname:"standard output"); + if ( conf->write_index && init_index(conf->bcf_fp,conf->bcf_hdr,conf->output_fname,&conf->index_fn)<0 ) error("Error: failed to initialise index for %s\n",conf->output_fname); conf->bca = bcf_call_init(-1., conf->min_baseQ, conf->max_baseQ, conf->delta_baseQ); @@ -958,6 +967,15 @@ static int mpileup(mplp_conf_t *conf) bcf_destroy1(conf->bcf_rec); if (conf->bcf_fp) { + if ( conf->write_index ) + { + if ( bcf_idx_save(conf->bcf_fp)<0 ) + { + if ( hts_close(conf->bcf_fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,conf->output_fname); + error("Error: cannot write to index %s\n",conf->index_fn); + } + free(conf->index_fn); + } if ( hts_close(conf->bcf_fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,conf->output_fname); bcf_hdr_destroy(conf->bcf_hdr); bcf_call_destroy(conf->bca); @@ -1227,6 +1245,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n" " 'z' compressed VCF; 'v' uncompressed VCF; 0-9 compression level [v]\n" " --threads INT Use multithreading with INT worker threads [0]\n" + " --write-index Automatically index the output files [off]\n" "\n" "SNP/INDEL genotype likelihoods options:\n" " -X, --config STR Specify platform specific profiles (see below)\n" @@ -1375,6 +1394,7 @@ int main_mpileup(int argc, char *argv[]) {"seed", required_argument, NULL, 13}, {"ambig-reads", required_argument, NULL, 14}, {"ar", required_argument, NULL, 14}, + {"write-index",no_argument,NULL,21}, {NULL, 0, NULL, 0} }; while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) { @@ -1497,6 +1517,7 @@ int main_mpileup(int argc, char *argv[]) } break; case 20: mplp.indels_v20 = 1; break; + case 21: mplp.write_index = 1; break; case 'A': use_orphan = 1; break; case 'F': mplp.min_frac = atof(optarg); break; case 'm': mplp.min_support = 
atoi(optarg); break; diff --git a/plugins/add-variantkey.c b/plugins/add-variantkey.c index 1b1bce8b2..af9efd7a8 100644 --- a/plugins/add-variantkey.c +++ b/plugins/add-variantkey.c @@ -2,7 +2,7 @@ Copyright (C) 2017-2018 GENOMICS plc. - Author: Nicola Asuni + Author: Nicola Asuni Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/plugins/contrast.c b/plugins/contrast.c index 71d9d3d45..624bfeead 100644 --- a/plugins/contrast.c +++ b/plugins/contrast.c @@ -1,19 +1,19 @@ /* The MIT License - Copyright (c) 2018-2021 Genome Research Ltd. + Copyright (c) 2018-2023 Genome Research Ltd. Author: Petr Danecek - + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE @@ -69,6 +69,8 @@ typedef struct int ncontrol_gts, mcontrol_gts, ntotal, nskipped, ntested, ncase_al, ncase_gt; kstring_t case_als_smpl, case_gts_smpl; int max_AC, nals[4]; // nals: number of control-ref, control-alt, case-ref and case-alt alleles in the region + char *index_fn; + int write_index; } args_t; @@ -81,7 +83,7 @@ const char *about(void) static const char *usage_text(void) { - return + return "\n" "About: Runs a basic association test, per-site or in a region, and checks for novel alleles and\n" " genotypes in two groups of samples. Adds the following INFO annotations:\n" @@ -108,6 +110,7 @@ static const char *usage_text(void) " -t, --targets REG Similar to -r but streams rather than index-jumps\n" " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n" " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n" + " --write-index Automatically index the output files [off]\n" "\n" "Example:\n" " # Test if any of the samples a,b is different from the samples c,d,e\n" @@ -233,6 +236,7 @@ static void init_data(args_t *args) args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->hdr_out,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); if ( args->max_AC_str ) { @@ -251,6 +255,15 @@ static void init_data(args_t *args) static void destroy_data(args_t *args) { bcf_hdr_destroy(args->hdr_out); + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. 
%s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); free(args->case_als_smpl.s); free(args->case_gts_smpl.s); @@ -314,7 +327,7 @@ static int process_record(args_t *args, bcf1_t *rec) for (j=0; j 31 ) { @@ -353,7 +366,7 @@ static int process_record(args_t *args, bcf1_t *rec) for (j=0; j 31 ) { @@ -365,7 +378,7 @@ static int process_record(args_t *args, bcf1_t *rec) args->nskipped++; return -1; } - if ( !(control_als & (1<annots & PRINT_NASSOC ) bcf_update_info_int32(args->hdr_out, rec, "NASSOC", nals, 4); - if ( args->case_als_smpl.l ) + if ( args->case_als_smpl.l ) { bcf_update_info_string(args->hdr_out, rec, "NOVELAL", args->case_als_smpl.s); args->ncase_al++; } - if ( args->case_gts_smpl.l ) + if ( args->case_gts_smpl.l ) { bcf_update_info_string(args->hdr_out, rec, "NOVELGT", args->case_gts_smpl.s); args->ncase_gt++; @@ -472,13 +485,14 @@ int run(int argc, char **argv) {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, {"targets-overlap",required_argument,NULL,4}, + {"write-index",no_argument,NULL,5}, {NULL,0,NULL,0} }; int c; char *tmp; while ((c = getopt_long(argc, argv, "O:o:i:e:r:R:t:T:0:1:a:f:",loptions,NULL)) >= 0) { - switch (c) + switch (c) { case 1 : args->force_samples = 1; break; case 'f': args->max_AC_str = optarg; break; @@ -522,6 +536,7 @@ int run(int argc, char **argv) args->targets_overlap = parse_overlap_option(optarg); if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; + case 5 : args->write_index = 1; break; case 'h': case '?': default: error("%s", usage_text()); break; diff --git a/plugins/fill-tags.c b/plugins/fill-tags.c index 740313f5f..b9f9b6908 100644 --- a/plugins/fill-tags.c +++ b/plugins/fill-tags.c @@ -473,6 +473,10 @@ uint32_t parse_tags(args_t *args, const char *str) if ( 
!strcasecmp(tags[i],"all") ) { flag |= ~(SET_END|SET_TYPE); + // include F_MISSING as part of 'all', which requires explicitly + // initialising it as a filter expression not just setting a + // bitfield flag. + flag |= parse_func(args,"F_MISSING=F_MISSING","F_MISSING"); args->warned = ~(SET_END|SET_TYPE); args->unpack |= BCF_UN_FMT; } diff --git a/plugins/gvcfz.c b/plugins/gvcfz.c index d9ddb6643..abb25d997 100644 --- a/plugins/gvcfz.c +++ b/plugins/gvcfz.c @@ -1,5 +1,5 @@ -/* - Copyright (C) 2017-2021 Genome Research Ltd. +/* + Copyright (C) 2017-2023 Genome Research Ltd. Author: Petr Danecek @@ -9,10 +9,10 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE @@ -78,6 +78,8 @@ typedef struct char **argv, *region, *target, *fname, *output_fname, *keep_tags; bcf_hdr_t *hdr_in, *hdr_out; bcf_srs_t *sr; + char *index_fn; + int write_index; } args_t; @@ -88,18 +90,19 @@ const char *about(void) static const char *usage_text(void) { - return + return "\n" "About: Compress gVCF file by resizing gVCF blocks according to specified criteria.\n" "\n" "Usage: bcftools +gvcfz [Options]\n" "Plugin options:\n" - " -a, --trim-alt-alleles trim alternate alleles not seen in the genotypes\n" - " -e, --exclude exclude sites for which the expression is true\n" - " -i, --include include sites for which the expression is true\n" - " -g, --group-by EXPR group gVCF blocks according to the expression\n" - " -o, --output FILE write gVCF output to the FILE\n" + " -a, --trim-alt-alleles Trim alternate alleles not seen in the genotypes\n" + " -e, --exclude Exclude sites for which the expression is true\n" + " -i, --include Include sites for which the expression is true\n" + " -g, --group-by EXPR Group gVCF blocks according to the expression\n" + " -o, --output FILE Write gVCF output to the FILE\n" " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n" + " --write-index Automatically index the output files [off]\n" "Examples:\n" " # Compress blocks by GQ and DP. 
Multiple blocks separated by a semicolon can be defined\n" " bcftools +gvcfz input.bcf -g'PASS:GQ>60 & DP<20; PASS:GQ>40 & DP<15; Flt1:QG>20; Flt2:-'\n" @@ -136,7 +139,7 @@ static void init_groups(args_t *args) beg = ++end; while ( *end && *end!=';' ) end++; char tmp = *end; *end = 0; - if ( strcmp(flt,"PASS") ) + if ( strcmp(flt,"PASS") ) { bcf_hdr_printf(args->hdr_out, "##FILTER=", flt, hdr_str); if (bcf_hdr_sync(args->hdr_out) < 0) @@ -174,6 +177,15 @@ static void destroy_data(args_t *args) free(args->grp); if ( args->filter ) filter_destroy(args->filter); + if ( args->write_index ) + { + if ( bcf_idx_save(args->fh_out)<0 ) + { + if ( hts_close(args->fh_out)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(args->fh_out)!=0 ) error("failed to close %s\n", args->output_fname); bcf_sr_destroy(args->sr); @@ -203,7 +215,7 @@ static void flush_block(args_t *args, bcf1_t *rec) if ( bcf_update_format_int32(args->hdr_out,gvcf->rec,"PL",&gvcf->pl,3) != 0 ) error("Could not update FORMAT/PL at %s:%"PRId64"\n", bcf_seqname(args->hdr_out,gvcf->rec),(int64_t) gvcf->rec->pos+1); } - if ( gvcf->grp < args->ngrp && args->grp[gvcf->grp].flt_id >= 0 ) + if ( gvcf->grp < args->ngrp && args->grp[gvcf->grp].flt_id >= 0 ) bcf_add_filter(args->hdr_out, gvcf->rec, args->grp[gvcf->grp].flt_id); if ( bcf_write(args->fh_out, args->hdr_out, gvcf->rec)!=0 ) error("Failed to write the header\n"); @@ -323,13 +335,14 @@ int run(int argc, char **argv) {"stats",required_argument,NULL,'s'}, {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, + {"write-index",no_argument,NULL,1}, {NULL,0,NULL,0} }; int c; char *tmp; while ((c = getopt_long(argc, argv, "vr:R:t:T:o:O:g:i:e:a",loptions,NULL)) >= 0) { - switch (c) + switch (c) { case 'a': args->trim_alts = 1; break; case 'e': @@ -358,6 +371,7 @@ int run(int argc, char 
**argv) if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); } break; + case 1 : args->write_index = 1; break; case 'h': case '?': default: error("%s", usage_text()); break; @@ -385,6 +399,7 @@ int run(int argc, char **argv) set_wmode(wmode,args->output_type,args->output_fname,args->clevel); args->fh_out = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( bcf_hdr_write(args->fh_out, args->hdr_out)!=0 ) error("Failed to write the header\n"); + if ( args->write_index && init_index(args->fh_out,args->hdr_out,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); while ( bcf_sr_next_line(args->sr) ) process_gvcf(args); flush_block(args, NULL); diff --git a/plugins/isecGT.c b/plugins/isecGT.c index c31af38ec..d83e8fdf8 100644 --- a/plugins/isecGT.c +++ b/plugins/isecGT.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2016-2021 Genome Research Ltd. + Copyright (C) 2016-2023 Genome Research Ltd. 
Author: Petr Danecek @@ -45,6 +45,8 @@ typedef struct bcf_srs_t *sr; bcf_hdr_t *hdr_a, *hdr_b; htsFile *out_fh; + char *index_fn; + int write_index; } args_t; @@ -67,6 +69,7 @@ static const char *usage_text(void) " -R, --regions-file FILE Restrict to regions listed in a file\n" " -t, --targets REGION Similar to -r but streams rather than index-jumps\n" " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n" + " --write-index Automatically index the output files [off]\n" "\n"; } @@ -84,6 +87,7 @@ int run(int argc, char **argv) {"targets-file",required_argument,NULL,'T'}, {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, + {"write-index",no_argument,NULL,1}, {NULL,0,NULL,0} }; int c; @@ -115,6 +119,7 @@ int run(int argc, char **argv) case 'R': args->regions_list = optarg; args->regions_is_file = 1; break; case 't': args->targets_list = optarg; break; case 'T': args->targets_list = optarg; args->targets_is_file = 1; break; + case 1 : args->write_index = 1; break; case 'h': case '?': default: error("%s", usage_text()); break; @@ -146,6 +151,7 @@ int run(int argc, char **argv) args->out_fh = hts_open(args->output_fname ? 
args->output_fname : "-", wmode); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( bcf_hdr_write(args->out_fh, args->hdr_a)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->hdr_a,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); while ( bcf_sr_next_line(args->sr) ) { @@ -179,7 +185,15 @@ int run(int argc, char **argv) if ( dirty ) bcf_update_genotypes(args->hdr_a, line_a, args->arr_a, ngt_a*smpl->n); if ( bcf_write(args->out_fh, args->hdr_a, line_a)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); } - + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(args->out_fh)!=0 ) error("Close failed: %s\n",args->output_fname); smpl_ilist_destroy(smpl); bcf_sr_destroy(args->sr); diff --git a/plugins/mendelian.c b/plugins/mendelian.c deleted file mode 100644 index 65a65fe1c..000000000 --- a/plugins/mendelian.c +++ /dev/null @@ -1,689 +0,0 @@ -/* The MIT License - - Copyright (c) 2015-2022 Genome Research Ltd. 
- - Author: Petr Danecek - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
- - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include // for isatty -#include "../bcftools.h" -#include "../regidx.h" - -#define MODE_COUNT 1 -#define MODE_LIST_GOOD 2 -#define MODE_LIST_BAD 4 -#define MODE_DELETE 8 -#define MODE_ANNOTATE 16 -#define MODE_LIST_SKIP 32 - -typedef struct -{ - int nok, nbad; - int imother,ifather,ichild; -} -trio_t; - -typedef struct -{ - int mpl, fpl, cpl; // ploidies - mother, father, child - int mal, fal; // expect an allele from mother and father -} -rule_t; - -typedef struct _args_t -{ - regidx_t *rules; - regitr_t *itr, *itr_ori; - bcf_hdr_t *hdr; - htsFile *out_fh; - int32_t *gt_arr; - int mode; - int ngt_arr, nrec; - trio_t *trios; - int ntrios, mtrios; - int output_type, clevel; - char *output_fname; - bcf_srs_t *sr; -} -args_t; - -static args_t args; -static int parse_rules(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr); -static bcf1_t *process(bcf1_t *rec); - -const char *about(void) -{ - return "Count Mendelian consistent / inconsistent genotypes [DEPRECATED, use mendelian2 instead]\n"; -} - -typedef struct -{ - const char *alias, *about, *rules; -} -rules_predef_t; - -static rules_predef_t rules_predefs[] = -{ - { .alias = "GRCh37", - .about = "Human Genome reference assembly GRCh37 / hg19, both chr naming conventions", - .rules = - " X:1-60000 M/M + F > M\n" - " X:1-60000 M/M + F > M/F\n" - " X:2699521-154931043 M/M + F > M\n" - " X:2699521-154931043 M/M + F > M/F\n" - " Y:1-59373566 . + F > F\n" - " MT:1-16569 M + F > M\n" - "\n" - " chrX:1-60000 M/M + F > M\n" - " chrX:1-60000 M/M + F > M/F\n" - " chrX:2699521-154931043 M/M + F > M\n" - " chrX:2699521-154931043 M/M + F > M/F\n" - " chrY:1-59373566 . 
+ F > F\n" - " chrM:1-16569 M + F > M\n" - }, - { .alias = "GRCh38", - .about = "Human Genome reference assembly GRCh38 / hg38, both chr naming conventions", - .rules = - " X:1-9999 M/M + F > M\n" - " X:1-9999 M/M + F > M/F\n" - " X:2781480-155701381 M/M + F > M\n" - " X:2781480-155701381 M/M + F > M/F\n" - " Y:1-57227415 . + F > F\n" - " MT:1-16569 M + F > M\n" - "\n" - " chrX:1-9999 M/M + F > M\n" - " chrX:1-9999 M/M + F > M/F\n" - " chrX:2781480-155701381 M/M + F > M\n" - " chrX:2781480-155701381 M/M + F > M/F\n" - " chrY:1-57227415 . + F > F\n" - " chrM:1-16569 M + F > M\n" - }, - { - .alias = NULL, - .about = NULL, - .rules = NULL, - } -}; - - -const char *usage(void) -{ - return - "\n" - "About: Count Mendelian consistent / inconsistent genotypes. Note that this plugin is DEPRECATED and\n" - " will not be supported in the future. Please use the newer plugin +mendelian2 instead.\n" - "Usage: bcftools +mendelian [Options]\n" - "Options:\n" - " -c, --count Count the number of consistent sites [DEPRECATED, use `-m c` instead]\n" - " -d, --delete Delete inconsistent genotypes (set to \"./.\") [DEPRECATED, use `-m d` instead]\n" - " -l, --list [+x] List consistent (+) or inconsistent (x) sites [DEPRECATED, use `-m +` or `-m x` instead]\n" - " -m, --mode [+acdux] Output mode (the default is `-m c`):\n" - " + .. list consistent sites\n" - " a .. add INFO/MERR annotation with the number of inconsistent samples\n" - " c .. print counts, a text summary with the number of errors per trio\n" - " d .. delete inconsistent genotypes (set to \"./.\")\n" - " u .. list uninformative sites\n" - " x .. list inconsistent sites\n" - " -o, --output FILE Write output to a file [standard output]\n" - " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n" - " -r, --rules ASSEMBLY[?] Predefined rules, 'list' to print available settings, append '?' 
for details\n" - " -R, --rules-file FILE Inheritance rules, see example below\n" - " -t, --trio M,F,C Names of mother, father and the child\n" - " -T, --trio-file FILE List of trios, one per line (mother,father,child)\n" - " -p, --ped FILE PED file\n" - "\n" - "Example:\n" - " # Default inheritance patterns, override with -r\n" - " # region maternal_ploidy + paternal > offspring\n" - " X:1-60000 M/M + F > M\n" - " X:1-60000 M/M + F > M/F\n" - " X:2699521-154931043 M/M + F > M\n" - " X:2699521-154931043 M/M + F > M/F\n" - " Y:1-59373566 . + F > F\n" - " MT:1-16569 M + F > M\n" - "\n" - " bcftools +mendelian in.vcf -t Mother,Father,Child -c\n" - "\n"; -} - -regidx_t *init_rules(args_t *args, char *alias) -{ - const rules_predef_t *rules = rules_predefs; - if ( !alias ) alias = "GRCh37"; - - int detailed = 0, len = strlen(alias); - if ( alias[len-1]=='?' ) { detailed = 1; alias[len-1] = 0; } - - while ( rules->alias && strcasecmp(alias,rules->alias) ) rules++; - - if ( !rules->alias ) - { - fprintf(stderr,"\nPRE-DEFINED INHERITANCE RULES\n\n"); - fprintf(stderr," * Columns are: CHROM:BEG-END MATERNAL_PLOIDY + PATERNAL_PLOIDY > OFFSPRING\n"); - fprintf(stderr," * Coordinates are 1-based inclusive.\n\n"); - rules = rules_predefs; - while ( rules->alias ) - { - fprintf(stderr,"%s\n .. %s\n\n", rules->alias,rules->about); - if ( detailed ) - fprintf(stderr,"%s\n", rules->rules); - rules++; - } - fprintf(stderr,"Run as --rules (e.g. --rules GRCh37).\n"); - fprintf(stderr,"To see the detailed ploidy definition, append a question mark (e.g. --rules GRCh37?).\n"); - fprintf(stderr,"\n"); - exit(-1); - } - else if ( detailed ) - { - fprintf(stderr,"%s", rules->rules); - exit(-1); - } - return regidx_init_string(rules->rules, parse_rules, NULL, sizeof(rule_t), &args); -} - -static int parse_rules(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr) -{ - // e.g. "Y:1-59373566 . + F > . 
# daugther" - - // eat any leading spaces - char *ss = (char*) line; - while ( *ss && isspace(*ss) ) ss++; - if ( !*ss ) return -1; // skip empty lines - - // chromosome name, beg, end - char *tmp, *se = ss; - while ( se[1] && !isspace(se[1]) ) se++; - while ( se > ss && isdigit(*se) ) se--; - if ( *se!='-' ) error("Could not parse the region: %s\n", line); - *end = strtol(se+1, &tmp, 10) - 1; - if ( tmp==se+1 ) error("Could not parse the region:%s\n",line); - while ( se > ss && *se!=':' ) se--; - *beg = strtol(se+1, &tmp, 10) - 1; - if ( tmp==se+1 ) error("Could not parse the region:%s\n",line); - - *chr_beg = ss; - *chr_end = se-1; - - // skip region - while ( *ss && !isspace(*ss) ) ss++; - while ( *ss && isspace(*ss) ) ss++; - - rule_t *rule = (rule_t*) payload; - memset(rule, 0, sizeof(rule_t)); - - // maternal ploidy - se = ss; - while ( *se && !isspace(*se) ) se++; - int err = 0; - if ( se - ss == 1 ) - { - if ( *ss=='M' ) rule->mpl = 1; - else if ( *ss=='.' ) rule->mpl = 0; - else err = 1; - } - else if ( se - ss == 3 ) - { - if ( !strncmp(ss,"M/M",3) ) rule->mpl = 2; - else err = 1; - } - else err = 1; - if ( err ) error("Could not parse the maternal ploidy, only \"M\", \"M/M\" and \".\" currently supported: %s\n",line); - - // skip "+" - while ( *se && isspace(*se) ) se++; - if ( *se != '+' ) error("Could not parse the line: %s\n",line); - se++; - while ( *se && isspace(*se) ) se++; - - // paternal ploidy - ss = se; - while ( *se && !isspace(*se) ) se++; - if ( se - ss == 1 ) - { - if ( *ss=='F' ) rule->fpl = 1; - else err = 1; - } - else err = 1; - if ( err ) error("Could not parse the paternal ploidy, only \"F\" is currently supported: %s [%s]\n",line, ss); - - // skip ">" - while ( *se && isspace(*se) ) se++; - if ( *se != '>' ) error("Could not parse the line: %s\n",line); - se++; - while ( *se && isspace(*se) ) se++; - - // ploidy in offspring - ss = se; - while ( *se && !isspace(*se) ) se++; - if ( se - ss == 3 ) - { - if ( !strncmp(ss,"M/F",3) ) { 
rule->cpl = 2; rule->fal = 1; rule->mal = 1; } - else err = 1; - } - else if ( se - ss == 1 ) - { - if ( *ss=='F' ) { rule->cpl = 1; rule->fal = 1; } - else if ( *ss=='M' ) { rule->cpl = 1; rule->mal = 1; } - else err = 1; - } - else err = 1; - if ( err ) error("Could not parse the offspring's ploidy, only \"M\", \"F\" or \"M/F\" is currently supported: %s\n",line); - - return 0; -} - -void parse_ped(args_t *args, char *fname) -{ - htsFile *fp = hts_open(fname, "r"); - if ( !fp ) error("Could not read: %s\n", fname); - - kstring_t str = {0,0,0}; - if ( hts_getline(fp, KS_SEP_LINE, &str) <= 0 ) error("Empty file: %s\n", fname); - - int moff = 0, *off = NULL; - do - { - int ncols = ksplit_core(str.s,0,&moff,&off); - if ( ncols<4 ) error("Could not parse the ped file: %s\n", str.s); - - int ifather = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[2]]); - int imother = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[3]]); - int ichild = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[1]]); - - // The code in process() makes an attempt to work with partial families, - // the support is not complete though and can lead to core dumps. Therefore - // enforcing full trios for now. 
- // if ( ( ifather<0 && imother<0 ) || ichild<0 ) continue; - if ( ifather<0 || imother<0 || ichild<0 ) continue; - - args->ntrios++; - hts_expand0(trio_t,args->ntrios,args->mtrios,args->trios); - trio_t *trios = &args->trios[args->ntrios-1]; - trios->ifather = ifather; - trios->imother = imother; - trios->ichild = ichild; - - } while ( hts_getline(fp, KS_SEP_LINE, &str)>=0 ); - if ( !args->ntrios ) error("No complete trios found in the PED and VCF\n"); - - free(str.s); - free(off); - hts_close(fp); -} - -int run(int argc, char **argv) -{ - char *trio_samples = NULL, *trio_file = NULL, *ped_fname = NULL, *rules_fname = NULL, *rules_string = NULL; - memset(&args,0,sizeof(args_t)); - args.mode = 0; - args.output_fname = "-"; - args.clevel = -1; - - static struct option loptions[] = - { - {"trio",1,0,'t'}, - {"trio-file",1,0,'T'}, - {"ped",1,0,'p'}, - {"delete",0,0,'d'}, - {"list",1,0,'l'}, - {"mode",1,0,'m'}, - {"count",0,0,'c'}, - {"rules",1,0,'r'}, - {"rules-file",1,0,'R'}, - {"output",required_argument,NULL,'o'}, - {"output-type",required_argument,NULL,'O'}, - {0,0,0,0} - }; - int c; - char *tmp; - while ((c = getopt_long(argc, argv, "?ht:T:p:l:m:cdr:R:o:O:",loptions,NULL)) >= 0) - { - switch (c) - { - case 'o': args.output_fname = optarg; break; - case 'O': - switch (optarg[0]) { - case 'b': args.output_type = FT_BCF_GZ; break; - case 'u': args.output_type = FT_BCF; break; - case 'z': args.output_type = FT_VCF_GZ; break; - case 'v': args.output_type = FT_VCF; break; - default: - { - args.clevel = strtol(optarg,&tmp,10); - if ( *tmp || args.clevel<0 || args.clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); - } - }; - if ( optarg[1] ) - { - args.clevel = strtol(optarg+1,&tmp,10); - if ( *tmp || args.clevel<0 || args.clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); - } - break; - case 'R': rules_fname = optarg; break; - case 'r': rules_string = optarg; break; - case 'd': - args.mode |= MODE_DELETE; - 
fprintf(stderr,"Warning: -d will be deprecated, please use `-m d` instead.\n"); - break; - case 'c': - args.mode |= MODE_COUNT; - fprintf(stderr,"Warning: -c will be deprecated, please use `-m c` instead.\n"); - break; - case 'l': - if ( !strcmp("+",optarg) ) args.mode |= MODE_LIST_GOOD; - else if ( !strcmp("x",optarg) ) args.mode |= MODE_LIST_BAD; - else error("The argument not recognised: --list %s\n", optarg); - fprintf(stderr,"Warning: -l will be deprecated, please use -m instead.\n"); - break; - case 'm': - if ( !strcmp("+",optarg) ) args.mode |= MODE_LIST_GOOD; - else if ( !strcmp("x",optarg) ) args.mode |= MODE_LIST_BAD; - else if ( !strcmp("a",optarg) ) args.mode |= MODE_ANNOTATE; - else if ( !strcmp("d",optarg) ) args.mode |= MODE_DELETE; - else if ( !strcmp("c",optarg) ) args.mode |= MODE_COUNT; - else if ( !strcmp("u",optarg) ) args.mode |= MODE_LIST_SKIP; - else error("The argument not recognised: --mode %s\n", optarg); - break; - case 't': trio_samples = optarg; break; - case 'T': trio_file = optarg; break; - case 'p': ped_fname = optarg; break; - case 'h': - case '?': - default: error("%s",usage()); break; - } - } - if ( rules_fname ) - args.rules = regidx_init(rules_fname, parse_rules, NULL, sizeof(rule_t), &args); - else - args.rules = init_rules(&args, rules_string); - if ( !args.rules ) return -1; - args.itr = regitr_init(args.rules); - args.itr_ori = regitr_init(args.rules); - - char *fname = NULL; - if ( optind>=argc || argv[optind][0]=='-' ) - { - if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin - else error("%s",usage()); - } - else - fname = argv[optind]; - - if ( !trio_samples && !trio_file && !ped_fname ) error("Expected the -t/T or -p option\n"); - if ( !args.mode ) args.mode = MODE_COUNT; - if ( args.mode&MODE_DELETE && !(args.mode&(MODE_LIST_GOOD|MODE_LIST_BAD|MODE_LIST_SKIP)) ) args.mode |= MODE_LIST_GOOD|MODE_LIST_BAD|MODE_LIST_SKIP; - if ( args.mode&MODE_ANNOTATE && 
!(args.mode&(MODE_LIST_GOOD|MODE_LIST_BAD|MODE_LIST_SKIP)) ) args.mode |= MODE_LIST_GOOD|MODE_LIST_BAD|MODE_LIST_SKIP; - - FILE *log_fh = stderr; - if ( args.mode==MODE_COUNT ) - { - log_fh = strcmp("-",args.output_fname) ? fopen(args.output_fname,"w") : stdout; - if ( !log_fh ) error("Error: cannot write to %s\n", args.output_fname); - } - - args.sr = bcf_sr_init(); - if ( !bcf_sr_add_reader(args.sr, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args.sr->errnum)); - args.hdr = bcf_sr_get_header(args.sr, 0); - if ( args.mode!=MODE_COUNT ) - { - char wmode[8]; - set_wmode(wmode,args.output_type,args.output_fname,args.clevel); - args.out_fh = hts_open(args.output_fname ? args.output_fname : "-", wmode); - if ( args.out_fh == NULL ) error("Can't write to \"%s\": %s\n", args.output_fname, strerror(errno)); - if ( args.mode&MODE_ANNOTATE ) - bcf_hdr_append(args.hdr, "##INFO="); - if ( bcf_hdr_write(args.out_fh, args.hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args.output_fname); - } - - int i, n = 0; - char **list; - if ( trio_samples ) - { - args.ntrios = 1; - args.trios = (trio_t*) calloc(1,sizeof(trio_t)); - list = hts_readlist(trio_samples, 0, &n); - if ( n!=3 ) error("Expected three sample names with -t\n"); - args.trios[0].imother = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[0]); - args.trios[0].ifather = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[1]); - args.trios[0].ichild = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[2]); - if ( args.trios[0].imother<0 ) error("The sample is not present in the VCF: %s\n",list[0]); - if ( args.trios[0].ifather<0 ) error("The sample is not present in the VCF: %s\n",list[1]); - if ( args.trios[0].ichild<0 ) error("The sample is not present in the VCF: %s\n",list[2]); - for (i=0; ierrcode ) error("TODO: Unchecked error (%d), exiting\n",line->errcode); - if ( args.out_fh && bcf_write1(args.out_fh, args.hdr, line)!=0 ) error("[%s] Error: cannot write to 
%s\n", __func__,args.output_fname); - } - } - if ( args.out_fh && hts_close(args.out_fh)!=0 ) error("Error: close failed\n"); - - if ( args.mode & MODE_COUNT ) - { - fprintf(log_fh,"# [1]nOK\t[2]nBad\t[3]nSkipped\t[4]Trio (mother,father,child)\n"); - for (i=0; inok,trio->nbad,args.nrec-(trio->nok+trio->nbad), - bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->imother), - bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ifather), - bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ichild) - ); - } - } - if ( log_fh!=stderr && log_fh!=stdout && fclose(log_fh) ) error("Error: close failed for %s\n", args.output_fname); - - free(args.gt_arr); - free(args.trios); - regitr_destroy(args.itr); - regitr_destroy(args.itr_ori); - regidx_destroy(args.rules); - bcf_sr_destroy(args.sr); - return 0; -} - -static void warn_ploidy(bcf1_t *rec) -{ - static int warned = 0; - if ( warned ) return; - fprintf(stderr,"Incorrect ploidy at %s:%"PRId64", skipping the trio. (This warning is printed only once.)\n", bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1); - warned = 1; -} - -bcf1_t *process(bcf1_t *rec) -{ - bcf1_t *dflt = args.mode&MODE_LIST_SKIP ? rec : NULL; - args.nrec++; - - if ( rec->n_allele > 63 ) return dflt; // we use 64bit bitmask below - - int ngt = bcf_get_genotypes(args.hdr, rec, &args.gt_arr, &args.ngt_arr); - if ( ngt<0 ) return dflt; - if ( ngt!=2*bcf_hdr_nsamples(args.hdr) && ngt!=bcf_hdr_nsamples(args.hdr) ) return dflt; - ngt /= bcf_hdr_nsamples(args.hdr); - - int itr_set = regidx_overlap(args.rules, bcf_seqname(args.hdr,rec),rec->pos,rec->pos, args.itr_ori); - - int i, nbad = 0, ngood = 0, needs_update = 0; - for (i=0; iimother<0 ) - { - a = bcf_gt_missing; - b = bcf_int32_vector_end; - } - else - { - a = args.gt_arr[ngt*trio->imother]; - b = ngt==2 ? args.gt_arr[ngt*trio->imother+1] : bcf_int32_vector_end; - } - if ( trio->ifather<0 ) - { - c = bcf_gt_missing; - d = bcf_int32_vector_end; - } - else - { - c = args.gt_arr[ngt*trio->ifather]; - d = ngt==2 ? 
args.gt_arr[ngt*trio->ifather+1] : bcf_int32_vector_end; - } - e = args.gt_arr[ngt*trio->ichild]; - f = ngt==2 ? args.gt_arr[ngt*trio->ichild+1] : bcf_int32_vector_end; - - // skip sites with missing data in child - if ( bcf_gt_is_missing(e) || bcf_gt_is_missing(f) ) continue; - - uint64_t mother = 0, father = 0,child1,child2; - - int is_ok = 0; - if ( !itr_set ) - { - if ( f==bcf_int32_vector_end ) { warn_ploidy(rec); continue; } - - // All M,F,C genotypes are diploid. Missing data are considered consistent. - child1 = 1<mal || !rule->fal ) continue; // wrong rule (haploid), but this is a diploid GT - if ( !mother ) mother = child1|child2; - if ( !father ) father = child1|child2; - if ( (mother&child1 && father&child2) || (mother&child2 && father&child1) ) is_ok = 1; - continue; - } - if ( rule->mal ) - { - if ( mother && !(child1&mother) ) continue; - } - if ( rule->fal ) - { - if ( father && !(child1&father) ) continue; - } - is_ok = 1; - } - } - if ( is_ok ) - { - trio->nok++; - ngood++; - } - else - { - trio->nbad++; - nbad++; - if ( args.mode&MODE_DELETE ) - { - args.gt_arr[ngt*trio->imother] = bcf_gt_missing; - if ( b!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->imother+1] = bcf_gt_missing; // should be always true - args.gt_arr[ngt*trio->ifather] = bcf_gt_missing; - if ( d!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->ifather+1] = bcf_gt_missing; - args.gt_arr[ngt*trio->ichild] = bcf_gt_missing; - if ( f!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->ichild+1] = bcf_gt_missing; - needs_update = 1; - } - } - } - - if ( needs_update && bcf_update_genotypes(args.hdr,rec,args.gt_arr,ngt*bcf_hdr_nsamples(args.hdr)) ) - error("Could not update GT field at %s:%"PRId64"\n", bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1); - - if ( args.mode&MODE_ANNOTATE ) bcf_update_info_int32(args.hdr, rec, "MERR", &nbad, 1); - if ( args.mode&MODE_LIST_GOOD && ngood ) return rec; - if ( args.mode&MODE_LIST_BAD && nbad ) return rec; - if ( args.mode&MODE_LIST_SKIP && !ngood && 
!nbad ) return rec; - - return NULL; -} diff --git a/plugins/mendelian2.c b/plugins/mendelian2.c index f1d5c7b02..30df1186b 100644 --- a/plugins/mendelian2.c +++ b/plugins/mendelian2.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2015-2022 Genome Research Ltd. + Copyright (c) 2015-2023 Genome Research Ltd. Author: Petr Danecek @@ -114,6 +114,8 @@ typedef struct _args_t int ngt_arr; stats_t stats; // common per-site and per-sample stats int nref_only, nmany_als; // per-site stats + char *index_fn; + int write_index; } args_t; @@ -140,6 +142,7 @@ static const char *usage_text(void) " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n" " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n" " --no-version Do not append version and command line to the header\n" + " --write-index Automatically index the output files [off]\n" "\n" "Options:\n" " -m, --mode c|[adeEgmMS] Output mode, the default is `-m c`. Multiple modes can be combined in VCF/BCF\n" @@ -476,6 +479,7 @@ static void init_data(args_t *args) args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->hdr_out,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); } } @@ -488,7 +492,19 @@ static void destroy_data(args_t *args) free(args->trio); free(args->gt_arr); free(args->rule); - if ( args->out_fh && hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); + if ( args->out_fh ) + { + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. 
%s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } + if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); + } bcf_hdr_destroy(args->hdr_out); bcf_sr_destroy(args->sr); free(args); @@ -765,13 +781,9 @@ int run(int argc, char **argv) static struct option loptions[] = { - {"trio",1,0,'t'}, - {"trio-file",1,0,'T'}, - {"ped",1,0,'p'}, - {"delete",0,0,'d'}, - {"list",1,0,'l'}, + {"pfm",1,0,'p'}, + {"ped",1,0,'P'}, {"mode",1,0,'m'}, - {"count",0,0,'c'}, {"rules",1,0,1}, {"rules-file",1,0,2}, {"output",required_argument,NULL,'o'}, @@ -784,11 +796,12 @@ int run(int argc, char **argv) {"targets-overlap",required_argument,NULL,15}, {"include",required_argument,0,'i'}, {"exclude",required_argument,0,'e'}, + {"write-index",no_argument,NULL,3}, {0,0,0,0} }; int c; char *tmp; - while ((c = getopt_long(argc, argv, "?ht:T:p:m:o:O:i:e:t:T:r:R:",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "?hp:P:m:o:O:i:e:t:T:r:R:",loptions,NULL)) >= 0) { switch (c) { @@ -843,6 +856,7 @@ int run(int argc, char **argv) case 'p': args->pfm = optarg; break; case 1 : args->rules_str = optarg; break; case 2 : args->rules_fname = optarg; break; + case 3 : args->write_index = 1; break; case 'h': case '?': default: error("%s",usage_text()); break; diff --git a/plugins/prune.c b/plugins/prune.c index 57ae83a5a..1593e7306 100644 --- a/plugins/prune.c +++ b/plugins/prune.c @@ -1,5 +1,5 @@ -/* - Copyright (C) 2017-2021 Genome Research Ltd. +/* + Copyright (C) 2017-2023 Genome Research Ltd. 
Author: Petr Danecek @@ -9,10 +9,10 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -69,6 +69,8 @@ typedef struct htsFile *out_fh; bcf_hdr_t *hdr; bcf_srs_t *sr; + char *index_fn; + int write_index; } args_t; @@ -79,7 +81,7 @@ const char *about(void) static const char *usage_text(void) { - return + return "\n" "About: Prune sites by missingness or linkage disequilibrium.\n" "\n" @@ -103,6 +105,7 @@ static const char *usage_text(void) " -t, --targets REGION Similar to -r but streams rather than index-jumps\n" " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n" " -w, --window INT[bp|kb|Mb] The window size of INT sites or INT bp/kb/Mb for the -n/-l options [100kb]\n" + " --write-index Automatically index the output files [off]\n" "Examples:\n" " # Discard records with r2 bigger than 0.6 in a window of 1000 sites\n" " bcftools +prune -m 0.6 -w 1000 input.bcf -Ob -o output.bcf\n" @@ -183,6 +186,7 @@ static void init_data(args_t *args) } } if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); args->ld_filter_id = -1; if ( args->ld_filter && strcmp(".",args->ld_filter) ) args->ld_filter_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, args->ld_filter); @@ -211,6 +215,15 @@ static void destroy_data(args_t 
*args) { if ( args->filter ) filter_destroy(args->filter); + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); vcfbuf_destroy(args->vcfbuf); bcf_sr_destroy(args->sr); @@ -303,20 +316,22 @@ int run(int argc, char **argv) {"nsites-per-win",required_argument,NULL,'n'}, {"nsites-per-win-mode",required_argument,NULL,'N'}, {"window",required_argument,NULL,'w'}, + {"write-index",no_argument,NULL,4}, {NULL,0,NULL,0} }; int c; char *tmp; while ((c = getopt_long(argc, argv, "vr:R:t:T:m:o:O:a:f:i:e:n:N:w:k",loptions,NULL)) >= 0) { - switch (c) + switch (c) { case 1 : args->rand_missing = 1; break; case 2 : args->af_tag = optarg; break; - case 3 : + case 3 : args->rseed = strtol(optarg,&tmp,10); if ( tmp==optarg || *tmp ) error("Could not parse: --random-seed %s\n", optarg); break; + case 4 : args->write_index = 1; break; case 'k': args->keep_sites = 1; break; case 'e': if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); @@ -324,7 +339,7 @@ int run(int argc, char **argv) case 'i': if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; - case 'a': + case 'a': { int n, i; char **tag = hts_readlist(optarg,0,&n); @@ -352,9 +367,9 @@ int run(int argc, char **argv) free(tag); args->ld_mask |= LD_ANNOTATE; } - break; + break; case 'f': args->ld_filter = optarg; break; - case 'n': + case 'n': args->nsites = strtod(optarg,&tmp); if ( tmp==optarg || *tmp ) error("Could not parse: --nsites-per-win %s\n", optarg); break; @@ -364,7 +379,7 @@ int run(int argc, char 
**argv) else if ( !strcasecmp(optarg,"rand") ) args->nsites_mode = optarg; else error("The mode \"%s\" is not recognised\n",optarg); break; - case 'm': + case 'm': if ( !strncasecmp("R2=",optarg,3) ) { args->ld_max_set[VCFBUF_LD_IDX_R2] = 1; @@ -388,7 +403,7 @@ int run(int argc, char **argv) if ( !tmp || *tmp ) error("Could not parse: --max %s\n", optarg); args->ld_mask |= LD_SET_MAX; break; - case 'w': + case 'w': args->ld_win = strtod(optarg,&tmp); if ( !*tmp ) break; if ( tmp==optarg ) error("Could not parse: --window %s\n", optarg); @@ -398,9 +413,9 @@ int run(int argc, char **argv) else error("Could not parse: --window %s\n", optarg); break; case 'T': args->target_is_file = 1; // fall-through - case 't': args->target = optarg; break; + case 't': args->target = optarg; break; case 'R': args->region_is_file = 1; // fall-through - case 'r': args->region = optarg; break; + case 'r': args->region = optarg; break; case 'o': args->output_fname = optarg; break; case 'O': switch (optarg[0]) { @@ -439,7 +454,7 @@ int run(int argc, char **argv) else args->fname = argv[optind]; init_data(args); - + while ( bcf_sr_next_line(args->sr) ) process(args); flush(args,1); diff --git a/plugins/remove-overlaps.c b/plugins/remove-overlaps.c index 2e8e6b0dd..bd0304497 100644 --- a/plugins/remove-overlaps.c +++ b/plugins/remove-overlaps.c @@ -1,5 +1,5 @@ -/* - Copyright (C) 2017-2021 Genome Research Ltd. +/* + Copyright (C) 2017-2023 Genome Research Ltd. Author: Petr Danecek @@ -9,10 +9,10 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
- + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -52,6 +52,8 @@ typedef struct htsFile *out_fh; bcf_hdr_t *hdr; bcf_srs_t *sr; + char *index_fn; + int write_index; } args_t; @@ -62,7 +64,7 @@ const char *about(void) static const char *usage_text(void) { - return + return "\n" "About: Remove overlapping variants.\n" "\n" @@ -80,6 +82,7 @@ static const char *usage_text(void) " -R, --regions-file FILE restrict to regions listed in a file\n" " -t, --targets REGION similar to -r but streams rather than index-jumps\n" " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" + " --write-index Automatically index the output files [off]\n" "\n"; } @@ -100,6 +103,7 @@ static void init_data(args_t *args) args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); args->vcfbuf = vcfbuf_init(args->hdr, 0); if ( args->rmdup ) @@ -114,6 +118,15 @@ static void destroy_data(args_t *args) { if ( args->filter ) filter_destroy(args->filter); + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->output_fname); vcfbuf_destroy(args->vcfbuf); bcf_sr_destroy(args->sr); @@ -168,13 +181,14 @@ int run(int argc, char **argv) {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, {"verbose",no_argument,NULL,'v'}, + {"write-index",no_argument,NULL,1}, {NULL,0,NULL,0} }; int c; char *tmp; while ((c = getopt_long(argc, argv, "r:R:t:T:o:O:i:e:vpd",loptions,NULL)) >= 0) { - switch (c) + switch (c) { case 'd': args->rmdup = 1; break; case 'p': args->print_overlaps = 1; break; @@ -186,9 +200,9 @@ int run(int argc, char **argv) if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'T': args->target_is_file = 1; // fall-through - case 't': args->target = optarg; break; + case 't': args->target = optarg; break; case 'R': args->region_is_file = 1; // fall-through - case 'r': args->region = optarg; break; + case 'r': args->region = optarg; break; case 'o': args->output_fname = optarg; break; case 'O': switch (optarg[0]) { @@ -208,6 +222,7 @@ int run(int argc, char **argv) if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); } break; + case 1 : args->write_index = 1; break; case 'h': case '?': default: error("%s", usage_text()); break; @@ -223,7 +238,7 @@ int run(int argc, char **argv) else args->fname = argv[optind]; init_data(args); - + while ( bcf_sr_next_line(args->sr) ) process(args); flush(args,1); diff --git a/plugins/scatter.c b/plugins/scatter.c index af358fc4f..e42edd877 100644 --- a/plugins/scatter.c +++ b/plugins/scatter.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (C) 2020-2021 Giulio Genovese + Copyright (C) 2020-2023 Giulio Genovese Author: Giulio Genovese @@ -39,6 +39,7 @@ typedef struct { htsFile *fh; // output file handle char *fname; // output file name + char *index_fn; } subset_t; @@ -60,6 +61,7 @@ typedef 
struct char **hts_opts; int nhts_opts; bcf_hdr_t *hdr; + int write_index; } args_t; @@ -95,6 +97,7 @@ static const char *usage_text(void) " -x, --extra STRING Output records not overlapping listed regions in separate file\n" " -p, --prefix STRING Prepend string to output VCF names\n" " --hts-opts LIST Low-level options to pass to HTSlib, e.g. block_size=32768\n" + " --write-index Automatically index the output files [off]\n" "\n" "Examples:\n" " # Scatter a VCF file by shards with 10000 variants each\n" @@ -200,6 +203,7 @@ static void open_set(subset_t *set, args_t *args) if ( args->record_cmd_line ) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_plugin"); } if ( bcf_hdr_write(set->fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__, args->str.s); + if ( args->write_index && init_index(set->fh,args->hdr,args->str.s,&set->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->str.s); } static void init_data(args_t *args) @@ -260,7 +264,17 @@ static void destroy_data(args_t *args) for (i=0; insets; i++) { subset_t *set = &args->sets[i]; - if (set->fname) { + if (set->fname) + { + if ( args->write_index ) + { + if ( bcf_idx_save(set->fh)<0 ) + { + if ( hts_close(set->fh)!=0 ) error("Error: close failed .. %s\n", set->fname); + error("Error: cannot write to index %s\n", set->index_fn); + } + free(set->index_fn); + } if ( hts_close(set->fh)!=0 ) error("Error: close failed .. 
%s\n", set->fname); free(set->fname); } @@ -338,6 +352,7 @@ int run(int argc, char **argv) {"extra",required_argument,NULL,'x'}, {"prefix",required_argument,NULL,'p'}, {"hts-opts",required_argument,NULL,5}, + {"write-index",no_argument,NULL,6}, {NULL,0,NULL,0} }; int c; @@ -395,6 +410,7 @@ int run(int argc, char **argv) case 'x': args->extra = optarg; break; case 'p': args->prefix = optarg; break; case 5 : args->hts_opts = hts_readlist(optarg, 0, &args->nhts_opts); break; + case 6 : args->write_index = 1; break; case 'h': case '?': default: error("%s", usage_text()); break; diff --git a/plugins/split-vep.c b/plugins/split-vep.c index e5dfeb14a..82c1ff0bf 100644 --- a/plugins/split-vep.c +++ b/plugins/split-vep.c @@ -127,6 +127,8 @@ typedef struct int allow_undef_tags; int genes_mode; // --gene-list +FILE, one of GENES_* mode, prioritize or restrict int print_header; + char *index_fn; + int write_index; } args_t; @@ -208,8 +210,8 @@ static const char *usage_text(void) " -d, --duplicate Output per transcript/allele consequences on a new line rather rather than\n" " as comma-separated fields on a single line\n" " -f, --format STR Create non-VCF output; similar to `bcftools query -f` but drops lines w/o consequence\n" - " -g, --gene-list [+]FILE Consider only genes listed in FILE, or prioritize if FILE is prefixed with \"+\"\n" - " --gene-list-fields LIST Use these fields when matching genes from the -g list [SYMBOL,Gene,gene]\n" + " -g, --gene-list [+]FILE Consider only features listed in FILE, or prioritize if FILE is prefixed with \"+\"\n" + " --gene-list-fields LIST Fields to match against by the -g list, by default gene names [SYMBOL,Gene,gene]\n" " -H, --print-header Print header\n" " -l, --list Parse the VCF header and list the annotation fields\n" " -p, --annot-prefix STR Before doing anything else, prepend STR to all CSQ fields to avoid tag name conflicts\n" @@ -220,8 +222,8 @@ static const char *usage_text(void) " -S, --severity -|FILE Pass \"-\" to print the 
default severity scale or FILE to override\n" " the default scale\n" " -u, --allow-undef-tags Print \".\" for undefined tags\n" - " -x, --drop-sites Drop sites with none of the consequences matching the severity specified by -s.\n" - " This switch is intended for use with VCF/BCF output (i.e. -f not given).\n" + " -x, --drop-sites Drop sites without consequences (the default with -f)\n" + " -X, --keep-sites Do not drop sites without consequences (the default without -f)\n" "Common options:\n" " -e, --exclude EXPR Exclude sites and samples for which the expression is true\n" " -i, --include EXPR Include sites and samples for which the expression is true\n" @@ -234,6 +236,7 @@ static const char *usage_text(void) " -t, --targets REG Similar to -r but streams rather than index-jumps\n" " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n" " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n" + " --write-index Automatically index the output files [off]\n" "\n" "Examples:\n" " # List available fields of the INFO/CSQ annotation\n" @@ -546,6 +549,7 @@ static void parse_format_str(args_t *args) // The program was requested to extract one or more columns via -c. It can contain names, 0-based indexes or ranges of indexes static void parse_column_str(args_t *args) { + if ( args->nannot ) return; // already called from parse_filter_str int i,j; int *column = NULL; int *types = NULL; @@ -693,7 +697,6 @@ static void parse_column_str(args_t *args) // as if the user passed them via the -c option. static void parse_filter_str(args_t *args) { - int max_unpack = args->convert ? 
convert_max_unpack(args->convert) : 0; args->filter = filter_parse(args->hdr_out, args->filter_str); if ( !args->filter ) error(NULL); // this type of error would have been reported int ret = filter_status(args->filter); @@ -706,9 +709,7 @@ static void parse_filter_str(args_t *args) const char **tags = filter_list_undef_tags(args->filter, &ntags); kstring_t str; str.s = args->column_str; - str.l = str.m = strlen(str.s); - destroy_annot(args); - destroy_column2type(args); + str.l = str.m = str.s ? strlen(str.s) : 0; for (i=0; ifield2idx,tags[i],&j)!=0 ) @@ -721,11 +722,10 @@ static void parse_filter_str(args_t *args) filter_destroy(args->filter); args->filter = filter_init(args->hdr_out, args->filter_str); } - max_unpack |= filter_max_unpack(args->filter); - if ( !args->format_str ) max_unpack |= BCF_UN_FMT; // don't drop FMT fields on VCF input when VCF/BCF is output - args->sr->max_unpack = max_unpack; - if ( args->convert && (max_unpack & BCF_UN_FMT) ) - convert_set_option(args->convert, subset_samples, &args->smpl_pass); + int ntags, i; + const char **tags = filter_list_used_tags(args->filter, &ntags); + for (i=0; ivep_tag) ) args->raw_vep_request = 1; } static void init_data(args_t *args) { @@ -864,6 +864,7 @@ static void init_data(args_t *args) free(tmp); if ( args->format_str ) parse_format_str(args); // Text output, e.g. 
bcftools +split-vep -f '%Consequence\n' + if ( args->filter_str ) parse_filter_str(args); if ( args->column_str ) parse_column_str(args); // The --columns option was given, update the header if ( args->format_str ) { @@ -871,10 +872,18 @@ static void init_data(args_t *args) args->convert = convert_init(args->hdr_out, NULL, 0, args->format_str); if ( !args->convert ) error("Could not parse the expression: %s\n", args->format_str); if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1); + convert_set_option(args->convert, force_newline, 1); } - if ( args->filter_str ) parse_filter_str(args); if ( args->genes_fname ) init_gene_list(args); + int max_unpack = BCF_UN_SHR; + if ( args->convert ) max_unpack |= convert_max_unpack(args->convert); + if ( args->filter ) max_unpack |= filter_max_unpack(args->filter); + if ( !args->format_str ) max_unpack |= BCF_UN_FMT; // don't drop FMT fields on VCF input when VCF/BCF is output + args->sr->max_unpack = max_unpack; + if ( args->convert && (max_unpack & BCF_UN_FMT) ) + convert_set_option(args->convert, subset_samples, &args->smpl_pass); + free(str.s); } static void destroy_data(args_t *args) @@ -903,7 +912,19 @@ static void destroy_data(args_t *args) free(args->csq_str); if ( args->filter ) filter_destroy(args->filter); if ( args->convert ) convert_destroy(args->convert); - if ( args->fh_vcf && hts_close(args->fh_vcf)!=0 ) error("Error: close failed .. %s\n",args->output_fname); + if ( args->fh_vcf ) + { + if ( args->write_index ) + { + if ( bcf_idx_save(args->fh_vcf)<0 ) + { + if ( hts_close(args->fh_vcf)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } + if ( hts_close(args->fh_vcf)!=0 ) error("Error: close failed .. %s\n",args->output_fname); + } if ( args->fh_bgzf && bgzf_close(args->fh_bgzf)!=0 ) error("Error: close failed .. 
%s\n",args->output_fname); free(args); } @@ -1096,7 +1117,7 @@ static void filter_and_output(args_t *args, bcf1_t *rec, int severity_pass, int { if ( args->nannot ) { - if ( !updated || all_missing ) return; // the standard case: using -f to print the CSQ subfields, skipping if missing + if ( args->drop_sites && (!updated || all_missing) ) return; // the standard case: using -f to print the CSQ subfields, skipping if missing } else { @@ -1301,6 +1322,7 @@ int run(int argc, char **argv) static struct option loptions[] = { {"drop-sites",no_argument,0,'x'}, + {"keep-sites",no_argument,0,'X'}, {"all-fields",no_argument,0,'A'}, {"duplicate",no_argument,0,'d'}, {"format",required_argument,0,'f'}, @@ -1325,11 +1347,12 @@ int run(int argc, char **argv) {"targets-overlap",required_argument,NULL,4}, {"no-version",no_argument,NULL,2}, {"allow-undef-tags",no_argument,0,'u'}, + {"write-index",no_argument,NULL,6}, {NULL,0,NULL,0} }; - int c; + int c, drop_sites = -1; char *tmp; - while ((c = getopt_long(argc, argv, "o:O:i:e:r:R:t:T:lS:s:c:p:a:f:dA:xuHg:",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "o:O:i:e:r:R:t:T:lS:s:c:p:a:f:dA:xXuHg:",loptions,NULL)) >= 0) { switch (c) { @@ -1341,7 +1364,8 @@ int run(int argc, char **argv) else args->all_fields_delim = optarg; break; case 'H': args->print_header = 1; break; - case 'x': args->drop_sites = 1; break; + case 'x': drop_sites = 1; break; + case 'X': drop_sites = 0; break; case 'd': args->duplicate = 1; break; case 'f': args->format_str = strdup(optarg); break; case 'g': args->genes_fname = optarg; break; @@ -1390,12 +1414,14 @@ int run(int argc, char **argv) if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 5 : args->gene_fields_str = optarg; break; + case 6 : args->write_index = 1; break; case 'h': case '?': default: error("%s", usage_text()); break; } } - if ( args->drop_sites && args->format_str ) error("Error: the -x behavior is the default (and only supported) 
with -f\n"); + if ( drop_sites==-1 ) drop_sites = args->format_str ? 1 : 0; + args->drop_sites = drop_sites; if ( args->print_header && !args->format_str ) error("Error: the -H header printing is supported only with -f\n"); if ( args->all_fields_delim && !args->format_str ) error("Error: the -A option must be used with -f\n"); if ( args->severity && (!strcmp("?",args->severity) || !strcmp("-",args->severity)) ) error("%s", default_severity()); @@ -1440,6 +1466,7 @@ int run(int argc, char **argv) args->fh_vcf = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( args->record_cmd_line ) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_split-vep"); if ( bcf_hdr_write(args->fh_vcf, args->hdr_out)!=0 ) error("Failed to write the header to %s\n", args->output_fname); + if ( args->write_index && init_index(args->fh_vcf,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); } while ( bcf_sr_next_line(args->sr) ) process_record(args, bcf_sr_get_line(args->sr,0)); diff --git a/plugins/split.c b/plugins/split.c index a362e0ed9..011981d42 100644 --- a/plugins/split.c +++ b/plugins/split.c @@ -1,5 +1,5 @@ -/* - Copyright (C) 2017-2021 Genome Research Ltd. +/* + Copyright (C) 2017-2023 Genome Research Ltd. Author: Petr Danecek @@ -9,10 +9,10 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE @@ -52,6 +52,7 @@ typedef struct char *fname; // output file name filter_t *filter; bcf_hdr_t *hdr; + char *index_fn; } subset_t; @@ -70,6 +71,7 @@ typedef struct subset_t *sets; int nsets, nhts_opts; char **hts_opts; + int write_index; } args_t; @@ -80,7 +82,7 @@ const char *about(void) static const char *usage_text(void) { - return + return "\n" "About: Split VCF by sample, creating single- or multi-sample VCFs. The output files are named\n" " by sample names whenever possible, with the characters from the set [ \\t:/\\] replaced\n" @@ -124,6 +126,7 @@ static const char *usage_text(void) " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n" " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n" " --hts-opts LIST Low-level options to pass to HTSlib, e.g. block_size=32768\n" + " --write-index Automatically index the output files [off]\n" "\n" "Examples:\n" " # Split a VCF file\n" @@ -485,6 +488,7 @@ static void init_data(args_t *args) for (j=0; jnsmpl; j++) set->hdr->samples[j] = set->rename ? set->rename[j] : args->hdr_in->samples[set->smpl[j]]; if ( bcf_hdr_write(set->fh, set->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,str.s); + if ( args->write_index && init_index(set->fh,set->hdr,str.s,&set->index_fn)<0 ) error("Error: failed to initialise index for %s\n",str.s); if ( args->filter_str ) set->filter = filter_init(set->hdr, args->filter_str); } @@ -500,6 +504,15 @@ static void destroy_data(args_t *args) for (i=0; insets; i++) { subset_t *set = &args->sets[i]; + if ( args->write_index ) + { + if ( bcf_idx_save(set->fh)<0 ) + { + if ( hts_close(set->fh)!=0 ) error("Error: close failed .. %s\n", set->fname); + error("Error: cannot write to index %s\n", set->index_fn); + } + free(set->index_fn); + } if ( hts_close(set->fh)!=0 ) error("Error: close failed .. 
%s\n",set->fname); free(set->fname); free(set->smpl); @@ -596,7 +609,7 @@ static void process(args_t *args) bcf_unpack(rec, BCF_UN_ALL); int i; - bcf1_t *out = NULL; + bcf1_t *out = NULL; for (i=0; insets; i++) { subset_t *set = &args->sets[i]; @@ -641,13 +654,14 @@ int run(int argc, char **argv) {"groups-file",required_argument,NULL,'G'}, {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, + {"write-index",no_argument,NULL,4}, {NULL,0,NULL,0} }; int c; char *tmp; while ((c = getopt_long(argc, argv, "vr:R:t:T:o:O:i:e:k:S:G:",loptions,NULL)) >= 0) { - switch (c) + switch (c) { case 1 : args->hts_opts = hts_readlist(optarg,0,&args->nhts_opts); break; case 'k': args->keep_tags = optarg; break; @@ -658,11 +672,11 @@ int run(int argc, char **argv) if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'T': args->target = optarg; args->target_is_file = 1; break; - case 't': args->target = optarg; break; + case 't': args->target = optarg; break; case 'R': args->region = optarg; args->region_is_file = 1; break; case 'S': args->samples_fname = optarg; break; case 'G': args->groups_fname = optarg; break; - case 'r': args->region = optarg; break; + case 'r': args->region = optarg; break; case 'o': args->output_dir = optarg; break; case 'O': switch (optarg[0]) { @@ -690,6 +704,7 @@ int run(int argc, char **argv) args->targets_overlap = parse_overlap_option(optarg); if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; + case 4 : args->write_index = 1; break; case 'h': case '?': default: error("%s", usage_text()); break; @@ -708,7 +723,7 @@ int run(int argc, char **argv) if ( args->filter_logic == (FLT_EXCLUDE|FLT_INCLUDE) ) error("Only one of -i or -e can be given.\n"); init_data(args); - + while ( bcf_sr_next_line(args->sr) ) process(args); destroy_data(args); diff --git 
a/plugins/trio-dnm2.c b/plugins/trio-dnm2.c index 4783458b2..7cbf7fbcd 100644 --- a/plugins/trio-dnm2.c +++ b/plugins/trio-dnm2.c @@ -125,6 +125,8 @@ typedef struct int need_QS; int strictly_novel; priors_t priors, priors_X, priors_XX; + char *index_fn; + int write_index; } args_t; @@ -179,6 +181,7 @@ static const char *usage_text(void) " --use-NAIVE A naive calling model which uses only FMT/GT to determine DNMs\n" " --with-pAD Do not use FMT/QS but parental FMT/AD\n" " --with-pPL Do not use FMT/QS but parental FMT/PL. Equals to DNG with bugs fixed (more FPs, fewer FNs)\n" + " --write-index Automatically index the output files [off]\n" "\n" "Example:\n" " # Annotate VCF with FORMAT/DNM, run for a single trio\n" @@ -767,6 +770,7 @@ static void init_data(args_t *args) args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->hdr_out,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); if ( args->dnm_score_type & DNM_FLOAT ) args->dnm_qual_float = (float*) malloc(sizeof(*args->dnm_qual_float)*bcf_hdr_nsamples(args->hdr)); @@ -796,6 +800,15 @@ static void destroy_data(args_t *args) free(args->ad); free(args->qs); free(args->qs3); + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->output_fname); bcf_hdr_destroy(args->hdr_out); bcf_sr_destroy(args->sr); @@ -1582,6 +1595,7 @@ int run(int argc, char **argv) {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, {"targets-overlap",required_argument,NULL,15}, + {"write-index",no_argument,NULL,16}, {NULL,0,NULL,0} }; int c; @@ -1670,6 +1684,7 @@ int run(int argc, char **argv) args->targets_overlap = parse_overlap_option(optarg); if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; + case 16 : args->write_index = 1; break; case 'X': args->chrX_list_str = optarg; break; case 'u': set_option(args,optarg); break; case 'e': diff --git a/plugins/variant-distance.c b/plugins/variant-distance.c index a1aeb9aef..1d195c133 100644 --- a/plugins/variant-distance.c +++ b/plugins/variant-distance.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2022 Genome Research Ltd. + Copyright (C) 2022-2023 Genome Research Ltd. Author: Petr Danecek @@ -63,6 +63,8 @@ typedef struct bcf_hdr_t *hdr; bcf_srs_t *sr; vcfbuf_t *buf; + char *index_fn; + int write_index; } args_t; @@ -91,6 +93,7 @@ static const char *usage_text(void) " -t, --targets REGION Similar to -r but streams rather than index-jumps\n" " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n" " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n" + " --write-index Automatically index the output files [off]\n" "Examples:\n" " bcftools +variant-distance input.bcf -Ob -o output.bcf\n" "\n"; @@ -126,6 +129,7 @@ static void init_data(args_t *args) if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); 
args->buf = vcfbuf_init(args->hdr, 0); vcfbuf_set_opt(args->buf,int,VCFBUF_DUMMY,1) @@ -134,6 +138,15 @@ static void destroy_data(args_t *args) { if ( args->filter ) filter_destroy(args->filter); + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); bcf_sr_destroy(args->sr); vcfbuf_destroy(args->buf); @@ -233,6 +246,7 @@ int run(int argc, char **argv) {"targets-overlap",required_argument,NULL,2}, {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, + {"write-index",no_argument,NULL,4}, {NULL,0,NULL,0} }; int c; @@ -286,6 +300,7 @@ int run(int argc, char **argv) if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); } break; + case 4 : args->write_index = 1; break; case 'h': case '?': default: error("%s", usage_text()); break; diff --git a/plugins/variantkey-hex.c b/plugins/variantkey-hex.c index c126babfc..bb07ac992 100644 --- a/plugins/variantkey-hex.c +++ b/plugins/variantkey-hex.c @@ -2,7 +2,7 @@ Copyright (C) 2017-2018 GENOMICS plc. 
- Author: Nicola Asuni + Author: Nicola Asuni Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/reheader.c b/reheader.c index 4458f27bc..ed852173c 100644 --- a/reheader.c +++ b/reheader.c @@ -68,7 +68,8 @@ static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_see kstring_t key = {0,0,0}, val = {0,0,0}, tmp = {0,0,0}; char *chr_name = NULL, *p, *q = line + 9; // skip ##contig= char *end = q; - int nopen = 1, chr_len = 0; + int nopen = 1; + hts_pos_t chr_len = 0; while ( *end && *end!='\n' ) end++; while ( *q && *q!='\n' && nopen>0 ) { @@ -118,7 +119,7 @@ static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_see if ( !strcmp("ID",key.s) ) { if ( khash_str2int_has_key(chr_seen,val.s) ) continue; - chr_len = faidx_seq_len(fai, val.s); + chr_len = faidx_seq_len64(fai, val.s); if ( chr_len==-1 ) { free(val.s); free(key.s); free(tmp.s); @@ -136,7 +137,7 @@ static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_see if ( quoted ) kputc('"',&tmp); } if ( !chr_name ) return end; - ksprintf(dst,"##contig=",chr_name,chr_len,tmp.l ? tmp.s : ""); + ksprintf(dst,"##contig=",chr_name,chr_len,tmp.l ? 
tmp.s : ""); free(key.s); free(val.s); free(tmp.s); return q; } @@ -211,7 +212,7 @@ static void update_from_fai(args_t *args) for (i=0; i\n",faidx_iseq(fai,i),faidx_seq_len(fai,faidx_iseq(fai,i))); + ksprintf(&hdr_txt_new,"##contig=\n",faidx_iseq(fai,i),faidx_seq_len64(fai,faidx_iseq(fai,i))); } kputs(tmp+1,&hdr_txt_new); @@ -699,7 +700,7 @@ int main_reheader(int argc, char *argv[]) int c; args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; - + static struct option loptions[] = { {"temp-prefix",1,0,'T'}, diff --git a/test/annotate.33.out b/test/annotate.33.out new file mode 100644 index 000000000..b651a0f9b --- /dev/null +++ b/test/annotate.33.out @@ -0,0 +1,41 @@ +##fileformat=VCFv4.1 +##FILTER= +##INFO= +##FORMAT= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FILTER= +##FILTER= +##contig= +##contig= +##contig= +##contig= +##test= +##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta +##readme=AAAAAA +##readme=BBBBBB +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B +1 3000150 . C T 59.2 PASS AN=4;AC=2;XXX GT:GQ 0/1:245 0/1:245 +1 3000151 . C T 59.2 PASS AN=4;AC=2;XXX GT:DP:GQ 0/1:32:245 0/1:32:245 +1 3062915 id3D GTTT G 12.9 q10 DP4=1,2,3,4;AN=4;AC=2;INDEL;STR=test;XXX GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 0/1:409:35:-20,-5,-20 +1 3062915 idSNP G T,C 12.6 test TEST=5;DP4=1,2,3,4;AN=3;AC=1,1;XXX GT:TT:GQ:DP:GL 0/1:0,1:409:35:-20,-5,-20,-20,-5,-20 2:0,1:409:35:-20,-5,-20 +1 3106154 . CAAA C 342 PASS AN=4;AC=2;XXX GT:GQ:DP 0/1:245:32 0/1:245:32 +1 3106154 . C CT 59.2 PASS AN=4;AC=2;XXX GT:GQ:DP 0/1:245:32 0/1:245:32 +1 3157410 . GA G 90.6 q10 AN=4;AC=4;XXX GT:GQ:DP 1/1:21:21 1/1:21:21 +1 3162006 . GAA G 60.2 PASS AN=4;AC=2;XXX GT:GQ:DP 0/1:212:22 0/1:212:22 +1 3177144 . G T 45 PASS AN=4;AC=2;XXX GT:GQ:DP 0/0:150:30 1/1:150:30 +1 3177144 . G . 45 PASS AN=4;AC=0;XXX GT:GQ:DP 0/0:150:30 0/0:150:30 +1 3184885 . 
TAAAA TA,T 61.5 PASS AN=4;AC=2,2;XXX GT:GQ:DP 1/2:12:10 1/2:12:10 +2 3199812 . G GTT,GT 82.7 PASS AN=4;AC=2,2;XXX GT:GQ:DP 1/2:322:26 1/2:322:26 +3 3212016 . CTT C,CT 79 PASS AN=4;AC=2,2;XXX GT:GQ:DP 1/2:91:26 1/2:91:26 +4 3258448 . TACACACAC T 59.9 PASS AN=4;AC=2;XXX GT:GQ:DP 0/1:325:31 0/1:325:31 +4 4000000 . T A,C 59.9 PASS AN=4;AC=2,0;XXX GT:GQ:DP 0/1:325:31 0/1:325:31 +4 4000001 . T A 59.9 PASS AN=4;AC=2;XXX GT:GQ:DP 0/1:325:31 0/1:325:31 diff --git a/test/annotate.olap.2.out b/test/annotate.olap.2.out index eab0ef4fc..8453306df 100644 --- a/test/annotate.olap.2.out +++ b/test/annotate.olap.2.out @@ -5,6 +5,7 @@ ##ALT= ##INFO= ##INFO= +##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO -1 10 . C . . END=19;DB=cnv10_15 +1 10 . C . . END=19;DB=cnv10_15;XXX 1 20 . C . . END=30 diff --git a/test/annotate34.out b/test/annotate34.out new file mode 100644 index 000000000..819aa0368 --- /dev/null +++ b/test/annotate34.out @@ -0,0 +1,14 @@ +##fileformat=VCFv4.2 +##FILTER= +##contig= +##FORMAT= +##FORMAT= +##FILTER= +##FILTER= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT syndip +chr21 8914240 . gttccattccattccattcaattccattccattgcattccattccattccattcca G 30 HET2 SVTYPE=DEL;SVLEN=55;END=8914295 GT:AD 0|.:3,1 +chr21 8914680 . tattccattccattcc TATTCCATTCCATTCCATTCCATTCCATTCCATTCCATTCCATTCTAGTTGATTCCATTCCATTCCATCCCGTTCCATTCCATTCCGTTACTTTCTATTCCATTCCATTCCATTCC 30 HET2 SVTYPE=INS;SVLEN=100;END=8914681 GT:AD 0|.:1,1 +chr21 8914690 . c CATTCCATTCCATTCCATTCCATTCTAGTTGATTCCATTCCATTCCATCCCGTTCCATTCCATTCCGTTACTTTCT 30 HET2 SVTYPE=INS;SVLEN=75;END=8914691 GT:AD 0|.:2,1 diff --git a/test/annotate34.vcf b/test/annotate34.vcf new file mode 100644 index 000000000..4b60b89ba --- /dev/null +++ b/test/annotate34.vcf @@ -0,0 +1,13 @@ +##fileformat=VCFv4.2 +##FILTER= +##contig= +##FORMAT= +##FORMAT= +##FILTER= +##FILTER= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT syndip +chr21 8914240 . 
gttccattccattccattcaattccattccattgcattccattccattccattcca G 30 HET2 SVTYPE=DEL;SVLEN=55 GT:AD 0|.:3,1 +chr21 8914680 . tattccattccattcc TATTCCATTCCATTCCATTCCATTCCATTCCATTCCATTCCATTCTAGTTGATTCCATTCCATTCCATCCCGTTCCATTCCATTCCGTTACTTTCTATTCCATTCCATTCCATTCC 30 HET2 SVTYPE=INS;SVLEN=100 GT:AD 0|.:1,1 +chr21 8914690 . c CATTCCATTCCATTCCATTCCATTCTAGTTGATTCCATTCCATTCCATCCCGTTCCATTCCATTCCGTTACTTTCT 30 HET2 SVTYPE=INS;SVLEN=75 GT:AD 0|.:2,1 diff --git a/test/annots34.tab b/test/annots34.tab new file mode 100644 index 000000000..06b217627 --- /dev/null +++ b/test/annots34.tab @@ -0,0 +1,3 @@ +chr21 8914240 8914240 8914295 +chr21 8914680 8914680 8914681 +chr21 8914690 8914690 8914691 diff --git a/test/concat.5.3.out b/test/concat.5.3.out new file mode 100644 index 000000000..a6eeb44e3 --- /dev/null +++ b/test/concat.5.3.out @@ -0,0 +1,8 @@ +##fileformat=VCFv4.2 +##FILTER= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +chr1 1 . A C . . . +chr1 2 . C G . . . +chr1 3 . G T . . . +chr1 4 . T A . . . 
diff --git a/test/consensus.19.out b/test/consensus.19.out new file mode 100644 index 000000000..84a241db0 --- /dev/null +++ b/test/consensus.19.out @@ -0,0 +1,20 @@ +>1:2-501 +TAC:A:AT:Tga::t+++AT:AaAAAGAACATAACCTACGTATCAACTAAAGTGGTTGTT +TG:AGAAAAGGAAGACTTAAAAAGAGTCAGTACTAACCTACATAATATATACAATGTTCA +TTAAATAATAAAATGAGCTCATCATACTTAGGTCATCATAAATATATCTGAAATTCACAA +ATATTGATCAAATGGTAAAATAGACAAGTAGATTTTAATAGGTTAAACAATTACTGATTC +TCTTGAAAGAATAAATTTAATATGAGACCTATTTCATTATAATGAACTCACAAATTAGAA +ACTTCACACTGGGGGCTGGAGAGATGGCTCAGTAGTTAAGAACACTGACTGCTCTTCTGA +AGGTCCTGAGTTCAAATCCCAGCAACCACATGGTGACTTACAACCATCTGTAATGACATC +TGATGCCCTCTGGTGTGTCTGAAGACAGCTACAGTGTACTTACATAAAATAATAAATAAA +TCTTTAAAAACAAAAAAAAAGAA +>2 +gaagatcttttccttattaaggatctgaagctctgtagatttgtattctattaaacatgg +A:::attagtgattttccatattctttaagtcattttagagtaatgtgttcttaagat:: +:tcagaaaaacaaaaacttgtgctttcctgtttgaaaaacaaacagctgtggggaatgG+ ++++++++tgtcgggacagcctttttatA----------aaataatgttgaggctttgata +cgtcaaagttatatttcaaatggaatcacttagacctcgtttctgagtgtcaatggccat +attggggAtttgctgctgccaatgacaGcacaccctgggaatgccccaactacttactac +aaagcagtgttacatggagaagatcttcaagagtctttttgctagatctttccttggctt +ttgatgtgactcctctcaataaaatccacagtaatatagtgagtggtctcctgctccaaa +ccagtatt:cagacacagttaatccagac diff --git a/test/consensus.21.fa b/test/consensus.21.fa new file mode 100644 index 000000000..e81503b5b --- /dev/null +++ b/test/consensus.21.fa @@ -0,0 +1,6 @@ +>17 +ACGTACGT +>18 +ACGTACGT +>19 +ACGTACGT diff --git a/test/consensus.21.vcf b/test/consensus.21.vcf new file mode 100644 index 000000000..0c4dbb702 --- /dev/null +++ b/test/consensus.21.vcf @@ -0,0 +1,11 @@ +##fileformat=VCFv4.2 +##reference=file://some/path/human_g1k_v37.fasta +##contig= +##contig= +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT a +19 2 . C A . . . GT 0/0 +19 3 . G C . . . GT 0/0 +19 4 . T C . . . GT 0/1 +19 5 . A C . . . 
GT 1/1 diff --git a/test/consensus21.1.out b/test/consensus21.1.out new file mode 100644 index 000000000..7633e2ff2 --- /dev/null +++ b/test/consensus21.1.out @@ -0,0 +1,6 @@ +>17 +ACGTACGT +>18 +ACGTACGT +>19 +ACGYCCGT diff --git a/test/consensus7.vcf b/test/consensus7.vcf index 21d657291..bc27f1f69 100644 --- a/test/consensus7.vcf +++ b/test/consensus7.vcf @@ -4,9 +4,9 @@ ##ALT= ##contig= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA001 -1 2 . C A . . . GT 0|1 -1 3 . G A . . . GT 0|1 -1 4 . T A . . . GT 0|1 -1 6 . C A . . . GT 0/1 -1 7 . G A . . . GT 0/1 -1 8 . T A . . . GT 0/1 +1 2 . C A . . . GT 0|1|0|1 +1 3 . G A . . . GT 0|1|0|1 +1 4 . T A . . . GT 0|1|0|1 +1 6 . C A . . . GT 0/1/0/1 +1 7 . G A . . . GT 0/1/0/1 +1 8 . T A . . . GT 0/1/0/1 diff --git a/test/convert.tsv b/test/convert.tsv new file mode 100644 index 000000000..e9f7c7e24 --- /dev/null +++ b/test/convert.tsv @@ -0,0 +1,24 @@ +rs001 1 2 A A +rs002 1 10 A G +rs003 1 14 A G +rs004 1 24 T C +rs005 1 44 C G +rs006 1 53 G G +rs007 1 60 G G +rs008 1 62 C C +rs009 1 75 A A +rs010 1 80 G G +rs011 1 89 T T +rs013 1 99 C C +rs014 1 102 G G +rs015 1 112 T T +rs016 2 5 C C +rs017 2 11 C T +rs018 2 16 C C +rs019 2 20 G G +rs020 2 33 C T +rs021 2 39 A A +rs022 2 44 C C +rs023 2 48 C C +rs024 2 55 A A +rs025 2 59 C T diff --git a/test/convert.tsv.vcf b/test/convert.tsv.vcf new file mode 100644 index 000000000..fec089b66 --- /dev/null +++ b/test/convert.tsv.vcf @@ -0,0 +1,31 @@ +##fileformat=VCFv4.2 +##FILTER= +##contig= +##contig= +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 2 . A . . . . +1 10 . A G . . . +1 14 . A G . . . +1 24 . T C . . . +1 44 . C G . . . +1 53 . G . . . . +1 60 . G . . . . +1 62 . C . . . . +1 75 . A . . . . +1 80 . G . . . . +1 89 . T . . . . +1 99 . C . . . . +1 102 . G . . . . +1 112 . T . . . . +2 5 . C . . . . +2 11 . C T . . . +2 16 . C . . . . +2 20 . G . . . . +2 33 . C T . . . +2 39 . A . . . . +2 44 . C . . . . +2 48 . C . . . . +2 55 . A . . . . +2 59 . 
C T . . . diff --git a/test/csq.chr.out b/test/csq.chr.out new file mode 100644 index 000000000..81d3f79be --- /dev/null +++ b/test/csq.chr.out @@ -0,0 +1,2 @@ +. +missense|gtrV|gtrV|protein_coding|+|1L>1I|1T>A diff --git a/test/csq.nchr.fa b/test/csq.nchr.fa new file mode 100644 index 000000000..f6f29f3ee --- /dev/null +++ b/test/csq.nchr.fa @@ -0,0 +1,2 @@ +>1 +TTAAGGCTGTTTTTTTATTAATGTCATCGTCCATCCTGCAGGGTTGAACTTGAAAGAATA diff --git a/test/csq.nchr.gff b/test/csq.nchr.gff new file mode 100644 index 000000000..0a70b0749 --- /dev/null +++ b/test/csq.nchr.gff @@ -0,0 +1,3 @@ +1 . gene 1 60 . + . ID=gene:gtrV;Name=gtrV;gene_biotype=protein_coding +1 . transcript 1 60 . + . ID=transcript:gtrV;Parent=gene:gtrV;gene_biotype=protein_coding +1 . CDS 1 60 . + . Parent=transcript:gtrV diff --git a/test/csq.nchr.vcf b/test/csq.nchr.vcf new file mode 100644 index 000000000..d29f2f6dd --- /dev/null +++ b/test/csq.nchr.vcf @@ -0,0 +1,6 @@ +##fileformat=VCFv4.2 +##reference=dummy.fa +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT smpl +1 1 . T A . . . GT 0|1 diff --git a/test/csq.ychr.fa b/test/csq.ychr.fa new file mode 100644 index 000000000..98aea0857 --- /dev/null +++ b/test/csq.ychr.fa @@ -0,0 +1,2 @@ +>chr1 +TTAAGGCTGTTTTTTTATTAATGTCATCGTCCATCCTGCAGGGTTGAACTTGAAAGAATA diff --git a/test/csq.ychr.gff b/test/csq.ychr.gff new file mode 100644 index 000000000..99326c5b8 --- /dev/null +++ b/test/csq.ychr.gff @@ -0,0 +1,3 @@ +chr1 . gene 1 60 . + . ID=gene:gtrV;Name=gtrV;gene_biotype=protein_coding +chr1 . transcript 1 60 . + . ID=transcript:gtrV;Parent=gene:gtrV;gene_biotype=protein_coding +chr1 . CDS 1 60 . + . Parent=transcript:gtrV diff --git a/test/csq.ychr.vcf b/test/csq.ychr.vcf new file mode 100644 index 000000000..0f1139e2a --- /dev/null +++ b/test/csq.ychr.vcf @@ -0,0 +1,6 @@ +##fileformat=VCFv4.2 +##reference=dummy.fa +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT smpl +chr1 1 . T A . . . 
GT 0|1 diff --git a/test/csq/EDUMMY0003/long-del.txt b/test/csq/EDUMMY0003/long-del.txt index 3a8fd3fcc..2e3222073 100644 --- a/test/csq/EDUMMY0003/long-del.txt +++ b/test/csq/EDUMMY0003/long-del.txt @@ -1,3 +1,3 @@ 28503 TTCACACCTGATGTGCGTCC T 3_prime_utr|PCGF3|ENST00000430644|NMD,3_prime_utr|PCGF3|ENST00000440452|NMD,5_prime_utr|PCGF3|ENST00000521023|protein_coding,frameshift|PCGF3|ENST00000400151|protein_coding|+|157SSHLMCVLTPDESVLWC*>157SSHLMSLCFGV|28503TTCACACCTGATGTGCGTCC>T -28503 TTCACACCTGATGTGCGTCC T 3_prime_utr|PCGF3|ENST00000430644|NMD,3_prime_utr|PCGF3|ENST00000440452|NMD,5_prime_utr|PCGF3|ENST00000521023|protein_coding,frameshift|PCGF3|ENST00000400151|protein_coding|+|157SSHLMCVLTPDESVLWC*>157SSHLMSLCFGV|28503TTCACACCTGATGTGCGTCC>T +28503 TTCACACCTGATGTGCGTCC T 3_prime_utr&NMD_transcript|PCGF3|ENST00000430644|NMD,3_prime_utr&NMD_transcript|PCGF3|ENST00000440452|NMD,5_prime_utr|PCGF3|ENST00000521023|protein_coding,frameshift|PCGF3|ENST00000400151|protein_coding|+|157SSHLMCVLTPDESVLWC*>157SSHLMSLCFGV|28503TTCACACCTGATGTGCGTCC>T diff --git a/test/csq/ENSMUST00000121418/filter-problem.2.cmd.out b/test/csq/ENSMUST00000121418/filter-problem.2.cmd.out index ce3b6f10c..32d88a799 100644 --- a/test/csq/ENSMUST00000121418/filter-problem.2.cmd.out +++ b/test/csq/ENSMUST00000121418/filter-problem.2.cmd.out @@ -1,3 +1,3 @@ -25105 G A 5_prime_utr|Rab3il1|ENSMUST00000121418|protein_coding,5_prime_utr|Rab3il1|ENSMUST00000131407|protein_coding,5_prime_utr|Rab3il1|ENSMUST00000137637|protein_coding,5_prime_utr|Rab3il1|ENSMUST00000149967|protein_coding,synonymous|Rab3il1|ENSMUST00000113161|protein_coding|+|52E|25105G>A,synonymous|Rab3il1|ENSMUST00000117641|protein_coding|+|52E|25105G>A,synonymous|Rab3il1|ENSMUST00000144788|NMD|+|52E|25105G>A +25105 G A 
5_prime_utr|Rab3il1|ENSMUST00000121418|protein_coding,5_prime_utr|Rab3il1|ENSMUST00000131407|protein_coding,5_prime_utr|Rab3il1|ENSMUST00000137637|protein_coding,5_prime_utr|Rab3il1|ENSMUST00000149967|protein_coding,synonymous&NMD_transcript|Rab3il1|ENSMUST00000144788|NMD|+|52E|25105G>A,synonymous|Rab3il1|ENSMUST00000113161|protein_coding|+|52E|25105G>A,synonymous|Rab3il1|ENSMUST00000117641|protein_coding|+|52E|25105G>A 25496 ACAAA ACAAAGCAAA . 25500 AACAA AACAAGACAA intron|Rab3il1||NMD,intron|Rab3il1||protein_coding diff --git a/test/csq/ENSMUST00000121418/filter-problem.3.cmd.out b/test/csq/ENSMUST00000121418/filter-problem.3.cmd.out index 328acbb64..a93bcbd79 100644 --- a/test/csq/ENSMUST00000121418/filter-problem.3.cmd.out +++ b/test/csq/ENSMUST00000121418/filter-problem.3.cmd.out @@ -1,3 +1,3 @@ -25105 G A 5_prime_utr|Rab3il1|ENSMUST00000121418|protein_coding,5_prime_utr|Rab3il1|ENSMUST00000131407|protein_coding,5_prime_utr|Rab3il1|ENSMUST00000137637|protein_coding,5_prime_utr|Rab3il1|ENSMUST00000149967|protein_coding,synonymous|Rab3il1|ENSMUST00000113161|protein_coding|+|52E|25105G>A,synonymous|Rab3il1|ENSMUST00000117641|protein_coding|+|52E|25105G>A,synonymous|Rab3il1|ENSMUST00000144788|NMD|+|52E|25105G>A +25105 G A 5_prime_utr|Rab3il1|ENSMUST00000121418|protein_coding,5_prime_utr|Rab3il1|ENSMUST00000131407|protein_coding,5_prime_utr|Rab3il1|ENSMUST00000137637|protein_coding,5_prime_utr|Rab3il1|ENSMUST00000149967|protein_coding,synonymous&NMD_transcript|Rab3il1|ENSMUST00000144788|NMD|+|52E|25105G>A,synonymous|Rab3il1|ENSMUST00000113161|protein_coding|+|52E|25105G>A,synonymous|Rab3il1|ENSMUST00000117641|protein_coding|+|52E|25105G>A 25496 ACAAA ACAAAGCAAA intron|Rab3il1||NMD,intron|Rab3il1||protein_coding 25500 AACAA AACAAGACAA . 
diff --git a/test/csq/ENST00000360372/ENST00000360372.fa.fai b/test/csq/ENST00000360372/ENST00000360372.fa.fai new file mode 100644 index 000000000..0bd0f8f06 --- /dev/null +++ b/test/csq/ENST00000360372/ENST00000360372.fa.fai @@ -0,0 +1 @@ +chr1 14286 31 60 61 diff --git a/test/csq/ENST00000479739/short-cds-start-lost.txt b/test/csq/ENST00000479739/short-cds-start-lost.txt index e1ae5a880..5c5bc9902 100644 --- a/test/csq/ENST00000479739/short-cds-start-lost.txt +++ b/test/csq/ENST00000479739/short-cds-start-lost.txt @@ -1,3 +1,3 @@ 25091 T C start_lost&splice_region|SH3YL1|ENST00000479739|NMD|-|1M>1V|25091T>C -25091 T C start_lost&splice_region|SH3YL1|ENST00000479739|NMD|-|1M>1V|25091T>C +25091 T C start_lost&splice_region&NMD_transcript|SH3YL1|ENST00000479739|NMD|-|1M>1V|25091T>C diff --git a/test/csq/ENST00000479739/short-cds-start-lost.txt-l b/test/csq/ENST00000479739/short-cds-start-lost.txt-l index e1ae5a880..5c5bc9902 100644 --- a/test/csq/ENST00000479739/short-cds-start-lost.txt-l +++ b/test/csq/ENST00000479739/short-cds-start-lost.txt-l @@ -1,3 +1,3 @@ 25091 T C start_lost&splice_region|SH3YL1|ENST00000479739|NMD|-|1M>1V|25091T>C -25091 T C start_lost&splice_region|SH3YL1|ENST00000479739|NMD|-|1M>1V|25091T>C +25091 T C start_lost&splice_region&NMD_transcript|SH3YL1|ENST00000479739|NMD|-|1M>1V|25091T>C diff --git a/test/csq/ENST00000571540/ENST00000571540.fa b/test/csq/ENST00000571540/ENST00000571540.fa new file mode 100644 index 000000000..9ea3e30e9 --- /dev/null +++ b/test/csq/ENST00000571540/ENST00000571540.fa @@ -0,0 +1,84 @@ +>17 +CAGCCCCAGTTCACCCCTCGGGGCGGAGGATCTCCTGAGTGATTCATCAGAACCCCCTGGGCTCAACCAAGTGTCGTCTG +AAGTGACCTCCCAGCTCTATGCTTCTTTGCGCCTCAGCCGGCAGGCGGAGGCCACGGCCCGAGCCCAGCTGTATTTACCC +TCCACCTCCCCGCCTCATGAAGGGTTAGACGGCTTCGCCCAAGAATTGAGTCGAAGCTTGTCAGTCGGATTGGAAAAGAA +CTTGAAGAAAAAGGTGAGGGAAGTGTGTCTTGGAGACCACTGTGGCACTAGACACCAGAGAGTCTTGGGATTGGGGTTGG +TAAAAATAAAAAGCTTTGATGAGATTTGAACTCTTCCTGTTGGATTTCATATTCCTTTTAACTGCATAGGCAGCCATGCT 
+TATAAGGGAGGGAGTGACCTGGGACACCATAATTTGAAAATTATGAAACTTCCCAGTGTTTTTTTGTGAGAGACATCTCT +GCTCCGAGTAGATAGAGCAAACCTATGGGGTAGGTTGGTGAGCTATTTCCCGTGCACTGGGAATGGGTAAGGTTTCTTGA +TCCCAAGAGGGAGCTAGGGACTTAGGATAGCAGCTCCGATCCTTCCAGCTCAACACTATGTTGATAGTATGGTTCCAACT +TTGGCATGTACATCATTAAGACATAGCTTAGTCAACTACAACATGTTACAGAGAAGATAGTTATTAGTATTGTATAGAAA +AGGTGTCAGAGTCAATTTGCAGACTGGTTAGGTCTTCCAGAGTTTGAAAATGATAGCCATAAGCCATACAGTCCTTACAT +TTGTTCTTGAATCGGCAAGAATCTCCGCAACTCTTGTCTTATACCTCCTCGATTATTTTATTTACACTCTCTTTCTGGCT +TCTGTAGTATTTGAATTTCTGATTCAAGTTTAGGACATCCCTTATTGGATCCGTTTGGTTTTTCTTGTTTTCCAAAGTGC +TGGAGTGAAAATTCTACCCTGGCAATAGGTAGGAGATAGATAACACAGACTGCATCTGATTTCCATGTGGCTTTTTTCCA +GGATGGTTCTAAGCATATCTTTGAGATGGAAAGTGTTCGGGGTCAGCTCCAGACCATGCTCCAAACCTCACGTGATACAG +CCTATCGTGAGTAAGCCCCTTCCCTAGAACTATGAAGGAGAACCTAGATGTAAGGGGTGGGAGAAAGTGGACAGAGGAAG +CAGGCAGGAAAACCTCTGAGCAAGAGATTTCAGGAGAAAAGTCTCCCGTTCTTTGGGAGAGGGGGTGAAGGAGTGAGGGA +ACTGGAAAGGAAGGCCTTTTCTCTTCAGGTGGCCCCTGTTTTTGTCATTACCTGACTCTGGCTTTGGGGTCCCTCTGGCT +GCCTGCAGGGGATCCTCTCATTCCTGGCGCTGGCTCAGAGAGACGGGAAGAGGACTCCTTTGACAGTGATAGCACAGCCA +CCTTGCTCAAGTGAGTTCTCCTTGTGGTTCTCCTAGCTTGTTTGCTTTCTTGGAAATCCAGGTGTTTTGTTCCTTATTTC +TCCCTTTATTGCTTTTGTACCTCTTAAGAACCCAAGAGGCTTATCACTTGCCTTTCAGTTTTCATCTTTTTCTACTGTCT +CCTGCCCCCTTTACTCCTTCAGGAACACAGGTATCTGTTGCCTAGTACCTGGCTTTCTCTTAACCTGCAGTCATTCTTCC +CCTCTAAAGAGTTAGCTTGTTTCTGCTGAAACCTTTAAGTGAATCACAGACCTGTGTGTTATTTATAGCACTATGGTTAT +ATAGCTTTACGACTTTGGACAAGTCTCTTGCCCTCTCTGGGTCTCAGTTTCCTCATTTGTAGAATATCACAGTTGGACAA +GTGGATTCCAAAAGCTTTCCCAGCCCTAACATTCTCTACCTGATTTGCTCAGCACCCGGCCCCTGCAAGACTTGTCTCCA +TCTAGCTCAGCCCAAGCCCTGGAGGAGCTGTTTCCCCGCTACACCAGCCTTCGGCCAGGGCCTCCACTCAATCCCCCAGA +TTTTCAGGGGCTGAGAGATGCATTGGATTCAGAGCATACCCGCCGCAAGGTAAGATGCAAACGCTTCCTTTCGAAAGCAG +CAAAGATTAGAAAGAGGGGACCCAAGTGTTGAAAAGGGCTGAGGGGGCCTGGCGTGGTGGCTCACGCCTGTAATCACAGC +ACTTTGGGAGGCTGAGGCAGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTGGCTAACACGGTGAAACCCGTCTCTA +CTAAAAATAGAAAAAATTAGCCGGGCATGGTGGTGGGTGCCTGTAGTCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATG 
+GCGTGAACCTGGGAGGCGGAGCTTGCAGTGAGCCGAGATCATGCCGCTGCACTCCAGCCTGGGCGACAGAGCGAGACTCC +GTCTCAAAAAAAAAAAACTAAAAAAGAAAAGGGCTGAGGGTTAAGAGCCTGAAAGCTGGGACTTAAATGTTTTTCTGAGG +AGGGCTTTTTTGACCATTCCATCTAATATGGCACCACCTCCTTTCCAAGTAGTCAGATCGCCTTCTTATTTCTTTCATGT +AGCTTATCGTTCTGATATATCTTGTTTTTTTCTCCTCTCTAAAATTTGAGGGTCCGTATCTTGTTTGTCTTAGTACTGGC +TGCCTAAGGTCTAGAATATACATAACACAGAAACACTCTGTGTTTGTCGAATGGACACAAACTCTGCTAGCTTGTGCACA +GGATTTACAGCTACTGGTGATATACTGAGCTTTATCTCTGTGAGCCTCTGCCTTAGTGTCTGAAGGGGACAGAGAAATAG +ACAAAGGAGAGGAAAAGTCAAAATCACTGGTTGCCATTTTTGGAAGCCTCTCTTTCTCCATGTTAAACCCCTTGGCATGA +ATGAAAGGCTTTCCCCTTTATCCTTGCTGTGTTTATTCATTGTGTCCAATGAGCTTCTGAAGGTAGGAGTAACTTTGCAC +ATTTTGCTGTGTGGGCTGCTGCATCTCTGCCAAAAGGTTTATGGGATTTTTACTCAGGAGATCCAGGAAAAGGAGGAGCC +TGGAATCTAAGCACTTATCTCCTGAAAAGGTACAGAAACATGTTGGAGATCTTGAGCCTCAAGGTGCTTGCTTCAGAATT +TTCCTGATTCTCCCTCTCCAGACCTATTCTGTTCTTGGGACCCAAGCTTCTTGGCTCCAGCCCACTCCCCCACTAGAGGA +GCTGGAAAGTTTGGTGCTGTGGTCATACCAAAGATGGGCAACACCCAGACTCCTCACCCTTTTCCCCAGCATTGTGAGCG +CCATATTCAGAGCCTGCAGACCCGAGTGTTAGAGCTACAGCAACAATTAGCCGTGGCTGTGGCTGCCGACCGCAAGAAAG +ATACCATGATTGAACAACTGGACAAGGTACCAGGGTAGCAAAATGTGGGTGGGTCTCTCCATGAAGAGCATTAAGGAATA +ATAAATAAGTGGGTGGCCAACCAATGTTTCTTGGTACATGCTGAGAGCTGGGCAAGGGGTTGGTTTGCTGACTGTTGGGA +GAAGATGGCTGTTGACCCTGCCCCTGTGGGTAGAAAGAGGCAAAAAAGTTATTTTGAAATTTCATCTTACTTGCCCTACC +TAAGACCCTGGCCCGTGTGGTGGAGGGCTGGAACCGGCATGAGGCTGAGCGGACAGAGGTTCTCAGGGGACTTCAAGAGG +AACACCAGGCAGCAGAGCTCACCAGAAGCAAGCAGCAGGAGGTGAGCGCCCTGGAGCATATGGCATTAGAACCTGAGTCA +CAGATCTCAAGATGGAAAGGGGCAGAAATAGCTCTGGAATTAGGGTTTCCAACAGATTGACCTCAGTCTTGTAGTGGCTG +TCCTGACATTTCTTCTCATCCCTCCTTTCTTCTCCTTTAACCGTTTCTGCTGTCCAACTTGAATCATCAATGTCTTAATT +TTTCCCACACCATGTAGTCTTCTATCTGGCCCCTTAGTCATTTAGTTAACTGTGAAGTTTCAAGTTTACATTGTCAATGC +TTTATAAAATATAGCACACACAGATTTCCCACAGTTCCTCTGCCCTGTATTCCTCTCTTCCTGAAAGACAGTAACCCGCC +TGGAACAAAGCCTTTCTGAGGCCATGGAGGCCCTGAATCGTGAGCAGGAAAGTGCCAGACTGCAGCAACGGGAAAGAGAG +ACACTGGTGAGAAGATTGGACTGGGTTAATTCCACTGGAAGCTGTTAATTACTTCTAGAGAGCTGTGGGCTATTGGTGGA 
+TTGTGGGAGATTATAATTTGAGTTGCACCAGAGCACTGTTTCCCAAAGTGTGTTCCTTAGAACACTAATTCAGCTAGATA +TTCTATTAAAAAAAAAGGCTCTGCTACCAAGTCGGTTTGAGAAACTCTGCAAATTGTATCATCATTAGAATATTAGTGTC +ATCTGAAGAATTATTTTAAAATATAGGCTAGGCACGGTGGCTCACCCCTGTAATCCCAGCACTTTGGGAGGCCAAGGCAG +GCAGATCATGAGGTCAGGAGTTTGAGACCAGCCTGGCCAGCATGGTGAAACCCCGTCTCTACTAAAAATACAAAAAACTA +GCTGGACATGGTGGTGCATGCCTGTAGTCCCAGCGACTTGGGAGGCTGAGGCAGGAGAATTGCTTGAACCCAGCAGGCAG +AGGTTGCAGTGAGCTGAGATCAAGCCACTGCACTCCAGCCTGGGTGACAGAGTGAGACTCCATCTCAAAAAATAAAAATA +AAAAATAAAATAAAATAAAGATTTCTGGGACTCACCTGCAGAGGTTTTGATTCAGTAGATATATGGTGGAACTCAAATCT +TTTCAGAAATTTCCTGGGTGATTTTTTTCAATCTGGTTTGGGACCTCTGGTGTAGGGCATGGCCAAAAAAGGTATGAAAT +TGACTAACTCAAGTTTCTTTTCGTGTTTTTTTTTTTTTTTTTTTGAGACAGGGTCGCGCTCTGTTGTCCAGGCTGGAGTG +CAGTGGCACAATCTCAGCTCACTGCAGCCTCTGCCCCCTGGATTCTAGCGATTCTCCCACCTTAGCCTCCCAAGTAGCTG +GGACTACAGGCGTATACCACCACGCCTAGCTAATTTTTGTATTTTTTTGGTAGAGACGGGGTTTCGCCATGTTGGCCAGG +CTGGTCTCAAACTCCTGACCTCAAGTGATCTGCCCGCCTTGGCCTCCCAAAGTGCTGGGATTACAGGCATAAGCCACCAT +ACCCAGCCACTTCACTCAAGTCTCTTAGAGCTGAAATGATAAGGTGATTAGCCATATGAATTAGTAGCTGGGATTGGAAC +TTGTTGCATGCATAATGCATGCTTGCTACATGCATGCTGGTACTTGTAGTCTTAACAGGCTATATCTGACAGTCCTGATG +ATGATGGGTGATGGTTCACGATTATAGGCTGGATATTCTAGTCTTAGTGGAATTTCTTTTGACGATGATGGTGGTAACTA +TTGTGTATTGAGTACTTACTATGTGCCTGATGCCGTGCTAAATGCCTTACATATGCTTTTTACTTTAATGCTCAGAAGGC +AGACCCTGTTATGATCTCCAGTTTACAGATAAGAATGTATGCCTAAGAACACATAGTAAATTCCAGAACCAGGATTCGAA +CCTGAGATTTGAACCCAGGTCTACCTGGCTCTGGAACAGGAACTCTTATCTACTATATTATAATCCTATGAGCTTGTCAA +CTATTGTGTTTGTTTGTTTTGAGATGGAGTCTCGCTCTGTTGCCCAGGCTAGGGTTCAGTGGCACAATCTTGGCTCACTG +CAACCTCCGCCTCCCAGGTTCAAGCAATTCTCCTGCCTCAGCCTCCTGAGTAGCTGGGAATACAGGTGTGCACCACCATG +CCCAGCTAATTTTTATATTTTTAGTAGAGACATGGTTTCACCATGTTGGCCAGGCTGGTCCCAAACTCCTGACCTCAGAT +GATCTGCCTGCCTCAGCCTCCCAAAGTGCTGGGATTACAGGCATGAGCCACTGCGCCTGTCAACTATTGTGGGGTTTCTT +TTGGTTTTATTTGAGATGGAGTCTTGCTCGATGTCGCCTAGGCTGGAGTGCAGTTGTGTGATCCCAGCTCACTGCAACCT +CCACCTCCCAGGTTAGAGCGATTCTCCTGCCTTTGCCTCCCAAGTAGCTGGAATTACAGGCACCCACCACCACGCCTGGC 
+TAATTTTTGTATTTTTAGTAGAGATGGGGTTTCACCATGTTGGACCAGGCTGGTCTCAAACTCCTGACCTCAGGTAATCT +GCCTGCCTTGGCCTCCCAAAATGCTGGGATTACAGGTGTGAGCCACCGCACCCGGCCTAATTATTGTGTTTTTAATGAGC +ATAATTCTCAGCAGGCACCTTGTATGAGGGTCAGAGGGCAGATCGCTGGTCTCTGAAGAGCCTACTAAGGAGCTTGGTTT +TCTTCTGCGGCTGTAGGAGGAGGAAAGGCAAGCTCTGACTCTGAGGTTGGAGGCAGAACAGCAGCGGTGCTGTGTCCTGC +AGGAAGAGCGGGATGCAGCTCGGGCTGGGCAACTGAGTGAGCATCGAGAGTTGGAGACTCTTCGGGCTGCCCTAGAAGAA +GAACGGCAGACCTGGGCCCAGCAAGAGCACCAGCTTAAGGAACACTACCAGGCGCTGCAGGAGGAGA diff --git a/test/csq/ENST00000571540/ENST00000571540.fa.fai b/test/csq/ENST00000571540/ENST00000571540.fa.fai new file mode 100644 index 000000000..c2608fa04 --- /dev/null +++ b/test/csq/ENST00000571540/ENST00000571540.fa.fai @@ -0,0 +1 @@ +17 6627 4 80 81 diff --git a/test/csq/ENST00000571540/ENST00000571540.gff b/test/csq/ENST00000571540/ENST00000571540.gff new file mode 100644 index 000000000..856cccb66 --- /dev/null +++ b/test/csq/ENST00000571540/ENST00000571540.gff @@ -0,0 +1,15 @@ +17 ensembl_havana gene -995 16824 . + . ID=gene:ENSG00000170037;Name=CNTROB;biotype=protein_coding;description=centrobin%2C centriole duplication and spindle assembly protein [Source:HGNC Symbol%3BAcc:HGNC:29616];gene_id=ENSG00000170037;logic_name=ensembl_havana_gene_homo_sapiens;version=14 +17 havana mRNA 21 6607 . + . ID=transcript:ENST00000571540;Parent=gene:ENSG00000170037;Name=CNTROB-206;biotype=nonsense_mediated_decay;transcript_id=ENST00000571540;transcript_support_level=5;version=5 +17 havana exon 21 253 . + . Parent=transcript:ENST00000571540;Name=ENSE00002655061;constitutive=0;ensembl_end_phase=0;ensembl_phase=1;exon_id=ENSE00002655061;rank=1;version=1 +17 havana CDS 21 253 . + 2 ID=CDS:ENSP00000458688;Parent=transcript:ENST00000571540;protein_id=ENSP00000458688 +17 havana exon 1042 1126 . + . Parent=transcript:ENST00000571540;Name=ENSE00003522521;constitutive=0;ensembl_end_phase=1;ensembl_phase=0;exon_id=ENSE00003522521;rank=2;version=1 +17 havana CDS 1042 1126 . 
+ 0 ID=CDS:ENSP00000458688;Parent=transcript:ENST00000571540;protein_id=ENSP00000458688 +17 havana exon 1369 1450 . + . Parent=transcript:ENST00000571540;Name=ENSE00001609771;constitutive=0;ensembl_end_phase=2;ensembl_phase=1;exon_id=ENSE00001609771;rank=3;version=1 +17 havana CDS 1369 1450 . + 2 ID=CDS:ENSP00000458688;Parent=transcript:ENST00000571540;protein_id=ENSP00000458688 +17 havana exon 1893 2049 . + . Parent=transcript:ENST00000571540;Name=ENSE00003784956;constitutive=0;ensembl_end_phase=0;ensembl_phase=2;exon_id=ENSE00003784956;rank=4;version=1 +17 havana CDS 1893 2049 . + 1 ID=CDS:ENSP00000458688;Parent=transcript:ENST00000571540;protein_id=ENSP00000458688 +17 havana CDS 3142 3312 . + 0 ID=CDS:ENSP00000458688;Parent=transcript:ENST00000571540;protein_id=ENSP00000458688 +17 havana exon 3142 3386 . + . Parent=transcript:ENST00000571540;Name=ENSE00002678822;constitutive=0;ensembl_end_phase=-1;ensembl_phase=0;exon_id=ENSE00002678822;rank=5;version=1 +17 havana exon 3605 3721 . + . Parent=transcript:ENST00000571540;Name=ENSE00003688389;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003688389;rank=6;version=1 +17 havana exon 4068 4166 . + . Parent=transcript:ENST00000571540;Name=ENSE00003562163;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003562163;rank=7;version=1 +17 havana exon 6417 6607 . + . 
Parent=transcript:ENST00000571540;Name=ENSE00002649879;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00002649879;rank=8;version=1 diff --git a/test/csq/ENST00000571540/nmd.txt b/test/csq/ENST00000571540/nmd.txt new file mode 100644 index 000000000..f7cfad3c5 --- /dev/null +++ b/test/csq/ENST00000571540/nmd.txt @@ -0,0 +1,3 @@ +3188 C T missense&NMD_transcript|CNTROB|ENST00000571540|NMD|+|201P>201L|3188C>T +3188 C T missense&NMD_transcript|CNTROB|ENST00000571540|NMD|+|201P>201L|3188C>T + diff --git a/test/csq/ENST00000571540/nmd.vcf b/test/csq/ENST00000571540/nmd.vcf new file mode 100644 index 000000000..eb4a9e4e9 --- /dev/null +++ b/test/csq/ENST00000571540/nmd.vcf @@ -0,0 +1,7 @@ +##fileformat=VCFv4.2 +##contig= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +17 3188 . C T . . type=ENST00000571540:7936284-C-T;EXP=missense&NMD_transcript|CNTROB|ENST00000571540|NMD|+|201P>201L|3188C>T diff --git a/test/csq/ENST00000573314/incorrect-insertion-overlap.txt b/test/csq/ENST00000573314/incorrect-insertion-overlap.txt index 12dd09ada..3b6618d5e 100644 --- a/test/csq/ENST00000573314/incorrect-insertion-overlap.txt +++ b/test/csq/ENST00000573314/incorrect-insertion-overlap.txt @@ -1,3 +1,3 @@ 32388 GTCCGTCGGCATAAACTT GTCCGTCGGCATAAACTTTCCGTCGGCATAAACTT frameshift|VWA9|ENST00000573314|NMD|-|4DCVRRWARGEWGGRGPGTAFRCLLGLQVKGLWVKNNPGPALGLPATVAAAPRCL*>4ESLCRRTVSGDGHGVSGEAEGPGLPFAACWGYR*|32388GTCCGTCGGCATAAACTT>GTCCGTCGGCATAAACTTTCCGTCGGCATAAACTT -32388 GTCCGTCGGCATAAACTT GTCCGTCGGCATAAACTTTCCGTCGGCATAAACTT frameshift|VWA9|ENST00000573314|NMD|-|4DCVRRWARGEWGGRGPGTAFRCLLGLQVKGLWVKNNPGPALGLPATVAAAPRCL*>4ESLCRRTVSGDGHGVSGEAEGPGLPFAACWGYR*|32388GTCCGTCGGCATAAACTT>GTCCGTCGGCATAAACTTTCCGTCGGCATAAACTT +32388 GTCCGTCGGCATAAACTT GTCCGTCGGCATAAACTTTCCGTCGGCATAAACTT 
frameshift&NMD_transcript|VWA9|ENST00000573314|NMD|-|4DCVRRWARGEWGGRGPGTAFRCLLGLQVKGLWVKNNPGPALGLPATVAAAPRCL*>4ESLCRRTVSGDGHGVSGEAEGPGLPFAACWGYR*|32388GTCCGTCGGCATAAACTT>GTCCGTCGGCATAAACTTTCCGTCGGCATAAACTT diff --git a/test/csq/ENST00000573314/incorrect-insertion-overlap.txt-l b/test/csq/ENST00000573314/incorrect-insertion-overlap.txt-l index 12dd09ada..3b6618d5e 100644 --- a/test/csq/ENST00000573314/incorrect-insertion-overlap.txt-l +++ b/test/csq/ENST00000573314/incorrect-insertion-overlap.txt-l @@ -1,3 +1,3 @@ 32388 GTCCGTCGGCATAAACTT GTCCGTCGGCATAAACTTTCCGTCGGCATAAACTT frameshift|VWA9|ENST00000573314|NMD|-|4DCVRRWARGEWGGRGPGTAFRCLLGLQVKGLWVKNNPGPALGLPATVAAAPRCL*>4ESLCRRTVSGDGHGVSGEAEGPGLPFAACWGYR*|32388GTCCGTCGGCATAAACTT>GTCCGTCGGCATAAACTTTCCGTCGGCATAAACTT -32388 GTCCGTCGGCATAAACTT GTCCGTCGGCATAAACTTTCCGTCGGCATAAACTT frameshift|VWA9|ENST00000573314|NMD|-|4DCVRRWARGEWGGRGPGTAFRCLLGLQVKGLWVKNNPGPALGLPATVAAAPRCL*>4ESLCRRTVSGDGHGVSGEAEGPGLPFAACWGYR*|32388GTCCGTCGGCATAAACTT>GTCCGTCGGCATAAACTTTCCGTCGGCATAAACTT +32388 GTCCGTCGGCATAAACTT GTCCGTCGGCATAAACTTTCCGTCGGCATAAACTT frameshift&NMD_transcript|VWA9|ENST00000573314|NMD|-|4DCVRRWARGEWGGRGPGTAFRCLLGLQVKGLWVKNNPGPALGLPATVAAAPRCL*>4ESLCRRTVSGDGHGVSGEAEGPGLPFAACWGYR*|32388GTCCGTCGGCATAAACTT>GTCCGTCGGCATAAACTTTCCGTCGGCATAAACTT diff --git a/test/csq/ENST00000580206/test.cmd.out b/test/csq/ENST00000580206/test.cmd.out index 09b9d0d95..39ae1c3e0 100644 --- a/test/csq/ENST00000580206/test.cmd.out +++ b/test/csq/ENST00000580206/test.cmd.out @@ -1,16 +1,16 @@ -35750 C CAAAAGA inframe_insertion|ANKRD30B|ENST00000358984|protein_coding|+|353P>353QKK|35750C>CAAAAGA+35751C>A,inframe_insertion|ANKRD30B|ENST00000580206|NMD|+|353P>353QKK|35750C>CAAAAGA+35751C>A,inframe_insertion|ANKRD30B|ENST00000665241|protein_coding|+|353P>353QKK|35750C>CAAAAGA+35751C>A +35750 C CAAAAGA 
inframe_insertion&NMD_transcript|ANKRD30B|ENST00000580206|NMD|+|353P>353QKK|35750C>CAAAAGA+35751C>A,inframe_insertion|ANKRD30B|ENST00000358984|protein_coding|+|353P>353QKK|35750C>CAAAAGA+35751C>A,inframe_insertion|ANKRD30B|ENST00000665241|protein_coding|+|353P>353QKK|35750C>CAAAAGA+35751C>A 35751 C A @35750 35750 C CAAAAGA inframe_insertion|ANKRD30B|ENST00000358984|protein_coding|+|353P>353QKK|35750C>CAAAAGA+35751C>A 35751 C A @35750 -35750 CC CAAAAGAA inframe_insertion|ANKRD30B|ENST00000358984|protein_coding|+|353P>353QKK|35750CC>CAAAAGAA,inframe_insertion|ANKRD30B|ENST00000580206|NMD|+|353P>353QKK|35750CC>CAAAAGAA,inframe_insertion|ANKRD30B|ENST00000665241|protein_coding|+|353P>353QKK|35750CC>CAAAAGAA +35750 CC CAAAAGAA inframe_insertion&NMD_transcript|ANKRD30B|ENST00000580206|NMD|+|353P>353QKK|35750CC>CAAAAGAA,inframe_insertion|ANKRD30B|ENST00000358984|protein_coding|+|353P>353QKK|35750CC>CAAAAGAA,inframe_insertion|ANKRD30B|ENST00000665241|protein_coding|+|353P>353QKK|35750CC>CAAAAGAA 35750 CC CAAAAGAA inframe_insertion|ANKRD30B|ENST00000358984|protein_coding|+|353P>353QKK|35750CC>CAAAAGAA -35750 CCAG C inframe_deletion|ANKRD30B|ENST00000358984|protein_coding|+|353PA>353P|35750CCAG>C+35755A>T,inframe_deletion|ANKRD30B|ENST00000580206|NMD|+|353PA>353P|35750CCAG>C+35755A>T,inframe_deletion|ANKRD30B|ENST00000665241|protein_coding|+|353PA>353P|35750CCAG>C+35755A>T +35750 CCAG C inframe_deletion&NMD_transcript|ANKRD30B|ENST00000580206|NMD|+|353PA>353P|35750CCAG>C+35755A>T,inframe_deletion|ANKRD30B|ENST00000358984|protein_coding|+|353PA>353P|35750CCAG>C+35755A>T,inframe_deletion|ANKRD30B|ENST00000665241|protein_coding|+|353PA>353P|35750CCAG>C+35755A>T 35755 A T @35750 -35756 A C missense|ANKRD30B|ENST00000358984|protein_coding|+|355K>354Q|35756A>C,missense|ANKRD30B|ENST00000580206|NMD|+|355K>354Q|35756A>C,missense|ANKRD30B|ENST00000665241|protein_coding|+|355K>354Q|35756A>C +35756 A C 
missense&NMD_transcript|ANKRD30B|ENST00000580206|NMD|+|355K>354Q|35756A>C,missense|ANKRD30B|ENST00000358984|protein_coding|+|355K>354Q|35756A>C,missense|ANKRD30B|ENST00000665241|protein_coding|+|355K>354Q|35756A>C 35750 CCAG C inframe_deletion|ANKRD30B|ENST00000358984|protein_coding|+|353PA>353P|35750CCAG>C+35755A>T 35755 A T @35750 @@ -19,7 +19,7 @@ 35750 CCAGCA CCT inframe_deletion|ANKRD30B|ENST00000358984|protein_coding|+|353PA>353P|35750CCAGCA>CCT 35756 A C missense|ANKRD30B|ENST00000358984|protein_coding|+|355K>354Q|35756A>C -35750 CCAG C missense&inframe_altering|ANKRD30B|ENST00000358984|protein_coding|+|353PA>353PP|35750CCAG>C+35755A>ACCT,missense&inframe_altering|ANKRD30B|ENST00000580206|NMD|+|353PA>353PP|35750CCAG>C+35755A>ACCT,missense&inframe_altering|ANKRD30B|ENST00000665241|protein_coding|+|353PA>353PP|35750CCAG>C+35755A>ACCT +35750 CCAG C missense&inframe_altering&NMD_transcript|ANKRD30B|ENST00000580206|NMD|+|353PA>353PP|35750CCAG>C+35755A>ACCT,missense&inframe_altering|ANKRD30B|ENST00000358984|protein_coding|+|353PA>353PP|35750CCAG>C+35755A>ACCT,missense&inframe_altering|ANKRD30B|ENST00000665241|protein_coding|+|353PA>353PP|35750CCAG>C+35755A>ACCT 35755 A ACCT @35750 35750 CCAG C missense&inframe_altering|ANKRD30B|ENST00000358984|protein_coding|+|353PA>353PP|35750CCAG>C+35755A>ACCT diff --git a/test/fill-tags-hemi.1.out b/test/fill-tags-hemi.1.out index 10d4ddca3..634012187 100644 --- a/test/fill-tags-hemi.1.out +++ b/test/fill-tags-hemi.1.out @@ -3,6 +3,7 @@ ##FORMAT= ##contig= ##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta +##INFO= ##INFO= ##INFO= ##INFO= @@ -16,7 +17,7 @@ ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B -1 3177144 . G T,A 45 PASS NS=2;AN=2;AF=0.5,0.5;MAF=0.5;AC=1,1;AC_Het=0,0;AC_Hom=0,0;AC_Hemi=1,1;HWE=1,1;ExcHet=1,1 GT 1 2 -1 3177144 . G T 45 PASS NS=2;AN=2;AF=0.5;MAF=0.5;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=1;HWE=1;ExcHet=1 GT 0/. 1/. -1 3177144 . 
G T 45 PASS NS=2;AN=2;AF=0.5;MAF=0.5;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=1;HWE=1;ExcHet=1 GT ./0 ./1 -1 3177144 . G T 45 PASS NS=1;AN=1;AF=1;MAF=0;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=1;HWE=1;ExcHet=1 GT ./. ./1 +1 3177144 . G T,A 45 PASS F_MISSING=0;NS=2;AN=2;AF=0.5,0.5;MAF=0.5;AC=1,1;AC_Het=0,0;AC_Hom=0,0;AC_Hemi=1,1;HWE=1,1;ExcHet=1,1 GT 1 2 +1 3177144 . G T 45 PASS F_MISSING=1;NS=2;AN=2;AF=0.5;MAF=0.5;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=1;HWE=1;ExcHet=1 GT 0/. 1/. +1 3177144 . G T 45 PASS F_MISSING=1;NS=2;AN=2;AF=0.5;MAF=0.5;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=1;HWE=1;ExcHet=1 GT ./0 ./1 +1 3177144 . G T 45 PASS F_MISSING=1;NS=1;AN=1;AF=1;MAF=0;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=1;HWE=1;ExcHet=1 GT ./. ./1 diff --git a/test/fill-tags-hemi.2.out b/test/fill-tags-hemi.2.out index e746d6c6a..1bc2e60a5 100644 --- a/test/fill-tags-hemi.2.out +++ b/test/fill-tags-hemi.2.out @@ -3,6 +3,7 @@ ##FORMAT= ##contig= ##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta +##INFO= ##INFO= ##INFO= ##INFO= @@ -16,7 +17,7 @@ ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B -1 3177144 . G T,A 45 PASS NS=2;AN=2;AF=0.5,0.5;MAF=0.5;AC=1,1;AC_Het=0,0;AC_Hom=0,0;AC_Hemi=1,1;HWE=1,1;ExcHet=1,1 GT 1 2 -1 3177144 . G T 45 PASS NS=2;AN=2;AF=0.5;MAF=0.5;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=1 GT 0/. 1/. -1 3177144 . G T 45 PASS NS=2;AN=2;AF=0.5;MAF=0.5;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=1 GT ./0 ./1 -1 3177144 . G T 45 PASS NS=1;AN=1;AF=1;MAF=0;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=1 GT ./. ./1 +1 3177144 . G T,A 45 PASS F_MISSING=0;NS=2;AN=2;AF=0.5,0.5;MAF=0.5;AC=1,1;AC_Het=0,0;AC_Hom=0,0;AC_Hemi=1,1;HWE=1,1;ExcHet=1,1 GT 1 2 +1 3177144 . G T 45 PASS F_MISSING=1;NS=2;AN=2;AF=0.5;MAF=0.5;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=1 GT 0/. 1/. +1 3177144 . G T 45 PASS F_MISSING=1;NS=2;AN=2;AF=0.5;MAF=0.5;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=1 GT ./0 ./1 +1 3177144 . 
G T 45 PASS F_MISSING=1;NS=1;AN=1;AF=1;MAF=0;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=1 GT ./. ./1 diff --git a/test/fill-tags-hwe.out b/test/fill-tags-hwe.out index 45f0ae0e8..8837eb167 100644 --- a/test/fill-tags-hwe.out +++ b/test/fill-tags-hwe.out @@ -3,6 +3,7 @@ ##FORMAT= ##contig= ##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta +##INFO= ##INFO= ##INFO= ##INFO= @@ -16,7 +17,7 @@ ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 S3 S4 S5 S6 S7 S8 S9 S10 -1 3177144 . G T 45 PASS NS=10;AN=20;AF=0;MAF=0;AC=0;AC_Het=0;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=1 GT 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 -1 3177144 . G T 45 PASS NS=10;AN=20;AF=0.1;MAF=0.1;AC=2;AC_Het=2;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=0.947368 GT 1/0 1/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 -1 3177144 . G T 45 PASS NS=10;AN=20;AF=0.1;MAF=0.1;AC=2;AC_Het=0;AC_Hom=2;AC_Hemi=0;HWE=0.0526316;ExcHet=1 GT 1/1 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 -1 3177144 . G T,C 45 PASS NS=10;AN=20;AF=0.5,0;MAF=0.5;AC=10,0;AC_Het=10,0;AC_Hom=0,0;AC_Hemi=0,0;HWE=0.00690641,1;ExcHet=0.00554244,1 GT 1/0 1/0 1/0 1/0 1/0 1/0 1/0 1/0 1/0 1/0 +1 3177144 . G T 45 PASS F_MISSING=0;NS=10;AN=20;AF=0;MAF=0;AC=0;AC_Het=0;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=1 GT 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 +1 3177144 . G T 45 PASS F_MISSING=0;NS=10;AN=20;AF=0.1;MAF=0.1;AC=2;AC_Het=2;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=0.947368 GT 1/0 1/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 +1 3177144 . G T 45 PASS F_MISSING=0;NS=10;AN=20;AF=0.1;MAF=0.1;AC=2;AC_Het=0;AC_Hom=2;AC_Hemi=0;HWE=0.0526316;ExcHet=1 GT 1/1 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 +1 3177144 . 
G T,C 45 PASS F_MISSING=0;NS=10;AN=20;AF=0.5,0;MAF=0.5;AC=10,0;AC_Het=10,0;AC_Hom=0,0;AC_Hemi=0,0;HWE=0.00690641,1;ExcHet=0.00554244,1 GT 1/0 1/0 1/0 1/0 1/0 1/0 1/0 1/0 1/0 1/0 diff --git a/test/gtcheck.5.1.out b/test/gtcheck.5.1.out index 4e7fbd625..639730377 100644 --- a/test/gtcheck.5.1.out +++ b/test/gtcheck.5.1.out @@ -5,4 +5,8 @@ INFO sites-skipped-monoallelic 1 INFO sites-skipped-no-data 1 INFO sites-skipped-GT-not-diploid 1 INFO sites-skipped-PL-not-diploid 1 +INFO sites-used-PL-vs-PL 0 +INFO sites-used-PL-vs-GT 1 +INFO sites-used-GT-vs-PL 0 +INFO sites-used-GT-vs-GT 1 DC A A 3.000150e-04 4.605170e+01 2 diff --git a/test/gvcf.merge.1.out b/test/gvcf.merge.1.out index d8a9b497b..f9b679d1a 100644 --- a/test/gvcf.merge.1.out +++ b/test/gvcf.merge.1.out @@ -24,6 +24,6 @@ ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT G06 D05 H09 chr1 10106 . C . 0 LowGQX BLOCKAVG_min30p3a;AN=2 GT:GQX:DP:DPF ./.:.:.:. 0/0:12:5:0 ./.:.:.:. -chr1 10107 . C . 0 LowGQX;HighDPFRatio BLOCKAVG_min30p3a;AN=2 GT:GQX:DP:DPF .:.:0:1 ./.:.:.:. 0/0:5:2:0 +chr1 10107 . C . 0 LowGQX;HighDPFRatio BLOCKAVG_min30p3a;AN=4 GT:GQX:DP:DPF .:.:0:1 0/0:12:5:0 0/0:5:2:0 chr1 10108 . N . 0 LowGQX;HighDPFRatio END=10110;BLOCKAVG_min30p3a;AN=2 GT:GQX:DP:DPF .:.:0:1 ./.:.:.:. 0/0:5:2:0 chr1 10111 . N . 0 LowGQX END=10120;BLOCKAVG_min30p3a;AN=2 GT:GQX:DP:DPF ./.:.:.:. ./.:.:.:. 0/0:5:2:0 diff --git a/test/merge.10.1.out b/test/merge.10.1.out index c34673867..2325b4ba6 100644 --- a/test/merge.10.1.out +++ b/test/merge.10.1.out @@ -1,13 +1,14 @@ ##fileformat=VCFv4.3 ##FILTER= +##FORMAT= ##contig= ##reference=file:ref.fa -#CHROM POS ID REF ALT QUAL FILTER INFO -1 3000000 . C CCG . . . -1 3000000 . C CAA . . . -1 3000150 . CC C . . . -1 3000150 . C CTT . . . -1 3000152 . C A . . . -1 3000152 . C CA . . . -1 3000154 . C A . . . -1 3000154 . C T . . . +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT a b +1 3000000 . C CCG . . . GT 0/1 ./. +1 3000000 . C CAA . . . GT ./. 0/1 +1 3000150 . CC C . . . GT 0/1 ./. 
+1 3000150 . C CTT . . . GT ./. 0/1 +1 3000152 . C A . . . GT 0/1 ./. +1 3000152 . C CA . . . GT ./. 0/1 +1 3000154 . C A . . . GT 0/1 ./. +1 3000154 . C T . . . GT ./. 0/1 diff --git a/test/merge.10.2.out b/test/merge.10.2.out index 2b345cf56..f38e9d02c 100644 --- a/test/merge.10.2.out +++ b/test/merge.10.2.out @@ -1,10 +1,11 @@ ##fileformat=VCFv4.3 ##FILTER= +##FORMAT= ##contig= ##reference=file:ref.fa -#CHROM POS ID REF ALT QUAL FILTER INFO -1 3000000 . C CCG,CAA . . . -1 3000150 . CC C,CTTC . . . -1 3000152 . C A . . . -1 3000152 . C CA . . . -1 3000154 . C A,T . . . +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT a b +1 3000000 . C CCG,CAA . . . GT 0/1 0/2 +1 3000150 . CC C,CTTC . . . GT 0/1 0/2 +1 3000152 . C A . . . GT 0/1 ./. +1 3000152 . C CA . . . GT ./. 0/1 +1 3000154 . C A,T . . . GT 0/1 0/2 diff --git a/test/merge.10.3.out b/test/merge.10.3.out index cec954427..53f1a51c5 100644 --- a/test/merge.10.3.out +++ b/test/merge.10.3.out @@ -1,11 +1,12 @@ ##fileformat=VCFv4.3 ##FILTER= +##FORMAT= ##contig= ##reference=file:ref.fa -#CHROM POS ID REF ALT QUAL FILTER INFO -1 3000000 . C CCG,CAA . . . -1 3000150 . CC C,CTTC . . . -1 3000152 . C A . . . -1 3000152 . C CA . . . -1 3000154 . C A . . . -1 3000154 . C T . . . +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT a b +1 3000000 . C CCG,CAA . . . GT 0/1 0/2 +1 3000150 . CC C . . . GT 0/1 ./. +1 3000150 . C CTT . . . GT ./. 0/1 +1 3000152 . C A . . . GT 0/1 ./. +1 3000152 . C CA . . . GT ./. 0/1 +1 3000154 . C A,T . . . GT 0/1 0/2 diff --git a/test/merge.10.a.vcf b/test/merge.10.a.vcf index f2da6e185..c3672980c 100644 --- a/test/merge.10.a.vcf +++ b/test/merge.10.a.vcf @@ -1,8 +1,9 @@ ##fileformat=VCFv4.3 +##FORMAT= ##contig= ##reference=file:ref.fa -#CHROM POS ID REF ALT QUAL FILTER INFO -1 3000000 . C CCG . . . -1 3000150 . CC C . . . -1 3000152 . C A . . . -1 3000154 . C A . . . +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT a +1 3000000 . C CCG . . . GT 0/1 +1 3000150 . CC C . . . GT 0/1 +1 3000152 . C A . . . 
GT 0/1 +1 3000154 . C A . . . GT 0/1 diff --git a/test/merge.10.b.vcf b/test/merge.10.b.vcf index 15c96e9b7..eff5262fb 100644 --- a/test/merge.10.b.vcf +++ b/test/merge.10.b.vcf @@ -1,8 +1,9 @@ ##fileformat=VCFv4.3 +##FORMAT= ##contig= ##reference=file:ref.fa -#CHROM POS ID REF ALT QUAL FILTER INFO -1 3000000 . C CAA . . . -1 3000150 . C CTT . . . -1 3000152 . C CA . . . -1 3000154 . C T . . . +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT b +1 3000000 . C CAA . . . GT 0/1 +1 3000150 . C CTT . . . GT 0/1 +1 3000152 . C CA . . . GT 0/1 +1 3000154 . C T . . . GT 0/1 diff --git a/test/merge.gvcf.10.1.out b/test/merge.gvcf.10.1.out index 285548169..3a51f253d 100644 --- a/test/merge.gvcf.10.1.out +++ b/test/merge.gvcf.10.1.out @@ -1,6 +1,11 @@ ##fileformat=VCFv4.2 ##FILTER= ##contig= +##contig= ##INFO= -#CHROM POS ID REF ALT QUAL FILTER INFO -chr1 1 . A <*>,C . . END=2 +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT a b +chr1 1 . A <*>,C . . END=2 GT 0/0 0/2 +chr2 1 . A <*> . . END=2 GT 0/0 ./. +chr2 3 . G C,<*>,A . . . GT 0/1 0/3 +chr2 4 . T <*> . . END=6 GT 0/0 0/0 diff --git a/test/merge.gvcf.10.2.out b/test/merge.gvcf.10.2.out index ddaf00e52..8feceaef3 100644 --- a/test/merge.gvcf.10.2.out +++ b/test/merge.gvcf.10.2.out @@ -1,7 +1,12 @@ ##fileformat=VCFv4.2 ##FILTER= ##contig= +##contig= ##INFO= -#CHROM POS ID REF ALT QUAL FILTER INFO -chr1 1 . A <*> . . END=2 -chr1 1 . A C . . . +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT a b +chr1 1 . A <*>,C . . END=2 GT 0/0 0/2 +chr2 1 . A <*> . . END=2 GT 0/0 ./. +chr2 3 . G C,<*> . . . GT 0/1 ./. +chr2 3 . G A,<*> . . . GT ./. 0/1 +chr2 4 . T <*> . . END=6 GT 0/0 0/0 diff --git a/test/merge.gvcf.10.3.out b/test/merge.gvcf.10.3.out index 7129995b2..ca9bfccb5 100644 --- a/test/merge.gvcf.10.3.out +++ b/test/merge.gvcf.10.3.out @@ -1,7 +1,13 @@ ##fileformat=VCFv4.2 ##FILTER= ##contig= +##contig= ##INFO= -#CHROM POS ID REF ALT QUAL FILTER INFO -chr1 1 . A <*>,C . . . -chr1 2 . C <*> . . . 
+##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT a b +chr1 1 . A <*>,C . . . GT 0/0 0/2 +chr1 2 . C <*> . . . GT 0/0 ./. +chr2 1 . A <*> . . END=2 GT 0/0 ./. +chr2 3 . G C,<*>,A . . . GT 0/1 0/3 +chr2 4 . T <*> . . END=6 GT 0/0 0/0 +chr2 7 . G <*> . . END=8 GT ./. 0/0 diff --git a/test/merge.gvcf.10.4.out b/test/merge.gvcf.10.4.out index 38c43ad4c..8be54a9c2 100644 --- a/test/merge.gvcf.10.4.out +++ b/test/merge.gvcf.10.4.out @@ -1,7 +1,14 @@ ##fileformat=VCFv4.2 ##FILTER= ##contig= +##contig= ##INFO= -#CHROM POS ID REF ALT QUAL FILTER INFO -chr1 1 . A C . . . -chr1 2 . C <*> . . . +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT a b +chr1 1 . A <*>,C . . . GT 0/0 0/2 +chr1 2 . C <*> . . . GT 0/0 ./. +chr2 1 . A <*> . . END=2 GT 0/0 ./. +chr2 3 . G C,<*> . . . GT 0/1 ./. +chr2 3 . G A,<*> . . . GT ./. 0/1 +chr2 4 . T <*> . . END=6 GT 0/0 0/0 +chr2 7 . G <*> . . END=8 GT ./. 0/0 diff --git a/test/merge.gvcf.10.a.vcf b/test/merge.gvcf.10.a.vcf index da7568d60..6d8b498dd 100644 --- a/test/merge.gvcf.10.a.vcf +++ b/test/merge.gvcf.10.a.vcf @@ -1,6 +1,11 @@ ##fileformat=VCFv4.2 ##FILTER= ##contig= +##contig= ##INFO= -#CHROM POS ID REF ALT QUAL FILTER INFO -chr1 1 . A <*> . . END=2 +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT a +chr1 1 . A <*> . . END=2 GT 0/0 +chr2 1 . A <*> . . END=2 GT 0/0 +chr2 3 . G C,<*> . . . GT 0/1 +chr2 4 . T <*> . . END=6 GT 0/0 diff --git a/test/merge.gvcf.10.b.vcf b/test/merge.gvcf.10.b.vcf index 9e2840fab..0062a2ed2 100644 --- a/test/merge.gvcf.10.b.vcf +++ b/test/merge.gvcf.10.b.vcf @@ -1,5 +1,10 @@ ##fileformat=VCFv4.2 ##FILTER= ##contig= -#CHROM POS ID REF ALT QUAL FILTER INFO -chr1 1 . A C . . . +##contig= +##INFO= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT b +chr1 1 . A C . . . GT 0/1 +chr2 3 . G A,<*> . . . GT 0/1 +chr2 4 . T <*> . . 
END=8 GT 0/0 diff --git a/test/merge.gvcf.10.fa b/test/merge.gvcf.10.fa index 9cea0be52..86b1e1550 100644 --- a/test/merge.gvcf.10.fa +++ b/test/merge.gvcf.10.fa @@ -1,2 +1,4 @@ >chr1 ACGT +>chr2 +ACGTACGT diff --git a/test/merge.gvcf.11.1.out b/test/merge.gvcf.11.1.out new file mode 100644 index 000000000..ada937137 --- /dev/null +++ b/test/merge.gvcf.11.1.out @@ -0,0 +1,12 @@ +##fileformat=VCFv4.2 +##FILTER= +##reference=file.fa +##contig= +##FORMAT= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s1 s2 s3 +chr20 64249835 . T 0 . END=64249836 GT 0/0 0/0 0/0 +chr20 64249837 . T 0 . . GT 0/0 0/0 0/0 +chr20 64249838 . T 0 . END=64250066 GT 0/0 0/0 0/0 +chr20 64250067 . T 0 . . GT 0/0 0/0 ./. +chr20 64250068 . N 0 . END=64251648 GT 0/0 ./. ./. diff --git a/test/merge.gvcf.11.a.vcf b/test/merge.gvcf.11.a.vcf new file mode 100644 index 000000000..0c5cf70d2 --- /dev/null +++ b/test/merge.gvcf.11.a.vcf @@ -0,0 +1,8 @@ +##fileformat=VCFv4.2 +##reference=file.fa +##contig= +##FORMAT= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s1 +chr20 64249835 . T 0 . END=64249837 GT 0/0 +chr20 64249838 . T 0 . END=64251648 GT 0/0 diff --git a/test/merge.gvcf.11.b.vcf b/test/merge.gvcf.11.b.vcf new file mode 100644 index 000000000..3a73a061c --- /dev/null +++ b/test/merge.gvcf.11.b.vcf @@ -0,0 +1,8 @@ +##fileformat=VCFv4.2 +##reference=file.fa +##contig= +##FORMAT= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s2 +chr20 64249835 . T 0 . END=64250066 GT 0/0 +chr20 64250067 . T 0 . END=64250067 GT 0/0 diff --git a/test/merge.gvcf.11.c.vcf b/test/merge.gvcf.11.c.vcf new file mode 100644 index 000000000..b8b0a083f --- /dev/null +++ b/test/merge.gvcf.11.c.vcf @@ -0,0 +1,8 @@ +##fileformat=VCFv4.2 +##reference=file.fa +##contig= +##FORMAT= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s3 +chr20 64249835 . T 0 . END=64249836 GT 0/0 +chr20 64249837 . T 0 . 
END=64250066 GT 0/0 diff --git a/test/merge.gvcf.2.out b/test/merge.gvcf.2.out index a939338b0..4bb0c75d2 100644 --- a/test/merge.gvcf.2.out +++ b/test/merge.gvcf.2.out @@ -18,16 +18,16 @@ #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT AAA BBB CCC 2 21444416 . G <*> . . END=21444427;MinDP=5;QS=1,0 PL:DP 0,15,125:5 .:. .:. 2 21444428 . C <*> . . END=21444429;MinDP=2;QS=2,0 PL:DP 0,15,125:5 0,6,51:2 .:. -2 21444430 . TCAA T,TAA,<*> 0 . MinDP=2;QS=1.60366,0.304878,0.0914634,0 PL:DP:DV 37,0,79,35,73,113,.,.,.,.:5:2 0,.,.,.,.,.,6,.,.,51:2:. .:.:. +2 21444430 . TCAA T,TAA,<*> 0 . MinDP=2;QS=1.60366,0.304878,0.0914634,0 PL:DP:DV 37,0,79,35,73,113,113,113,113,113:5:2 0,51,51,51,51,51,6,51,51,51:2:. .:.:. +2 21444431 . CA C,CAAACAAAAAA 0 . QS=0.75,0.25,1 PL:DP:DV 0,4,10,10,10,10:4:1 28,28,28,3,28,0:1:1 .:.:. 2 21444431 . C <*> . . MinDP=4;QS=1,0 PL:DP 0,12,110:4 .:. .:. -2 21444431 . CA C,CAAACAAAAAA 0 . QS=0.75,0.25,1 PL:DP:DV 0,4,10,.,.,.:4:1 28,.,.,3,.,0:1:1 .:.:. 2 21444433 . C <*> 0 . END=21444444;QS=0.75,0.25 PL:DP:DV 0,4,10:4:1 .:.:. .:.:. 3 1 . C <*> 0 . END=4;MinDP=33;QS=0.75,0.25 PL:DP:DV 0,4,10:4:1 .:.:. .:.:. -3 5 . C <*>,T 0 . MinDP=33;QS=1.5,0.25,0.25 PL:DP:DV 0,4,10,.,.,.:4:1 0,.,.,4,.,10:4:1 .:.:. +3 5 . C <*>,T 0 . MinDP=33;QS=1.5,0.25,0.25 PL:DP:DV 0,4,10,10,10,10:4:1 0,10,10,4,10,10:4:1 .:.:. 3 6 . N <*> 0 . END=10;MinDP=33;QS=0.75,0.25 PL:DP:DV 0,4,10:4:1 .:.:. .:.:. 1 1619670 . C <*> 0 . END=1619782;MinDP=33;QS=0.75,0.25 PL:DP:DV 0,4,10:4:1 .:.:. .:.:. 1 1619783 . C <*> 0 . END=1619787;MinDP=33;QS=0.75,1.25 PL:DP:DV 0,4,10:4:1 28,3,0:1:1 .:.:. -1 1619788 . G <*>,GAAAAAAA 0 . MinDP=33;QS=0.75,0.25,1 PL:DP:DV 0,4,10,.,.,.:4:1 28,.,.,3,.,0:1:1 .:.:. +1 1619788 . G <*>,GAAAAAAA 0 . MinDP=33;QS=0.75,0.25,1 PL:DP:DV 0,4,10,10,10,10:4:1 28,28,28,3,28,0:1:1 .:.:. 1 1619789 . N <*> 0 . END=1619877;MinDP=33;QS=0.75,0.25 PL:DP:DV 0,4,10:4:1 .:.:. .:.:. 4 20000975 . C <*> 0 . END=20001021;MinDP=33;QS=0.75,0.25 PL:DP:DV 0,4,10:4:1 .:.:. .:.:. 4 20001022 . C <*> 0 . 
END=20001070;MinDP=33;QS=1.5,0.5 PL:DP:DV 0,4,10:4:1 0,4,10:4:1 .:.:. @@ -44,9 +44,9 @@ 6 630 . T <*> . . . PL 66,1,1 66,2,3 . 6 631 . N <*> . . END=666 PL 66,1,1 . . 7 701 . T <*> . . . PL 77,1,1 77,2,1 . -7 702 . T <*> . . . PL . 77,2,2 . -7 703 . T <*> . . . PL 77,1,2 . . +7 702 . T <*> . . . PL 77,1,1 77,2,2 . +7 703 . T <*> . . . PL 77,1,2 77,2,2 . 7 704 . N <*> . . END=777 PL 77,1,2 . . 8 1 . T <*> . . END=2 PL 88,1,1 . . -8 3 . T <*>,A . . . PL 88,1,1,.,.,. 88,.,.,2,.,1 88,.,.,3,.,1 +8 3 . T <*>,A . . . PL 88,1,1,1,1,1 88,88,88,2,88,1 88,88,88,3,88,1 8 4 . N <*> . . END=10 PL 88,1,1 . . diff --git a/test/merge.gvcf.5.1.out b/test/merge.gvcf.5.1.out new file mode 100644 index 000000000..74f6efbd1 --- /dev/null +++ b/test/merge.gvcf.5.1.out @@ -0,0 +1,10 @@ +##fileformat=VCFv4.2 +##FILTER= +##ALT= +##FORMAT= +##INFO= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 11 13 +chr22 10510106 . T . . END=10510112 GT 0/0 ./. +chr22 10510113 . C ,T . . . GT 0/0 2/2 +chr22 10510114 . N . . END=10510117 GT 0/0 ./. diff --git a/test/merge.gvcf.5.a.vcf b/test/merge.gvcf.5.a.vcf new file mode 100644 index 000000000..1a77662e3 --- /dev/null +++ b/test/merge.gvcf.5.a.vcf @@ -0,0 +1,7 @@ +##fileformat=VCFv4.2 +##ALT= +##FORMAT= +##INFO= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 11 +chr22 10510106 . T . . END=10510117 GT 0/0 diff --git a/test/merge.gvcf.5.b.vcf b/test/merge.gvcf.5.b.vcf new file mode 100644 index 000000000..9dda0326b --- /dev/null +++ b/test/merge.gvcf.5.b.vcf @@ -0,0 +1,7 @@ +##fileformat=VCFv4.2 +##ALT= +##FORMAT= +##INFO= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 13 +chr22 10510113 . C T, . . . 
GT 1/1 diff --git a/test/merge.mrules.1.1.out b/test/merge.mrules.1.1.out new file mode 100644 index 000000000..b22420e09 --- /dev/null +++ b/test/merge.mrules.1.1.out @@ -0,0 +1,19 @@ +##fileformat=VCFv4.2 +##FILTER= +##reference=file://hs38DH.fa +##contig= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SampleA SampleB +chr1 1769963 . A . . END=1769967 GT:PL 0/0:0,3,45 ./.:. +chr1 1769968 . T . . . GT:PL 0/0:0,3,45 0/0:0,18,270 +chr1 1769969 . CAAAACAAAAACA CAAAACA,,C . . . GT:AD:PL 1/1:0,9,0,0:405,27,0,405,27,405,405,405,405,405 3/3:0,0,0,4:181,181,181,181,181,181,12,181,12,0 +chr1 1769976 . A . . . GT:PL 0/0:0,0,0 ./.:. +chr1 1769982 . A . . . GT:PL ./.:. 0/0:0,0,0 +chr1 1769983 . C T,A . . . GT:AD:PL 1/1:0,9,0:405,27,0,405,405,405 2/2:0,0,4:181,181,181,12,181,0 +chr1 1769990 . CAAAACAAAAACA CAAAACA,,C . . . GT:AD:PL 1:0,9,0,0:405,27,0,0 3:0,0,0,4:181,0,0,12 +chr1 1769991 . C T,A . . . GT:AD:PL 1:0,9,0:405,0,405 2:0,0,4:181,181,0 diff --git a/test/merge.mrules.1.2.out b/test/merge.mrules.1.2.out new file mode 100644 index 000000000..c935f2b7c --- /dev/null +++ b/test/merge.mrules.1.2.out @@ -0,0 +1,19 @@ +##fileformat=VCFv4.2 +##FILTER= +##reference=file://hs38DH.fa +##contig= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SampleA SampleB +chr1 1769963 . A . . END=1769967 GT:PL 0/0:0,3,45 ./.:. +chr1 1769968 . T . . . GT:PL 0/0:0,3,45 0/0:0,18,270 +chr1 1769969 . CAAAACAAAAACA CAAAACA,,C . . . GT:AD:PL 1/1:0,9,0,0:405,27,0,405,27,405,405,405,405,405 3/3:0,0,0,4:181,181,181,181,181,181,12,181,12,0 +chr1 1769976 . A . . . GT:PL 0/0:0,0,0 ./.:. +chr1 1769982 . A . . . GT:PL ./.:. 0/0:0,0,0 +chr1 1769983 . C T,A . . . GT:AD:PL 1/1:0,9,.:405,27,0,.,.,. 2/2:0,.,4:181,.,.,12,.,0 +chr1 1769990 . CAAAACAAAAACA CAAAACA,,C . . . GT:AD:PL 1:0,9,0,0:405,27,0,0 3:0,0,0,4:181,0,0,12 +chr1 1769991 . C T,A . . . GT:AD:PL 1:0,9,.:405,0,. 
2:0,.,4:181,.,0 diff --git a/test/merge.mrules.1.a.vcf b/test/merge.mrules.1.a.vcf new file mode 100644 index 000000000..cd09132fe --- /dev/null +++ b/test/merge.mrules.1.a.vcf @@ -0,0 +1,16 @@ +##fileformat=VCFv4.2 +##reference=file://hs38DH.fa +##contig= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SampleA +chr1 1769963 . A . . END=1769968 GT:PL 0/0:0,3,45 +chr1 1769969 . CAAAACA C, . . . GT:AD:PL 1/1:0,9,0:405,27,0,405,27,405 +chr1 1769976 . A . . END=1769976 GT:PL 0/0:0,0,0 +chr1 1769983 . C T . . . GT:AD:PL 1/1:0,9:405,27,0 +chr1 1769990 . CAAAACA C, . . . GT:AD:PL 1:0,9,0:405,27,0 +chr1 1769991 . C T . . . GT:AD:PL 1:0,9:405,0 diff --git a/test/merge.mrules.1.b.vcf b/test/merge.mrules.1.b.vcf new file mode 100644 index 000000000..8345153a7 --- /dev/null +++ b/test/merge.mrules.1.b.vcf @@ -0,0 +1,16 @@ +##fileformat=VCFv4.2 +##reference=file://hs38DH.fa +##contig= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SampleB +chr1 1769968 . T . . END=1769968 GT:PL 0/0:0,18,270 +chr1 1769969 . CAAAACAAAAACA C, . . . GT:AD:PL 1/1:0,4,0:181,12,0,181,12,181 +chr1 1769982 . A . . END=1769982 GT:PL 0/0:0,0,0 +chr1 1769983 . C A . . . GT:AD:PL 1/1:0,4:181,12,0 +chr1 1769990 . CAAAACAAAAACA C, . . . GT:AD:PL 1:0,4,0:181,12,0 +chr1 1769991 . C A . . . GT:AD:PL 1:0,4:181,0 diff --git a/test/norm.4.1.out b/test/norm.4.1.out index 6df7de93d..a786de4c1 100644 --- a/test/norm.4.1.out +++ b/test/norm.4.1.out @@ -4,4 +4,4 @@ ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample 1 10 . T C . . . GT 0/1 -1 10 . T TAC,TATAC,TATATAC,TATATATAC . . . GT 2/0 +1 10 . T TAC,TATAC,TATATAC,TATATATAC . . . 
GT 0/2 diff --git a/test/norm.merge.2.out b/test/norm.merge.2.out index a81c76bc7..d3c14147c 100644 --- a/test/norm.merge.2.out +++ b/test/norm.merge.2.out @@ -33,6 +33,6 @@ ##FILTER= ##FILTER= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT XY00001 XY00002 XY00003 -1 105 . T C 999 PASS . GT:FGI:FRI 1:1,2:3,4 1/0:5,6,7:8,9 1/0:.:.,. +1 105 . T C 999 PASS . GT:FGI:FRI 1:1,2:3,4 0/1:5,6,7:8,9 0/1:.:.,. 1 110 . C A 999 PASS . GT:FGI 1:1,2 0:3,4 0:. 1 150 . A C 999 PASS . GT:FGI 1:1,2 0:. 0:3,4 diff --git a/test/norm.phased-join.1.out b/test/norm.phased-join.1.out new file mode 100644 index 000000000..6f5c01718 --- /dev/null +++ b/test/norm.phased-join.1.out @@ -0,0 +1,6 @@ +##fileformat=VCFv4.2 +##FILTER= +##FORMAT= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample +20 1 . A C,G,T . . . GT 3|2|1|0 diff --git a/test/norm.phased-join.vcf b/test/norm.phased-join.vcf new file mode 100644 index 000000000..9afdad96f --- /dev/null +++ b/test/norm.phased-join.vcf @@ -0,0 +1,8 @@ +##fileformat=VCFv4.2 +##FILTER= +##FORMAT= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample +20 1 . A C . . . GT 0|0|1|0 +20 1 . A G . . . GT 0|1|0|0 +20 1 . A T . . . GT 1|0|0|0 diff --git a/test/norm.phased-split.1.out b/test/norm.phased-split.1.out new file mode 100644 index 000000000..9afdad96f --- /dev/null +++ b/test/norm.phased-split.1.out @@ -0,0 +1,8 @@ +##fileformat=VCFv4.2 +##FILTER= +##FORMAT= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample +20 1 . A C . . . GT 0|0|1|0 +20 1 . A G . . . GT 0|1|0|0 +20 1 . A T . . . GT 1|0|0|0 diff --git a/test/norm.phased-split.vcf b/test/norm.phased-split.vcf new file mode 100644 index 000000000..2b44d34be --- /dev/null +++ b/test/norm.phased-split.vcf @@ -0,0 +1,5 @@ +##fileformat=VCFv4.2 +##FORMAT= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample +20 1 . A C,G,T . . . 
GT 3|2|1|0 diff --git a/test/norm.right-align.1.out b/test/norm.right-align.1.out new file mode 100644 index 000000000..f72145e5a --- /dev/null +++ b/test/norm.right-align.1.out @@ -0,0 +1,10 @@ +##fileformat=VCFv4.2 +##FILTER= +##contig= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +7 897 . GGAATTAAGA G . . . +7 910 . G C . . . diff --git a/test/norm.right-align.2.out b/test/norm.right-align.2.out new file mode 100644 index 000000000..0f5e30ff8 --- /dev/null +++ b/test/norm.right-align.2.out @@ -0,0 +1,10 @@ +##fileformat=VCFv4.2 +##FILTER= +##contig= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +7 900 . ATTAAGAGAA A . . ORI=7|897|GGAATTAAGA|G +7 910 . G C . . . diff --git a/test/norm.right-align.fa b/test/norm.right-align.fa new file mode 100644 index 000000000..31ea7c86c --- /dev/null +++ b/test/norm.right-align.fa @@ -0,0 +1,488 @@ +>7 7:154955-184155 +TGAGGGCTGAGGTGACCCTTGTCTCTGTGTTCTTGTCCCCCCCAGCTTGTGGAGCCTCTT +ACACCCAGTGGAGAAGCTCCCAACCAAGCTCTCTTGAGGATCTTGAAGGAAACTGAATTC +AAAAAGATCAAAGTGCTGGGCTCCGGTGCGTTCGGCACGGTGTATAAGGTAAGGTCCCTG +GCACAGGCCTCTGGGCTGGGCCGCAGGGCCTCTCATGGTCTGGTGGGGAGCCCAGAGTCC +TTGCAAGCTGTATATTTCCATCATCTACTTTACTCTTTGTTTCACTGAGTGTTTGGGAAA +CTCCAGTGTTTTTCCCAAGTTATTGAGAGGAAATCTTTTATAACCACAGTAATCAGTGGT +CCTGTGAGACCAATTCACAGACCAAAGGCATTTTTATGAAAGGGGCCATTGACCTTGCCA +TGGGGTGCAGCACAGGGCGGGAGGAGGGCCGCCTCTCACCGCACGGCATCAGAATGCAGC +CCAGCTGAAATGGGCTCATCTTCGTTTGCTTCTTCTAGATCCTCTTTGCATGAAATCTGA +TTTCAGTTAGGCCTAGACGCAGCATCATTAAATTCTGGATGAAATGATCCACACGGACTT +TATAACAGGCTTTACAAGCTTGAGATTCTTTTATCTAAATAATCAGTGTGATTCGTGGAG +CCCAACAGCTGCAGGGCTGCGGGGGCGTCACAGCCCCCAGCAATATCAGCCTTAGGTGCG +GCTCCACAGCCCCAGTGTCCCTCACCTTCGGGGTGCATCGCTGGTAACATCCACCCAGAT +CACTGGGCAGCATGTGGCACCATCTCACAATTGCCAGTTAACGTCTTCCTTCTCTCTCTG +TCATAGGGACTCTGGATCCCAGAAGGTGAGAAAGTTAAAATTCCCGTCGCTATCAAGGAA +TTAAGAGAAGCAACATCTCCGAAAGCCAACAAGGAAATCCTCGATGTGAGTTTCTGCTTT +GCTGTGTGGGGGTCCATGGCTCTGAACCTCAGGCCCACCTTTTCTCATGTCTGGCAGCTG 
+CTCTGCTCTAGACCCTGCTCATCTCCACATCCTAAATGTTCACTTTCTATGTCTTTCCCT +TTCTAGCTCTAGTGGGTATAACTCCCTCCCCTTAGAGACAGCACTGGCCTCTCCCATGCT +GGTATCCACCCCAAAAGGCTGGAAACAGGCAATTACTGGCATCTACCCAGCACTAGTTTC +TTGACACGCATGATGAGTGAGTGCTCTTGGTGAGCCTGGAGCATGGGTATTGTTTTTGGT +ATTTTTTGGATGAAGAAATGGAGGCATAAAGAAATTGGCTGACCCTTATATGGCTGGGAT +AGGGTTTAAGCCCCTTGTTATTTCTGACTCTGAAACTTGCATTCAATTCACTCCACCAAG +TTATCTCATCTTTGAAATGGCTTTTTTTAAAGGTGCCTAGAATATGATGGCGTGCAGTCT +ATAAACTGTTGCCCACCTTCTGTACTTTCTCTCAGAATAATTCACATTCTTCTCCAGTGT +CTGTTGATTGTTACTTTGTGGAATAAGTTCTTGGAAAATTCCACAAGATTATTGTTATCT +TCTTACTACCAATTCTATTGAACTTTCTCCACCTTCTCTGGGCCTTCCCCAGCCAGTGGT +GGGAAGATGCTGGCTGGAGTCTGACAGAGCCTCTTCTACACTGGCCTGGGCTTGCTGTGA +GTTGGTGGAAACCTTTGCTCTTGTCCCAACACAGAGCAAGTGAAAGAGGAGGTCAAGGGG +CTCAGGCAGCGGACTAGGGAAGCAGAATCGAGGAAAAGGAAAAATGGCTGACTTATTACC +TCAAAACTCTAGAGAATTTAGTTGATCTTACAGCCAAGAAGGACAAAAGCCAGAGAGTAA +TATCCTCCGCCTCATGTCTAACCCACAGAATACATAGCAAGTAAAGAGAACATGGGCCTT +TATAAAAATGTCTTAAGATACAATTTTTTAATTGGAGGAAATCTACAGTTTAATTTTCTC +TGGGCAGCTTTTCTTCCTTTTATTATAGTAGGGGAAATCCCATGTTGATATACTTCTAAA +TGAAAGATGATGAATTGATATAATACAATAAAAAATCTGTAAAATTGATGATATACTTAT +CAAGAAAAATTAGCTTTCATTTTAACGGTTTACAAATTGAGTCAAGTCCTAGTAACAAAA +TGTTAAGTCTATTAACATAACCACAAGAAATACAGGAAGACGGGCAATCTGTGAAGCCTT +TCACTTACAATCTCTGGCCCCTCACCTGTGCTGTGTAGGAAAATCTTTGTGCACAATTTG +CTTCCTTAATTCATTTTTTATTCATTCAACACATTCTAATAAATTATACAAAATCATGTT +GAAATGTGAATTTCAGTGGTATTTATAAATGCAGTGTGAGGAGGGTTTGGATGTATTCTA +AGACAATAGTTGTGCTTTGGGAAGGAAGCAGTGTTCACTGAAAAGTGCCCCCAGGACCTT +TTAATTGGAGGAAATATGCTTCTGTGGAGTTGGAAATGGGGTAGAAGATAGATAAGGTCA +AGGCTTAAAAGTTAAGTGCACCCAACATCTGAAGCGTCCATGGGCCTGGCATGGTGGCTT +TCGCCTGTAATCCCAGCACTTTGGGAGGCTGAGGCAGGAGGATCCCTTGAGCTTAGGAGT +TTGAGACCAGCCTGGGCAACATACTGAGACCCAGTCTCTACAAAAAATAAAAAATTAGCT +GGGTGTGGTGTCTCATGCCTGTAGTCCCAGCCACTCAGGAGATGGGAAGATGGCTTGAGT +CCAGGAGATCTAGGCTGCAGTGAGCTAAAATCTCACCACTGCACTCCAGCCTGGGTGACA +AAGCAAGACCCTGCTCAAAAAAATAGTTAGATATAAATATTAATATAGATACCTATATAT +ATCTGAATATAGATATCTATATATACTCTGTATATAGTTATTTAGATATATAAATATATA 
+TGATATATATTTAGAGAGATATATATTTAGAGAGATATATATTTAGAGATTTATATATAT +TTTATATATATTTAGAGATATATATCTCTAAATATATATCTCTCTCTAAATATATATATA +TCTCTCTCTAAATATATATATATCCCTAAATATATTAAATAAATAAAAGAAATAAAAGAA +AGCTCAGTTTGGCCTCCTGCTTGTCCTGTCTCCTCATCCCCTCTTCCCCCTCCATCATTT +TATTTCCTTGCCCCATGTTTCTTCACTGCGGCCATGTCCCCCCTCCTCTCCAATGATGGA +TGTCATGTCTGCTGCAGTCAGAGGGCGACAAGCCTGGAGTGTTCCCTGAAGCCTGTGGTT +TGTGGTTTGTCCTGCAGCTCAGGCTGCCCAGGCCTCACCAGCAATCCTGGCGGGCAGGGC +ACCACACTGGGATGGAGAGGGGGAAGCTGGAGGAGGCACTTTCTGGTAAAGAAAGCAAAA +GCCAGCAGTGCCCAGGCCAATTTCAACAGGGAGTTAAATAGCACCTTAATCCTGTGGCAG +GACAGCTCATGGGGCCATGTGTGCTCTTAGAAAGACTCACATGCACGCATGCACGGCAGC +AATGACTCCATACTCACGTTCCCCTGCAGACACCAGGCCCCCACAGCCGGCACACACACT +GCAGCCCCAGTTCCATGTTGCTAGCAGTGGCTTAGTGAATGAGTAAAGTTCTTAAAATGC +AGGGGACACCTGCCCTTCATTCATAAGGCTGGACGTACACCTCTCCTTAAGGAGTTCAAG +AGCTAGTGGAATCCCAATTCATACGGTAGAGCCATTCACAGATGAGAGAGACAAGCCAGA +AGGAAGGAACCAAAAGTCATGTCAGCAGTTAGGACAAAATAACAGGCTTTCAAGGTCACA +AAGCCTCAGGGACACTCCTGCGGTGGGACTGGGCTAGGAGCCATGGGGGCTCCAACTGTG +CGCTCTGCCTGCCAGCCTGTGGGTGCTGGGGCTCCACGAAGATTGTTGTGGAATACCAAG +CATGCTTGCTGTAGGTCACGGTGCACGTTTACTACTTCCAAGACAAACAGCCGAGAACAA +AGCTCGCTTTAGCTTCTGCGTACACCGAACGGGACACACGACTGAACAGCGTTCCCATTG +TGCCTGCTGGGTGGGGAGGAAGTGATGGCCCAGTGGGTCTATCAGATGTTAGTAGGATGG +GGCCTGGCGGGGCTCCAGGCTCTGTGTGGCCGACACCCACGCCCCCCGCTCTGCTCCCCA +TTCCCAGCCCCAGGTCAGCCCTGCGAGGCCCTGCAGCAGATGGGCTGCTCAAACTGCTCT +GGTTTGCAGATTTTTCTTCCCTCTCAAATGAATACAATATGTTTTCAAGTCTCAACCAGA +TCTTGAGAAAATAGGAAGAGCCAGAGGGTTTCTTTGGTGTTATGGTTGTACAGCTTCCCA +GACTCCGGGGGAGAGATGTGATTTGTGCTTTCTGGCAATCCCATGGCGTATTAAATTTTC +ATAGGCTTTCCAGTTTAAATTTAGGGTAGGCAATGGAAGGGAACGCAAAACAGATTTCTA +GGTGTACTGTGTGTGTGTCTCCCACGTCTAAAGTCTGTTAACTGGAGCACCCAACAGGCC +CCACAGGCTGCCTTCACACAGAGGACCTGGGGCGCCTCCGACCCATTGGGGTGAGCAGTG +GGCCATGGAGGGAGCCAGGGTCAGGAGACCTGGTTGTGGGCCTGACCTGACCCTGCTCAG +GGTGGCCTCAGGTGGGCCGTTCACCTCGTCAGCCTCAGCTTACCCTCTGACTACAGTGAC +CTCAGACAAAATACGCTTCCTGGCCCTGTCCAGTTCTGACTTTTTATAAACAAGCACTTA +TCCAAGTTAAAGGGATATTTTCAATATCTACTGAGTCCACAGATATTAAATATCTCCTCT 
+CTTCTTTAAAATTGTGGCATTATCTTTAGAATATAAAAGGAAAATAACACACACTCTCCT +TGAAAATAGAGAGCCTAAACACTCTGCAGGAAATATTTAAAGCTATAGTTTTTGTTTGTT +TGTCTTGAATGCAAGTGGCCTGGACTTTGACTTGCTTTGAGTCTTTGACCTTCATGACTT +CAGTACAGTTCAACCCTGACAGTTTTGAAGTAGGTATGTGCCTAGATCTGCCCTAGTCCC +TGCTGGAATGTTGAAGAAGCAAAGGTCCAGGCCCTCAGAGCACTTGCCACGTACTTGCCA +ACAGATACGGGGCGGAGACTTGAGTCAACGTAAGAGCAAGTGTGTGCCGGGTGATCCGAC +ACTGCAGAGCGCCAGCTAGACCCTAAGCGTGTGCTAGGGGCTGACCAAGCCGTTCTTTCC +TCAAAAACTTGGTGGGGAGGGTATTTTTAAAATCACACAAATATTTAAGTACAGATTATG +ATGACTGCCTCAAAGCAGTGGCTCTTCAGCTTCATCAAGCTTCAGAGTCCAGAGGGTTTG +TTCATATGGAAGGCTAGGCCTGTCTCCTGCATTTCACCCTCTTGGCCTGGGGGCGGGACC +CAAGAATGTGTGGCTCTAAAAGGTTCCCAGGCAATGCTGAGGCTGCTTTCTGAAGGAAAA +ACTGCAAGATACCAGGAGAGTTTCATTTAGATTGAAGAGTCGAGGAAGGCTCCTCTGAGA +AAGAGTCTGCTAAGGAAGGAGGAGGTGGGTTCTGGGGACAGAGGTTCTCCCGTGGGTAAG +GGTGGAGGGAAGCTCTCCTGGGGAGAAGGTGGGCAGGAGGACCAGAGGCTGGAGGGAGGA +GGGCAGTCAGCCTCGGGGCTTCCCAGGAACAGGGACGGCCAGGGCAGGGTTTAGGGCAAG +GAAAGCGTGTGAGCATATTTGTATTTTAGTAAATATTTACAGTTTGCCCTCCATGTCTGC +AGTTTCATATCCATGGATTCAATCAACCACAATGAAAAACGTTGGGGAAAAAAATTGCAT +CGGTACTGAACATATACGGACTTTTTTTCTTGTCATTATTCCCTAAACAATACAGCATAA +CAATTATTCACATAGCATTTGCACTGTATTAGGTACTATAGGTAATCAGGAGATGCTGTA +GATGGGAGGATGTCTGTAGGTTACACACAAATGCTGTGCCACTTTATATCAGGGGCTTGA +GCATCCTCACATTTTGATATTTAAGGGAGGTCCTGGAACCAATTCCCCAGATACTGAGGG +TCCACTGTCTGTGTCCCCTCGCCCCACCTTGCCTTTGTCTCCTGTCTCCTATCTCCACCC +TGCCTCCCGCCAGCCTGTTGCTCCTGACCTGCCCGGGCACCCTGGAGCAGCACCCTATCT +CAGAGCCTGGCTCAGTGTGTTCACTTCTGCAGAGAAACTAACTTGCCCAAGTCCACACTC +AAAACATAGGCATTGCTGAGATGTGAAAAGCAGCTGTGGATGCTTTCTGCTACAGTCTGT +GTGTTCTTTTCCATATCTGAATAAAAGGTCACCACCATTTGTATTTTAAAGAGAAAGAGA +ATTTATGGGTGGAAATTGGGGATTCCCTCATTCTCAGTCAGACAGAAAAGAGGGCCCCAT +TGTGTGCCTGATTGCAAATAAATTTAGCTTCCTCAGCCCAAGAATAGCAGAAGGGTTAAA +ATAAAGTCTGTATTTATGGCTCTGTCAAAGGAAGGCCCCTGCCTTGGCAGCCAGCCGGAA +TTAGCAGGGCAGCAGATGCCTGACTCAGTGCAGCATGGATTTCCCATAGGGAGCCTGGGG +GCACAGCACAGAGAGACCACTTCTCTTTAGAAATGGGTCCCGGGCAGCCAGGCAGCCTTT +AGTCACTGTAGATTGAATGCTCTGTCCATTTCAAAACCTGGGACTGGTCTATTGAAAGAG 
+CTTATCCAGCTACTCTTTGCAGAGGTGCTGTGGGCAGGGTCCCCAGCCCAAATGCCCACC +CATTTCCCAGAGCACAGTCAGGGCCAAGCCTGGCCTGTGGGGAAGGGAGGCCTTTCTCCC +TGCTGGCTCGGTGCTCCCCGGATGCCTTCTCCATCGCTTGTCCTCTGCAGCACCCACAGC +CAGCGTTCCTGATGTGCAGGGTCAGTCATTACCCAGGGTGTTCCGGACCCCACACAGATT +CCTACAGGCCCTCATGATATTTTAAAACACAGCATCCTCAACCTTGAGGCGGAGGTCTTC +ATAACAAAGATACTATCAGTTCCCAAACTCAGAGATCAGGTGACTCCGACTCCTCCTTTA +TCCAATGTGCTCCTCATGGCCACTGTTGCCTGGGCCTCTCTGTCATGGGGAATCCCCAGA +TGCACCCAGGAGGGGCCCTCTCCCACTGCATCTGTCACTTCACAGCCCTGCGTAAACGTC +CCTGTGCTAGGTCTTTTGCAGGCACAGCTTTTCCTCCATGAGTACGTATTTTGAAACTCA +AGATCGCATTCATGCGTCTTCACCTGGAAGGGGTCCATGTGCCCCTCCTTCTGGCCACCA +TGCGAAGCCACACTGACGTGCCTCTCCCTCCCTCCAGGAAGCCTACGTGATGGCCAGCGT +GGACAACCCCCACGTGTGCCGCCTGCTGGGCATCTGCCTCACCTCCACCGTGCAGCTCAT +CACGCAGCTCATGCCCTTCGGCTGCCTCCTGGACTATGTCCGGGAACACAAAGACAATAT +TGGCTCCCAGTACCTGCTCAACTGGTGTGTGCAGATCGCAAAGGTAATCAGGGAAGGGAG +ATACGGGGAGGGGAGATAAGGAGCCAGGATCCTCACATGCGGTCTGCGCTCCTGGGATAG +CAAGAGTTTGCCATGGGGATATGTGTGTGCGTGCATGCAGCACACACACATTCCTTTATT +TTGGATTCAATCAAGTTGATCTTCTTGTGCACAAATCAGTGCCTGTCCCATCTGCATGTG +GAAACTCTCATCAATCAGCTACCTTTGAAGAATTTTCTCTTTATTGAGTGCTCAGTGTGG +TCTGATGTCTCTGTTCTTATTTCTCTGGAATTCTTTGTGAATACTGTGGTGATTTGTAGT +GGAGAAGGAATATTGCTTCCCCCATTCAGGACTTGATAACAAGGTAAGCAAGCCAGGCCA +AGGCCAGGAGGACCCAGGTGATAGTGGTGGAGTGGAGCAGGTGCCTTGCAGGAGGCCCAG +TGAGGAGGTGCAAGGAGCTGACAGAGGGCGCAGCTGCTGCTGCTATGTGGCTGGGGCCTT +GGCTAAGTGTCCCCCTTTCCACAGGCTCGCTCCAGAGCCAGGGCGGGGCTGAGAGAGCAG +AGTGGTCAGGTAGCCCTGCCTGGGTGCTGGAGACAGGCACAGAACAACAAGCCAGGTATT +TCACAGCTGGTGCGGACCCAGAAAGACTTCTGCTTTTGCCCCAAACCCCTCCCATCTCCA +TCCCAGTCTTGCATCAGTTATTTGCACTCAACTTGCTAAGTCCTATTTTTTTCTAACAAT +GGGTATACATTTCATCCCATTGACTTTAAAGGATTTGCAGGCAGGCCCTGTCTCTGAGAA +TACGCCGTTGCCCGTCATCTCTCTCCGACAGCAGGGCAGGGGGTCCAGAGATGTGCCAGG +GACCAGAGGGAGGGAGCAGACACCCACCCGGCCTGGGCAGGTCCTCCTCATTGCTTGCAT +CCGCCTGGTTAGCAGTGGCAGTCAGTCCTGCCGAGTCATTCGTGAGGCGCTCACCCAACT +CCAGGCAGATGTAAAAGGTGACCTACAAGAAGACAAACAAAAACATCTGGAGCGCTCTTA +TGCCAGCATCTGCCCTTGACACCACCAGGCAGGCTGTTGCTGGGAGCCGTGGTGCTTGGG 
+TAAGCTCCTTCCCATGGCAGAGCTCCTGGGACGCATTGTAGAAGCAGGGACCACCTCCCA +GGATAACCAGATAGCAGCACACCCTGCACAGCCCCTTTTACTCCAGCATCATCGGGCATT +GATATCTCAGCTGCAGCCACAGGCGGCCCCCAGCACCCCAGGAAGTGGGGAGCGCTCATG +CTTCTCTGAGCACAAAAATCACTGAATATTTTTGCCATTCTCATGGTCATAACCCGGGCC +ACAGAGTAGAACACTCCTATCACTGTTGTTAGACAGTGGTCCTGGGAGAGGGTCTTGTGT +GCCTCGGATGCCAGGGCCTCTTTTTATTGGGAGGTGCTTGTTATTTCTGTGTGTGGCTGC +ATTTGTTTCCCAAGACTGCCACAACAAATCATCACCAACTTGGTAGCTCAACATAGCACA +GCTTTATTCCCTCCTGGCTCTGGAGGCCAGGTGTCTAAAAGGCCATGCTCCCACAATGGT +TCTGAGGAGGATCCTTCCTGCCTCTCTGGCTTCTGGTGGCTCCAGCATCCCTGGGCTGTG +GCTGCACCTCCCCATGTCAACCTCCGTCTTCACAAGGCCTTTTCCTGTGTCTCTGCAACC +ACAGGCCCCTCTCCTTTCTCTTAATAAAGATACCAGTCATTGAGTTTGAAAATTGCTAAG +AGAGTCTGTTGTAAATCTTCTTAGCACAAAAAAAAATGACAGATATGTGAAGTGGTAGAT +ATATTAATTAGTTTGATTTGATCACTCCGCTATGTGTATAAATGTCAAAACAAACATTGC +ACTCCATAAATATATATATTAAAAAAGATCCCAGTCATTGCATTTAGGACCCACCCTAAA +TCCAGGATGATTTCATTTCAAGACTTTTAACTAGATTTGCAAAACCCCATTTCCAAATAA +GGTCACATTCTGCAGTTTTGGGTAGACGTGAAATGTGGAGACACTGTGCAACCCACTGTC +TTGGGGAGGGGGTGGTCAGCCTGGGGCAGATGTTGCTGGGTGTGGAGCTACATCCACTCA +TGCCCTGACCTGGAACCCAGACCTGCTTCCCCAGCTCTCCTCCTGGTTATCTGAAGCAGG +GAATGGAGAGCACTGCCCTCCTTGCCCAGGCAGTCTCTATCACCTGGTTTTAGTTTCTTC +TTAGCACATATTGCCCCAGAATATCTGGTTGGTTTATGGCTTACTTGAGTTTGTGCCTAC +CTGTCCCAACCGGGAGGTGAGCCCTGGCTATTCCCCAAACCCGGCCCTGCATGTGGGAGC +TGCCCTTCCTCCGTTCATCAGAGGGGGCCAACAGTCCACAGCTGTTCTTAATCATCTCCC +AGTAACCCCCAGCTCCACAAAGGTGACTCCTTACATGGTGGAGAGGTGGTCGGGCCATCC +GTGTGAAATGTGTATGTGACCGTTTTCCTTAAGGGGCACGTAGTCTTGGCAGGTTTCGCT +CAATATAGGATGAGCTCAGGACTCCAGTGGACTGTGGATTCAGATCTGGATTCTGGCGCA +TTCGCCGTGTGAACGGGGGCACGTTGCTGGCCTGTCTGCGCCTCGTCTCCCGACTGTGGA +GTGTGTTCTGCCCCTTGTCTTTCTGGGAGGTAGGGAGGGCAGTGAGCCCCTTCGCATCGC +CCACCACAGGCCCAGCACATGGCTGATCCCCACTGAGTGTTCTTTTCCTCCTTTGATCCC +CTTTGGCTGACCTAGGTTGGAGCAGCCACTAAAATATACCCAGAAACATCTTCCTAATCT +ACATCTGTGCCAACCCTCATTCCCTGGCGCAGCATGACCATCACATGCCCGCCATTGTTC +CTGATCTCTGCTGCTCATGACCTGCTCTCCAGCGCTCCTTCTCATGCTCACATTCCAGTT +GGCCTGACCTAGATAAGTGGAGGTTTATTTGACCCCAAAAATTAGCCTTCTACAAACGAA 
+TATAATAGTGTCCATTACAGAGAATAAACTTAGTGCGTGTCCCATTTAAGCAGAAGTTAC +TGAAAGCCTGAGTTTAAGTTTCCAGGGCCTGAAAGTTTTCCATGACAGTTTTCTGCATAA +TATTACCTACAATTTCAATCTGTTATTTAAAGCCATTCTTGTGTTTGTTGTACTTTGATT +AGCTTTATTTTGATTTGAAGTCCTTTTACATTACGGGCAGTTAACGCTTTGTCTCTGTTA +GATTTGCTTTTTAGTTCACAAGAGAAACCTCATTCCTCTGTATTTGAATAGTTGCAATGA +TGGAACAGCTGTCCCTGGAGGGAAATGAAAACAGTGATTCCCCAAATTGTGACAATAGAA +ATTTGCTCTTGGGTTACTTACAATGTATCTGAGTATTAAAAAATTTTCTTTTTAAACGTT +TGAAGTAAAACTACCCAGAAACACTTAGTGGCTGACCAGAAACTAAACTCCTGGCATCCT +CAAAATGGGATTTATTGGCTTATAAATGTCCTGTGTTGACTCACAAAGGCACAAACTATC +TAGGTAAGTTTTCTTCTAAATGTTGATGGGAGAGCTGGCCACTGTTATGCAAGTTTCATT +GTCCTGACTAAACTGCCAAAGAGATTACATAAAATTATATCAACTAGACAAAAGGAAAAA +GGAAAAAAAACAGAGGTGTCTTGGGAGGAATCCATATGAGACCAGTAGACCATGAGAGAG +ACATCCCTTGCCATCTACAAGGAAAATGGATTTTGTTCTCCATATGCAAAACCATCTCAG +GAGCTTGCGGAGACACCACTTGCTTACTAGCCAGAAAGAGCAGGTGCCTCCTAAATTCCC +CACACAGGAGCTCACAGTGGCTTTCATGCACTGGGATTAAGTTAGACTTAAGAAAGCCTG +TCTACTCTTCCTGGGATTTACAAGCCAGCTAGTAAATCCCAGAATAAATCACACGGCACA +GTCATCCAAAGATCCCGTCATCCGTGCCGTTTGGAAAGCCCTGCTCCTGTGCCACCCTCT +CCCCGTGGAGCCTCCCATGCCCAGGACTGCAGAGTCCTGCCATTCAGACTGCAACTCATC +TCACATTCTTCCAAACTATTTGGACAACAGAGCTTTCTCATCACCTAATGCAGATTACAG +TCTCACAGAATTGAGTGTTCAGGCAGACACTGATGTGGTTCTGTAGTACAGCAAACAATA +TCAGTTTACAGTCCTGAGGCCAGGCCTGGTGAACAACGCACGGTAGCGGTGGGGCAGGGT +TCTCAGAATGAAACTGGCTTACACATGGCACTCTCTGACCACAACTGTATAAGCACCAAA +CTACACTTAGTTCCATCTATGAGGTAAAATTTAATGCAGATGAACATCAAAGAAAACGTC +AAAGGCTCCTTTTTACAAGTACGTGGGCTACTTAATTTGGTCCAAGTCCATTTTAAAAAG +CCCTAGGTGCTTTCACGGCTCTGCTACTGACAAGAAGCCCCAGTGCCTGTGAGCTGCTAA +TGGGAGGGAGAGGAAGATGAGCTGAGTGGGCCGGGCTATCCCGTCCACACCGGGAGACAG +GGAAGGAGACTCCAAGCTGGTGGTGCCAGCACATTCCAGGCCACTCAGGCCTATTCCTAG +GTGCCAGGTCACGAAAACCACGCTGACAGATCGTGCTGTGTGCGTGTCATAGCACACAAG +CAGGACTGTGAGAGAGTGAAAGTGACACTGGGTGGAGCACTGAGGAAGGGCCACAGTGTG +TTGGTGGAGATAGGCTGTCATGGAGAAGAGACCCTGGCTTGCTCTACATTGCTTCCAATG +CAACTGCAAGGCAGGTCCCAGAGGGCTCCGGCCTTCGTCATCCAGGTTTGCTCCCTCCCC +TCATGGCTTTCCCATCCTCAGATGAGGACTCGGCAGAGCCTACCCCTGCTGACTAACTGT 
+GGCCCCAGGGTGGTGACTCAGCCCTGCACCTCCTGATCCCGTCTGCACTGGGCCAGAGAG +GATGACTTACCCAGCACGTTCACATCACACAGCTTTGTGGATTCCTAGGTCCAAGGACCA +GAGATTTCAGTTATGTGAGTTATTTTTTTTATTTGTTCTTGCGTATTCCACAAAGGGTCG +CAGCTAAACTTAACCTAATGATCACTTTAGTATATCACTAAAAAGACAAAGCTCACAGTG +CTGTTGAAGCACATTCATCATCTTTAGACATTTTGACTAGTTATTTCTTAAGCATTTACC +TGCTAGTGTTAAGCATCACATGAAATACATATAGAAGTAAGACAAAATTTCTTATCTCCC +CAAGTTTGCCAACAAATACAGAGCAGGAAGGGAAGCAGGTCAGAGCAGGAGGCGCAGCTA +TAGTGAGGCCACCATGCAAGGCACAGGGAGGGTGAGCTCCAAGTTTGAATGGAATGGGTC +TGTCAGCCAAGCCCCCTGGCTCTGGGAAGATAGCAGTGAACAAGCCAGATGGCCCCTCAC +CCTCCAGAGCCGTGAGTCCTGCAGACCAAACAGCGTGACAGGTCCTTTCCCTGTCCAGGA +GGCCTCTGTGGGTGAGAGTTGGCTGCGGACAGGGCGTGAAGGCACTTGAGGGTGGGGAAG +TGACTCTGACTGGGAGATGCTGAGGACAGGGAGGAAACCACCAGATAAGGGACACTGGGG +AGGAGGGGTGGACCCCTCAGGGCCAAGCACATGGAGCCTCATCACAAAGGCAAGATGGTG +GCCAAATTCAAGGTCGCTGCAAAAGGAATGGAGAAGAGAGAATAGATTTGGCATTTGGAG +GAAATGGTGACAATCATGAGCACCTACCCGGGACTCTCCATGGGTGCTATCTCTACATAA +ACTCATTCCACCCTCTGATTAATCCATTCTACATATGGGGAAACAAAGGCATGCGGTGTT +TACGTCACTTGCCAAGATCTCAGGATTTGATCCAGGTGGCCTGGTTCCATGGTGCAGCCT +CTCAGCCTGCATGGATGCCCCAGCTCAGAGCATGACTCTCAGGACAGGGGTCCCAGCAGC +CCTCCCTCCCTGAGCAGCAGGGTGCCCGTGCTGCACCACTTCTGTCTAGGAATAGGACAT +TCTGACACTTTCCTGCCTCTTCCGAGGTCTAGCACTTACTCTATGCCTGCCTGGGAAGGT +GGCAAGCTGGCCTGAGGAACAGACTCTTCCATTTTTTAGGGAGCTCAAGGCCACAGATGC +TCTGAGATCTGGAGTCCAGAGACAGGAGCGGAGGCTTCTCCTGGTGACCACTCTGCTTAA +AAACTTCATCAGATCCGTAGTTTCAGAGCCCCCCTGAACCCCATCCCTTACCTCTACCAG +TTGCAGGTGGGTCTCTGGGGTGGGGCTGCCCTCCCCACCAGCACCCCAAGGGCTAAAAGG +TTGAGGGGAGAACACCATCATTTGTACAGGGGGATCCTGGAAGATGAGGCCTGAGAAAGC +CCTGCGGGGCCCCTCACCTTCTCCCTAGCTGTGGCCAAGAGTGTCTGGCCTTGCCTGCCT +CAGGACCAGCCCAAAGTGGAGGTGAGAGGTGAGCCCCAGCCCCCAGGGGAAGGGTGATGG +TGGTCTTGGTCTCAGCATGGTTCTGGTAGAGGTGGGTTATTTTGAAGATGATGAACCTTA +AGCCTCTTTCTGATCTTGCTTTAAATAAATACTTCTGAACAACAGCAACAACAGAATAGT +GTTGATAGGAAAGCCCTCCACTCCACCAGAACCACGCGGCCTTCTCGTCCTCCCCTCCTC +CACTTCCTTCCTAAGTCACTGCTCCATGAGCTCTTCCACAGGAGATTTACAAAATAGAAC +ACAAACAATCCAGTTCCCGCCTCTCACTCTGAACTCCTCCCAAGACTCGTGGGGTGCGGC 
+AGCCCCTGGGAACACCCAGCCCTTCAAGGTCAAACACAGCCCCCGCCCCTCACTCTGGGG +TACCCTGCCAGAATAAGCCCCGACAGCCATGTGGAGCAGAGCCTTCTTTTTTGTAAGTGG +AAGTTCCAGGCTGGCTTTTCAAATCCCCTTTTAACCTCAGTGCTGTATTTCAAAATTCAT +TCCAGTTTTCCTGTAGTAATTAACAAAAATAAATATTTTAATTTCAATTAAAGTGAGGGT +CTCGGAGAAGAAGCAGGAACTGAGTTTCCTGAGAGGCCCCGCTGAGGCTTTGTTGATATT +TCTTCCTGCGACCTCTGCTCGGACCCTGGGAGCTCACAGGCCGTATCGCAGCTCTTATCT +TTGGGGACCAGTTAAAGCATAACTGCGCCAGGCACAGAGTTGTCCTTTCAAATGTGCCGG +CAGTGGGACGGAGACCCATGCGTCAAGTCTCCTCTAAGTTCACATGGGATTCTCTCCTTG +TCCCAAAGCTGTCTCTGACTTAAAACCCTCCAACTGATTACCTGAATTCCAGAATATGTC +CTGTGCTCTCTGCCCTTTCCCACGCCTTTGGTGAAGACCGGTGTTCTGAGGAAACAGACA +CTGTGTAGAAATGGCTCAGGTCCTTTAAAGCCCTGGTGTGAGGAGTGGGGAAGGGCTGGG +CCAGAGGTCAGCTGGATTTGTTAGATTGACAGAGTGACGCGGACTTCCCCAGAGGCACGG +GACCAAGGTGCATGCTCACGCTGTCTCATGCTCTCACACATAATGTGTGTGTGTGTGTGT +GTGTATATATATATACACATATACATATATATATATACACACATATGCATATATATAAAA +CCCCAAGCAGCCTCTGGCTTAGCAGGTGCATTTCCCAGCAGGGCAATTAAAGCCATGGTC +CCAGTAGTGGTCTTGGGGTCTCAGGGTATTTGGTCTGTGCAGCCACATGCTTCAGTCTCT +GGACCCCAGGTCATCTAACGAGGTGGTCGTGTGGGGACTGGGATAGAAAAGGTGTCTGCA +CGGACGTGTGTGAAAGGGCTGGCACATCGCCAGTGCTCAGCACTGTCAGCTGCTATCACC +AGTCATTCAATCATTCATTCATTCAGTTGTTCATTCTTCAACAGGCCGTTTTAAAAATGT +GCCCAGTATACCAAAATCTCCGCTAAGCATTTAAAGAGGCAGAATGAAAGTTAGCAGTGG +TGGTGAAACGAAGCTGGGAATGTGCTCTGAGGGCCTCCTTGTGGGCTTAATGAATATGTA +GAAACCACGCATTTTAAATAGAGAGGGAGAAAGGGAGAGGTTCCTGGTCCTCTGCATGGG +GACTTGTGTGTGGCTCTTTACTGTAGGCCTGTGCCACTCCTGCTCAACAGCTACCACAGA +GGACGCCTTCAACAAATGTGAAGAACGAACAAAAGGTACAAATGTGAAGAACGAACAGGG +TAGAAAGAAAGGAGAAAGCAAGGGTGAGGGTGAGAAATCAAGGGACAGAGAAGAGAGAAG +AGGAGATAGCCTGGGAGTTCACACAGCCAAGAAGGTAGACACTCAGTTGAACCAGCAAGA +GGCTGAGCCTAACTCTCCCTTTCGAATGGGCAGGAGTTCATGATATTTAATAAACAGAGG +CCTTGCTCTGTAAGAGACAGGGTACCAGGCAGAGAGCAAGTCAGCATCGCAGGAGTCAAA +CGAGGCAGACAGCGGGGGCAGGGAGCTTGCCTCTGAAGGAGACCCAGGCTGCCAGAGTAG +CAGGGAGTCTGGGCCAGTCCTCTTTTGGGAAGCGCTTCCTCGGCTTCTGCCCCCCCTCTC +CTCTCCCTTTCCACCCACCATCCTGACATAATACTTCCTAATCTGGAAGTGTTGTCCAGA +GAAGAACCTGCTCATTTCCTCTTAAGTAGGCAGGGAAGCACTAACGTCCAGCAGCATCGG 
+AAACCCGTAGGAGCGCTCTCGGCAGTGCAGGGTGAGGGGACAGTCCATGTAGTCATGAGA +CGTGGGTGTCAGGCAAGCGTCTCTTTTCCAAAAGAGAAAAACATTAAAGGCCTCACAAAC +GGCGCCCAAAGACTAATTCTGCATAGCATCTTTGCGAGACCCTAGGTTCTTATGATGACT +GGTTTTGCCTGAGAAAGAAAAAATTTTAATTTTGCTCTGACATGCCAATTCAACAAATCA +TTTTCACATAATATTCATGCAAAAAAAAAACAATTTGCCAGAAAACTTGGGAATCCATCC +ACATCTACAGCTTTTCCCTGCAGTCACACTACAGTGGGATCCCTCCATACAGGAGCGGCA +GAGTGGAGCAGGCTAGAGATGCCTGTTTGTTTCTGTTTGCTGCACCGCAGCAAGCATTTC +TGTCGTGCCCACTCTGTACTAGAAAGTACATGAACATCAGCCATAAAGGGAACTAGAAAG +GTGGCCCACCCTCTTGGTGGAGAGAGAAGAGAGTGTGGTAGAAACAATAATAAGAAGTCT +GCAGAACTTGACCCCTCCCAGCCTCTCCCACCTGCCAGCCTGGCCCTTGCAGAGAGATGC +AGGCTGCCATTCTTAGGCCAAAGCCTGGGACAGTTGGGCTCAGCAAGGTAGGCATCCGTC +AAGCAAGGAGGAGCAGGGGTCAGCAGTGACCCCAGCAGCCAGCAGGGAGAAAGGTGCATG +TGACAAGGACACCAGAGGCCGTGGGTCAGGATCAGCCAGGGTCAGGGTAGCATTTCTAGG +AATTCACTCTGTTGGGCGCTGTGCTGGCTGCTTCTCACATATTATTCCTTTCTTACTCTC +AGAGCAGAGATTTCAATTGCAGCGAGATTGTGGAGGCAGCCAGGGAGGTGGGGAGGGTGG +TGTCTTCTAAAAGCATTTTCAGTATCCATGTGGTTTCAGTAATAATAATAATAATAAACC +AGTGAAAAGTAAAACAGGACAAAAATCTTCATAGGCAGTGAACCATATCAGAGAGTCCAA +GAAAGCACAATGAGAGTGTGGCTTAAAAACCCTGAACGACATTCCTTTGCACCAGCTTGG +TGAGGAGGGCATGGTCCCCGCCACCCCCCACCCCCACTTTGCAGATAAACCACATGCAGG +AAGGTCAGCCTGGCAAGTCCAGTAAGTTCAAGCCCAGGTCTCAACTGGGCAGCAGAGCTC +CTGCTCTTCTTTGTCCTCATATACGAGCACCTCTGGACTTAAAACTTGAGGAACTGGATG +GAGAAAAGTTAATGGTCAGCAGCGGGTTACATCTTCTTTCATGCGCCTTTCCATTCTTTG +GATCAGTAGTCACTAACGTTCGCCAGCCATAAGTCCTCGACGTGGAGAGGCTCAGAGCCT +GGCATGAACATGACCCTGAATTCGGATGCAGAGCTTCTTCCCATGATGATCTGTCCCTCA +CAGCAGGGTCTTCTCTGTTTCAGGGCATGAACTACTTGGAGGACCGTCGCTTGGTGCACC +GCGACCTGGCAGCCAGGAACGTACTGGTGAAAACACCGCAGCATGTCAAGATCACAGATT +TTGGGCTGGCCAAACTGCTGGGTGCGGAAGAGAAAGAATACCATGCAGAAGGAGGCAAAG +TAAGGAGGTGGCTTTAGGTCAGCCAGCATTTTCCTGACACCAGGGACCAGGCTGCCTTCC +CACTAGCTGTATTGTTTAACACATGCAGGGGAGGATGCTCTCCAGACATTCTGGGTGAGC +TCGCAGCAGCTGCTGCTGGCAGCTGGGTCCAGCCAGGGTCTCCTGGTAGTGTGAGCCAGA +GCTGCTTTGGGAACAGTACTTGCTGGGACAGTGAATGAGGATGTTATCCCCAGGTGATCA +TTAGCAAATGTTAGGTTTCAGTCTCTCCCTGCAGGATATATAAGTCCCCTTCAATAGCGC 
+AATTGGGAAAGGTCACAGCTGCCTTGGTGGTCCACTGCTGTCAAGGACACCTAAGGAACA +GGAAAGGCCCCATGCGGACCCGAGCTCCCAGGGCTGTCTGTGGCTCGTGGCTGGGACAGG +CAGCAATGGAGTCCTTCTCTCCCTTCACTGGCTCGGTTTCTCTTAGGGACCCTCACAGCA +CTAAGGGGTGCGCGTCCCCTGTCAGGCCCTCGAATGCCCTCCCACAGCCAGGCCCCTCTG +AGGTTTCACTCTGGCCTGCTTGGCTCCTAGCAGCCACCAACCCATGATGCTGGGCCCTGA +AAACACACGCAGACCTGGATGAGTGAGGCCACTGGGCACAACCAGGGCTCCCAGCTCACC +AGAGCAGCCTGGGACACAGAGGGTGCTCAGAAACCTACCAGAGCAGCCCTGAACTCCGTC +AGACTGAAATCCCCTGTTGCCGGGAGGAGGCGCCGGGCCTGGGGGACGGGTCCTGGGGTG +ATCTGGCTCGTCTGTGTGTGTCACTCGTAATTAGGTCCAGAGTGAGTTAACTTTTTCCAA +CAGAGGGAAACTAATAGTTGTCTCACTGCCTCATCTCTCACCATCCCAAGGTGCCTATCA +AGTGGATGGCATTGGAATCAATTTTACACAGAATCTATACCCACCAGAGTGATGTCTGGA +GCTACGGTGAGTCATAATCCTGATGCTAATGAGTTTGTACTGAGGCCAAGCTGGCTTTTA +TTGTTAGTTAATTTACATTATATCCTCTGACATGCAAGTATTTTCTTTCGAGATAATGAC +TAATGATAATGTAATCATTGCTGTCTATCTATTGTACTGAGAAAACACGGCAGAGGAAAT +CGAGTCCAGCTGCCGTCCAAAAGTCACTGGAGATTGCAATGAGCTCGTCTGGCAGGGTGG +GGGGTATGGGAGGGAAAGAGCTTAGGAAACGGCTCTCCCTGCAAAGTCCAACCAAACTTT +AACGTTAACCAAACCATTAATGTTGCCATGAATTTGAAGTGAACCAGAGGGAGGTGGCAG +AAGAAGCTTAATGGGGAATAGTTCCGGTAGAGAAATGAGGCTTAAGATGAACTACCCTGG +CCCTTATGTGTCAGAGAGAACGGCTTGACAAACACACACTGAGGATGTCTGCAGGGATAA +AAGAAGAAAGGGAGATGACCCTTGCTTCTCGCTCTCGGGAGGACCATCTGGTCCGGCCCT +GGGGATTCTCTGTTTCCTCTTCTGAATCCCAGTGTTGCCCAGCACTGGCCTGTACCCATC +CTCACGAGGGCCGCTCTCCTCACCCGGCCCTAGGTCCCTGCCCTGTCCTGAGCCTACAGG +GGCCTCCCATGTTGAGAAAGTGTTGCTGACACATTGTCTCTGACCGCTGTGCCAGGCATT +TTCTGCTGAATTACCGCACTTGGTCCTTGAATTTCACCCAGCAACTTACTGAAAGGCTGG +AACCCATGAACCTACCCCTTCACTGAGGAAAATAAGTTACCCCAGCCATCTACAGCGACA +GGAGCAAGGGAGGAGTCGCCTCACCTCTCTAGAAATGTGTATTTGAGGAGAACACTATTG +AAATGAATTTCCAAGAATAATCTAGTCAGTATTACAAAAGCAAAATTATTTGGGATATCG +TCCTTTTTTACTTAGTATTTTTTCTTTTTCCTATAGCATTATTAACTTTCTGATTTTCCA +AATACATACACATTTTTAAATTTCCTGAGTCTTTATCTCTTCTGTTAAAATGTAAGATTT +ATGATACAAAGGCAGAGATTTGTGTCCATGAATAAGTGAAGTTTGGTGTGCACCTGTGAG +CTGAGCCACCTCAATTAATGGAACAGATAAGGAAATAAAGGTCTGCTGATGCATTGTTAT +TTACAGCCATTTTCAGAATGTATCTCCTCTCCACGAGGGAACTGCAGGGTCCTGCCCCAA 
+GCCATTTATTTTGTCCTCAAGCAGCCCGCCCCTCCCACTCCAGGCACAGCCCGGTCTCCT +GCTGGTCTCCCCTCTTCCCACTTGCTCCCCCTCATCTATGCTCCAGACAGAGGCCACATA +TATTTTTTAACTTTTTTTTTTTTTTTTTTGAGACAGAGTCTTGCCCTGTCACCCAGGCTG +GAGTGCAGTGGTGCAGTCTCGGCTCACTGCAACCTCCACCTCCCGGGTTCAAGTGATTCT +CCTGCCTCAGCCTCCTGAGTAGCTGGGATTACAGGCGCACACCACCATGCCCAGCTAATT +TTTTGTATCTCTAGTTGAGACAGGGTTTCACTATGTTGGCCAGGCTGGTCTCGAACTCCT +GACCTCATGATCTGCCCGCCTCGGCCTCCCAAAGTGCATATTTTTTAACTTTATCAGACT +TTTCATTCTCTGCTCAACATCTTTCTTTGGTCCTCCAGGTATGTTCAGATAAAACCTGAG +CACCTGGCCATGACTGATGGGTTGCTGGGCCATCTGGCCCTGGCAACTCTCCCGTCCACC +AGGTCCCCCTCCCGTCACGCTCCAGGCATAGCCTGTGTGTGCCAGCGCAATGCCCACACT +CCATGCACAAGTGGAAGCCCTCTCAAAGTCAGTGGCTTAGTGCCTTGATGTGGTCACACC +CATTCTCAGGAAGTCCGTTCCCACTGAAAACATTGTGTGTTTTCAACATCATTGAGGCTG +CCACGGCAGATTATAATCACTGGCCTAGGCAGCCCACTGGAACTACCAGACCATGAGCCT +GAATTTTTTGTTTAAAAATCATATCCTGTTTTCTCTACTCTCTAGTCTCTAGTCAAGGTG +AATTATTCAATTTAATAAATTAGGGGCCTAGTGTGTTGTACCAAGGAGCTAAAAAGAGAG +AACTCGCAACACCTTCCAGCCCATTCTCCACCTAACACTGGCTATACTGGCTCTCCTCTC +TCTCGCTGTTTGTTCCAAAATCTAATAACCTGTCTTCCCACTAGAATTCATCATACATGT +TTAAAAACCTAGTTAAATAGTAGTTAAACTGACTGCATAGATCTGGAAATGAGACAGTCT +TTCTTTTACAAATCCATATAGACTATGAGTTGGGGGCAGGGGATGACACAAGAATCTATT +TTCTTGCCCCCAAACCATTGCTTTCCTTCCAATGTTAAGCTTGTATTCTGTGTATTAATT +CAGGTGGTTCCGTTTGGGAATGGCCTCTGTTACCCAGAGATGGGAGGGCCATCAGAACTC +GGGGTTGTCTGAAAAAACACTGGTTCTAAAATTATCACTGCTTTCACTTGTTTTTAACCA +TCATAGTTGTTTGATTTTGAAGGAAAAACATGAGGGTTTTTATTCTATGCTTGTTATATC +TATATTGTGGTTTCGTATTTTTTAGATTTTAGTACCTGACATTTTTTTAACTTTTATTTT +AGGTTCAGGGGTACATGTGCAGGTTTGTTATATAGGTAAATTTGTGTCATGGGGGTTTGT +TACACAGATTATTTTATCACCCAGGGATTAAGCCTAGTACCCATTAGTTATTTTTCCTGA +TCCTCTCCCTCCTCCCATCCTCCACCGTCCTATAGACCCCAGTGTGTGTTGTTCCCCTCT +AAGTGTCCATGTGTTCTCATCATTTAGCTCCCACTTATAAGTAAGAACATGCGGTATTTG +ATTTTCTGTTCCTGCATTAGTTTGCTAGGGATGATGGCCTCTAGCTCCATCCATGTTCTT +GCAAAGTACATGATCTCATTCTCTTTTGTGGCTGCCTAGTGTTCCATGGTGTATATGTAC +CACATTTTCTTTATCCAGTCTGTCATTGATGGGCATTTAGGTTGATTCCATGTCTTTGCT +ATTGTAAATAGTGCTGCAGTGAAAATACGCATGCATATGTCTTTATGGTAGAATGATTTA 
+TATTCCTTTGAGTAATGGGATTGCCGGGTCAAATGGTAGTTCTGTTTTTAGCTATCTGAG +AAATTGCCACACTCTTTTCCACAATAATTGAACTAATTTACATTCCCACCAACAGTGTAA +AAGCATTCCTTTTTCTCCACAACCTCACCAGCATGTGTTGGGATTTTTTTTTTTTTTTAC +TTTTCAATAATAGCCATCTGACTGGTATGAGATGGTATCTCAGTGTGGTTTTGATTTTTA +TTTCTTTAATGATCAGTGATGTTAAGCTCTTTTTCATATACTTGTTGGCTGCATGTATGT +CTTCTTCTAAAAAGTGTCTGCTCATGTCCTTTGCCCACTTTTTAATGGGATTGTTTAATT +TTTTCTTGTGAATTTACTTAAGTTCCTTATAGATGCTGGTTATTAGACCCTTCTCAGATT +TGTAGCTTGCAAAAATGTTCACCCATTCTGTGGGTTGTCTTCACTCTGATGATAGTTTCT +TTTGCTGTGCAGAAGATCTTCAGTTTAGTTAGATCCCATTTGTCAATTTTTGCTTTTGTT +GCAATTGCTTGATGTGTTTTCATCATGAAATCTTAGCCCATTCCTATATCCAGAATGGTA +TTACCTAGGTTGTCTTCCAGGGTTTTTATAGTTTGGGGTTTTACATTTAAGTCTTTAATC +CATGTTGAGTTTATTTTTGTGTATGGTGTAAGGAAGGAGTCCAGTTTCAATCTTCTTCAT +GGCTAGCTAGTCATCATTTATTGAGTAGGGAGTCCTTTATTCATTGCTTTTTTTTTTTTG +TCAACTTTGTCAACGATCACATGGTTGTAGGTGTGCAGCCTTATTTCTGGGCTCTCTATT +CTGTTTCATTGGTCTGTATGTCTGTTTCTGTACTAGTACCATGCTGTTTTGGTTACTGTA +TCCCTGTAGTTTAAAGTCAGGTAGCATCATGCTTCCAGCTTTGTTCTTTTTGCTTAGGAT +TGCCTTGGCAATTCAGGCTCTTTTTTGGTTCCATGTGAATTTTTAAATTGTATTTTCTAG +TTCTGTGAAGAATCTCATTGGTAGTGTGATAGGAGTAACATTGAATCTATAAAATACTTT +GGGCAGTATAGTCATTTTAATGATATTGATTCTTTCTATCCATGAGCATGGAATGTTTTT +CCATTTGTTTGTGTCATCTCTGATTTCTTTAAGCAGTGTTTTGTGGTTCTTATTGTAGAG +ATCTTTCACTTTCCTGGTTTACTGTATTTCTAGGTATTTTATTCTTTTTGTGGCAATTGT +GAATTGAATTGCATTCCTGATTTGGTTCTCAGCTTGACTGTTGTTGGCATATTGGAATGC +TAATTATTTTTGTACATTGATTTTGTACAACTGAGTCTTCACTGAAGTTGTTTATCAGCT +TAAGGGGTTTTGGGTCAAGACTATGGGGTTTTCTAGATATAGGATCATGTCATCTGCAAA +CAGAGATAGCTGTTTTCCTCTCTTCCTGTTTGGATGTCCATTATTTCTTTCTCTCACCTG +ATTTATCTGGCCAGGACTTCCAATACTATGTTAAATAGGAGTGTTGAGAGAGGGAATCCT +TGTCTTGTGTCAATTTTCAAGGGGAATGTTTTCAACTTTTGCCCATTCAATATGATGTTG +GCTGTGGGTTTGCCATAGATGGCTAATATGTTGAGGTTTGTTCTTTAAATACCTAGTTTA +TTGAGAATTTTAAACATGTTGAATTTTATTGAGAGCCTTTTCTGCATCTATTGAGATGAT +CATGTGGCTTTTGTCCTTAGTTCTGTTTGTGTGGTGAATCACATTTATTGATTTGCATAT +GTTGAACCAATCTTGCATCCCAGGGATGAAGCCGACTTGATTGTGGTGGCTTAAGCTTTT +TGATGTGCTGCTGGATTCGATTTGCCAGTATTTTGTTGAGGATTTTTATGTCTATGTTCA 
+TCAGAGATATTGGCCTGAAGTTTTCTTTTTTTGTTGTATCTCTGCCAAGCTTTGGTATCA +GGATGACATTGGCCTCATAGAATGAGTTAAGGAAGAGTCCCTCCTTCTCAATTTTTTTGG +AATAGTTTCAGTAGGAATGGTACCAGCTTTTTTTGTACATCTTGTAGAATTTGGCTATGA +ATCCATCTAGTCTTAGGCTTTGTTTTGGTTGGTAGGCTATTTATTACTGATTCAATTTTG +GAGCTCATTATTGGTCTGTTCAGGGATTCAGTTTCTTCCTGAGGTTTTTATTTTTATCAA +ATGGAACTTAAGCTTTTTCATTTCCAATTTTTTTATGATCTAAAAATGTGCAGTTTACAG +CCCTGTTCAGAATCTGCATCTTCCTCATTCTGCAGATACAGGTCCCTCAGAGCAGGTGAC +TGAGTGTGTATCCTGTCTGGAGCATAATACTTATGCTAGTAGAGTTACTGTTGTCTTTAT +TGTTAATTACCAAAGTTTACCACTTATCAGTCACTTACTACTTGCTGGGCATTGCACTAA +GCATTTCAGTTGTATTATCTTGTTGGGTCCTTACAGCAATCCTGTGAAACAGATACTGCT +ATTACCCCACTTTATAGAGAGGTAGACTGAGGCTTCCAGCATTGAAGCAAATTGCCCAAG +ACTACAGAAATGTAGGTTTCTAAACATCAAGAAACAGTAACCAGTAATGATGACTAAAGC +AAGGGATTGTGATTGTTCATTCATGATCCCACTGCCTTCTTTTCTTGCTTCATCCTCTCA +GGGGTGACTGTTTGGGAGTTGATGACCTTTGGATCCAAGCCATATGACGGAATCCCTGCC +AGCGAGATCTCCTCCATCCTGGAGAAAGGAGAACGCCTCCCTCAGCCACCCATATGTACC +ATCGATGTCTACATGATCATGGTCAAGTGTGAGTGACTGGTGGGTCTGTCCACACTGCCT +AGCTGAGCCTTGGTGGCTGCTCTTAGCCAAACAGCTGAGGCCTTTGCATCCCTGGAGAAA +TGTCATCACATTACTTAAGGCAGGCACACAAATCCAGAAACATCTGTAAATACCCCTTCA +AGCATTCTTTTAAAGACACTTCTTGACTCATTGGGCAGTATGACCTGACATTTGCCCATG +TTTGCAAGCAAATAAATAAAACTAAAGTCTTCCGCAAGCCATTACACCAAAATATTCTAT +TCGCTGAGTTACTCAATGAAATACCGAGTTGCCCTATATTTTGAAGCCTGTTACCAGAGA +GACTGAATGTTTTTAAATGCATGGCAGTGAGTAACAACATAAGGCTAATAGAGTCAACAT +TTCTGCTTTGACTTAAACCTTTTAAACCAGTGGATTTATGTGAAGTCTCTGCAGTGTGGC +ATTTAAACATTTCAATCTAAATAAGAGTGTGTAATTTGATTGATGCTATTATTCTACCAG +ATTCACGAGTGCAGTGGGCTCTGGAGGTAGCATTACATGCATGGGATGAGCATTTGCAAA +AGAAAGTTGTATAGGGAATATGACAGAGCCAAGTTAATGTAAATATTAATGCCTTTCTGA +ACTCTAGGCCACAGAGTTGATCTTTTTTAACTTCCTTGGTTTGGGCTAAGGAAGCTGTGA +TCCAGAGAAGCCACGTGATTTGTCTAAGGTCACATAGCAGTCTGGCCTAAAATAGCTTGA +TATGCTGTGGATGGAAAATAAATGTGATCCCTCAAGAGGCATGAGGATTTCCAGGCAGTA +GCCATACCTCCAAATTGTTTAATCTGGATTTAGATTGTTGGGTAGTCACATGCAGCAGCA +CAGTTAACAGTGTGTCCTCCTGTGGAAGTTGCCAGCACAGCCAGCCCTCTCACTTGCATG +CATGCCCACCAGCCTTCTCACTTGCATGCATGCCCACTGGGTATGTGCTGTACTGGAGAC 
+GCCGGGGGTAGGGGCCCAGTCCCAACCCCAAATTCTTTAAAGCCTATTTTTCTAAGTTGC +ATCTGGTTTCCTACCTGAAGGAATGCTAAGGGTGGATGTTGAGTGAGGACCTTGGTGCAG +GGCACCCTGCAGTCAGGATAGTTCATGGAGAGCAATTGTACAGACCCACACTGCTCCATC +CCCTCAGGCGTAACACAGGATGCTGACCCCAGGAAGAGTGGGCGTAGAAAAACTAGAGGG +CATTATTGTTATTCTGATTCAAATGTACAGTGCTGGCATGGTCTTTAAACAGTAACCAGT +ACTAGCTGGCCAAGACAGAAAAGTCTACCACAAAGACTTGGTTCTTTCATCACTTATTTG +ACTGGAAGTGTCGCATCACCAATGCCTTCTTTAAGCAATGCCATCTTTATCATTTCTTCC +AGTGTTCTAATTGCACTGTTTTTTCTCATTCCTTCCCCAGGCTGGATGATAGACGCAGAT +AGTCGCCCAAAGTTCCGTGAGTTGATCATCGAATTCTCCAAAATGGCCCGAGACCCCCAG +CGCTACCTTGTCATTCAGGTACAAATTGCAGTCTGTGCTTCCATTGGGAAGAGTCCCTCT +AATGAGCATCTCATGTCACTGTGTTCTGTCACATGCCAGCCTGGCCTCCCTGTGTCCCAG +ATCGCATTATTAAACCCTCCAGCGCATTAGAGCAAGCCTCAGTAAGGCGCAGGCCACATC +GTGAACTAAGCAGCATCCGTGAGTGGGGCCCACCCAACTCCATCTCCCCCTCCCCGTCTG +AACTCTCCTCTGGTGCTCGTCCTCACTGTCCGGCTAGCCAAAGCCTCAGCTGGGTCTAAG +AGAGAAGCATGGTCTATTGGGCTTTGGTGTCAGGCAGACGTGGCTTCACACCCCTGACTC +TCCACTTCTTCGCATCACCCAGGCAGCCGATCCACCTATCTCCTTCCATAACACAGGAAT +ACCAAAACCAAGCTCACAGGATTGTCTCAAAGATTCAATAAAATATGTTGCAAAATACGC +TCCCTAACACCTCACAGCAAGGTGCACACTCGATGAATGCTGCAGCTTCTTCCCTTTCTG +TTTCCTCAGAAGCTATTTGAATCTCATGTAGGGGCTTTCAAGCATCAAAGGATGGTTCAT +GTTTTATTTTAAGGCACCCACATCATGTCATGAGGGGAGGCAGCTATAATTTAGAGAACC +AAGGGGGATTTCATTATAACAAAATTGGCAAACACACAGGCACCTGCTGGCAATAGACCC +CTGCTCCTATAGCCAAGAAGTGGAATAGCATCTCTACGGGCCATTCTAATAGCCTCAAAA +TCTCTGCACCAGGGGGATGAAAGAATGCATTTGCCAAGTCCTACAGACTCCAACTTCTAC +CGTGCCCTGATGGATGAAGAAGACATGGACGACGTGGTGGATGCCGACGAGTACCTCATC +CCACAGCAGGGCTTCTTCAGCAGCCCCTCCACGTCACGGACTCCCCTCCTGAGCTCTCTG +GTATGAAATCTCTGTCTCTCTCTCTCTCTCAAGCTGTGTCTACTCATTTGAACAAATTGA +ATTTTAGGGAAAATAACCATCTAGTGAAACTCACATGGATATGAAGTCAATTTTAACCAA +ATGGTAAAATCAAAATCAAAATAAATTAAGTGTATTAATTATTTTGTTGCATTGCAACAA +CTTGATTGTAAGCCTTTTAGGTCCACTATGGAATGTAATTAAATCAAAACTAAACCTAGT +TGCTCTAAAACTAACGATTAAGACAAAAATTAAACACCTTCACAATATACCCTCCATGAG +GCACACCACCTGCATTCAGGAAAAGTGGATGAGATGTGGTACAAGCATTCCATGGGCAAC +TTCTCTGTTTCTTTTTCAGAGTGCAACCAGCAACAATTCCACCGTGGCTTGCATTGATAG 
+AAATGGGGTATGTATGAACACCTTATAAGCCAGAATTTACAGCTCTCCACTATGGCTCTA +TTTTACATGGAAAATGCCTTAACCTAAATAATTTTAACCCAGATAATCTTGAGTTTTCTT +CCTGTGTGGGTTTTTCCCTGCACGGCTGTCACGCCTCACAGTGCCGTTCAAAGCGTGACT +CCTGGACCAGTAGTAGCATCGCCTGGCCTTGTTAGAAACGCCATTTTTCAGGCCACTGCC +CCAGTTTGACCAAATCAGGACCTCTGGGGGTGGCACCCAGTAGTCTATGTTTGAGCCACT +TTCCAGGTGATGCTGATGTCTGTTGAAGTGTGAGGCCGTGGTCTAGACCGCACTGTGCCA +TGCAGAAACCACTAGCCACATGTGGCTACTTCAACTTAAATGTTAATGAGTTAAAATGAA +ATAAAATATAAAATTCAGTTTCTCACACATGTGAAGTGTCCAGTAGCCACACGTGGCTAG +TGGTGACCGTATTGAAGAGCACCGCTCATAGCACACCTCCCTCACTGCGGAAAGTTCTGC +TGTACAGCACCCAGCACAGCCCTGCTGCCCACCCTGCAGCCTGTGGCCCAGTAGCACCAG +CACCCACCAGGGTGCAGACTCTCAGGCCTGCCCAACCTACTAATCAGAACCAGCATCTCA +AGGAGATCTCGGGTGATTTTTGCAAACACTGAAGTTGGGGCAGCCCTGACCGGAGTAACC +TTCCCTCATTTCCTCCTGCAGCTGCAAAGCTGTCCCATCAAGGAAGACAGCTTCTTGCAG +CGATACAGCTCAGACCCCACAGGCGCCTTGACTGAGGACAGCATAGACGACACCTTCCTC +CCAGTGCCTGGTGAGTGGCTTGTCTGGAAACAGTCCTGCTCCTCAACCTCCTCGACCCAC +TCAGCAGCAGCCAGTCTCCAGTGTCCAAGCCAGGTGCTCCCTCCAGCATCTCCAGAGGGG +GAAACAGTGGCAGATTTGCAGACACAGTGAAGGGCGTAAGGAGCAGATAAACACATGACC +GAGCCTGCACAAGCTCTTTGTTGTGTCTGGTTGTTTGCTGTACCTCTGTTGTAAGAATGA +ATCTGCAAAATTTCTAGCTTATGAAGCAAATCACGGACATACACATCTGTGTGTGTGAGT +GTTCATGATGTGTGTACATCTGTGTATGTGTGTGTGTGTATGTGTGTGTTTGTGACAGAT +TTGATCCCTGTTCTCTCTGCTGGCTCTATCTTGACCTGTGAAACGTATATTTAACTAATT +AAATATTAGTTAATATTAATAAATTTTAAGCTTTATCCAGA diff --git a/test/norm.right-align.gff b/test/norm.right-align.gff new file mode 100644 index 000000000..4b2b426a0 --- /dev/null +++ b/test/norm.right-align.gff @@ -0,0 +1,4 @@ +7 ensembl_havana gene 100 29201 . + . ID=gene:ENSG00000146648;Name=EGFR;biotype=protein_coding;description=epidermal growth factor receptor [Source:HGNC Symbol%3BAcc:3236];gene_id=ENSG00000146648;logic_name=ensembl_havana_gene;version=11 +7 ensembl_havana mRNA 100 29201 . + . 
ID=transcript:ENST00000455089;Parent=gene:ENSG00000146648;Name=EGFR-004;biotype=protein_coding;havana_transcript=OTTHUMT00000343056;havana_version=1;tag=basic;transcript_id=ENST00000455089;version=1 +7 havana gene 5875 15059 . - . ID=gene:ENSG00000224057;Name=EGFR-AS1;biotype=antisense;description=EGFR antisense RNA 1 [Source:HGNC Symbol%3BAcc:40207];gene_id=ENSG00000224057;logic_name=havana;version=1 +7 havana transcript 5875 15059 . - . ID=transcript:ENST00000442411;Parent=gene:ENSG00000224057;Name=EGFR-AS1-001;biotype=antisense;havana_transcript=OTTHUMT00000343091;havana_version=1;tag=basic;transcript_id=ENST00000442411;version=1 diff --git a/test/norm.right-align.vcf b/test/norm.right-align.vcf new file mode 100644 index 000000000..d8dd93f6c --- /dev/null +++ b/test/norm.right-align.vcf @@ -0,0 +1,8 @@ +##fileformat=VCFv4.2 +##contig= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +7 897 . GGAATTAAGA G . . . +7 910 . G C . . . diff --git a/test/norm.symbolic.1.out b/test/norm.symbolic.1.out new file mode 100644 index 000000000..cec427798 --- /dev/null +++ b/test/norm.symbolic.1.out @@ -0,0 +1,11 @@ +##fileformat=VCFv4.2 +##FILTER= +##contig= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +20 15 . TAC T . . ORI=20|24|ACA|A +20 15 . TAC . . END=17;SVTYPE=DEL;ORI=20|24|A| +20 93 . CAAA C . . ORI=20|98|AAAA|A +20 93 . CAAA . . END=96;SVTYPE=DEL;ORI=20|98|A| diff --git a/test/norm.symbolic.fa b/test/norm.symbolic.fa new file mode 100644 index 000000000..12bf50588 --- /dev/null +++ b/test/norm.symbolic.fa @@ -0,0 +1,3 @@ +>20 +AGGATGGGGCTCATTACACACACACACCTTGTCTCCAGAATCACTGGTGAGGAAGGGGAG +TGCAGCCTGGGAGACAGAGCAAGACTCCATCTCAAAAAAAAAAAAAAAAAAAAAGGCCAT diff --git a/test/norm.symbolic.vcf b/test/norm.symbolic.vcf new file mode 100644 index 000000000..2dc375186 --- /dev/null +++ b/test/norm.symbolic.vcf @@ -0,0 +1,9 @@ +##fileformat=VCFv4.2 +##contig= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +20 24 . ACA A . . . 
+20 24 . A . . END=26;SVTYPE=DEL +20 98 . AAAA A . . . +20 98 . A . . END=101;SVTYPE=DEL diff --git a/test/query.95.out b/test/query.95.out new file mode 100644 index 000000000..ec5476a91 --- /dev/null +++ b/test/query.95.out @@ -0,0 +1,3 @@ +#[1]CHROM [2]POS [3]SAMPLE [4]DP [5]GT +4 3258449 C 1 1/1 +4 3258449 D 0 0/0 diff --git a/test/query.96.out b/test/query.96.out new file mode 100644 index 000000000..585c684bc --- /dev/null +++ b/test/query.96.out @@ -0,0 +1,2 @@ +#[1]CHROM [2]POS [3]SAMPLE [4]DP [5]GT[6]CHROM [7]POS [8]SAMPLE [9]DP [10]GT +4 3258449 C 1 1/14 3258449 D 0 0/0 diff --git a/test/query.97.out b/test/query.97.out new file mode 100644 index 000000000..c409c224f --- /dev/null +++ b/test/query.97.out @@ -0,0 +1,2 @@ +#[1]CHROM [2]POS [3]SAMPLE [4]DP [5]GT [6]SAMPLE [7]DP [8]GT +4 3258449 C 1 1/1 D 0 0/0 diff --git a/test/query.98.out b/test/query.98.out new file mode 100644 index 000000000..b53f5968c --- /dev/null +++ b/test/query.98.out @@ -0,0 +1,2 @@ +#[1]CHROM [2]POS [3]SAMPLE [4]SAMPLE [5]DP [6]DP [7]GT [8]GT +4 3258449 C D 1 0 1/1 0/0 diff --git a/test/query.header.vcf b/test/query.header.vcf new file mode 100644 index 000000000..5328daad1 --- /dev/null +++ b/test/query.header.vcf @@ -0,0 +1,7 @@ +##fileformat=VCFv4.2 +##FORMAT= +##FORMAT= +##contig= +##reference=ref.fa +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT C D +4 3258449 . A C . . . 
GT:DP 1/1:1 0/0:0 diff --git a/test/reheader.3.fai b/test/reheader.3.fai new file mode 100644 index 000000000..25234c1af --- /dev/null +++ b/test/reheader.3.fai @@ -0,0 +1,12 @@ +1 2364278061 6 2364278061 2364278062 +2 2317450362 7159592787 2317450362 2317450363 +3 2291775479 9477043156 2291775479 2291775480 +4 2192534405 11768818642 2192534405 2192534406 +5 2148190925 13961353054 2148190925 2148190926 +6 2107674557 16109543986 2107674557 2107674558 +7 2082167746 18217218550 2082167746 2082167747 +8 2081484518 20299386303 2081484518 2081484519 +9 2024734096 22380870828 2024734096 2024734097 +10 1752849333 2364278075 1752849333 1752849334 +11 1650012615 4117127416 1650012615 1650012616 +12 1392452741 5767140039 1392452741 1392452742 diff --git a/test/reheader.3.vcf b/test/reheader.3.vcf new file mode 100644 index 000000000..7f7a4ce83 --- /dev/null +++ b/test/reheader.3.vcf @@ -0,0 +1,4 @@ +##fileformat=VCFv4.3 +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 22 . A G . . . diff --git a/test/reheader.6.out b/test/reheader.6.out new file mode 100644 index 000000000..9cb406cd7 --- /dev/null +++ b/test/reheader.6.out @@ -0,0 +1,16 @@ +##fileformat=VCFv4.3 +##FILTER= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 22 . A G . . . 
diff --git a/test/split-vep.26.out b/test/split-vep.26.out new file mode 100644 index 000000000..9eadaff25 --- /dev/null +++ b/test/split-vep.26.out @@ -0,0 +1,21 @@ +860416 +860417 +860431 +860461 +860498 +860504 +860511 +860521 +860564 +860604 +860607 +860637 +860684 +860688 +861172 +861177 +861178 +861196 +861209 +861211 +861213 diff --git a/test/split-vep.filter.1.out b/test/split-vep.filter.1.out new file mode 100644 index 000000000..a0c86c26e --- /dev/null +++ b/test/split-vep.filter.1.out @@ -0,0 +1 @@ +41334153 5_prime_utr_variant&nmd_transcript_variant ENST00000478993 nonsense_mediated_decay diff --git a/test/split-vep.filter.2.out b/test/split-vep.filter.2.out new file mode 100644 index 000000000..135f6c0ec --- /dev/null +++ b/test/split-vep.filter.2.out @@ -0,0 +1 @@ +41334153 5_prime_utr_variant&nmd_transcript_variant ENST00000478993 nonsense_mediated_decay A|5_prime_UTR_variant&NMD_transcript_variant|MODIFIER|DDX3X|ENSG00000215301|Transcript|ENST00000478993|nonsense_mediated_decay|1/19||ENST00000478993.5:c.-100G>A||756||||||1||1||SNV|HGNC|HGNC:2745|||1||CCDS43931.1|ENSP00000478443|O00571.228||UPI000013CB6D|O00571-1|1|||||||||||||||||||||||||||||||||||loss/donor/41334153-41334154/Medium/1.787081|0|0|2|uAUG_gained_CapDistanceToStart:755&uAUG_gained_DistanceToCDS:100&uAUG_gained_DistanceToStop:27&uAUG_gained_KozakContext:GCGATGC&uAUG_gained_KozakStrength:Moderate&uAUG_gained_type:uORF|uAUG_gained|15.55|1.471728||||||-27|11|-27|-25|0.01|0.00|0.00|0.00|DDX3X diff --git a/test/split-vep.filter.vcf b/test/split-vep.filter.vcf new file mode 100644 index 000000000..2a058801d --- /dev/null +++ b/test/split-vep.filter.vcf @@ -0,0 +1,7 @@ +##fileformat=VCFv4.2 +##contig= +##VEP="v102" time="2020-12-14 19:43:49" cache="/opt/vep/.vep/homo_sapiens/102_GRCh38" ensembl-variation=102.2716d2e ensembl-io=102.ff1cf96 ensembl-funcgen=102.6bd93a0 ensembl=102.347f9ed 1000genomes="phase3" COSMIC="91" ClinVar="202006" ESP="V2-SSA137" HGMD-PUBLIC="20194" assembly="GRCh38.p13" 
dbSNP="153" gencode="GENCODE 36" genebuild="2014-07" gnomAD="r2.1" polyphen="2.2.2" regbuild="1.0" sift="sift5.2.2" +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +chrX 41334153 . G A . . CSQ=A|5_prime_UTR_variant&NMD_transcript_variant|MODIFIER|DDX3X|ENSG00000215301|Transcript|ENST00000478993|nonsense_mediated_decay|1/19||ENST00000478993.5:c.-100G>A||756||||||1||1||SNV|HGNC|HGNC:2745|||1||CCDS43931.1|ENSP00000478443|O00571.228||UPI000013CB6D|O00571-1|1|||||||||||||||||||||||||||||||||||loss/donor/41334153-41334154/Medium/1.787081|0|0|2|uAUG_gained_CapDistanceToStart:755&uAUG_gained_DistanceToCDS:100&uAUG_gained_DistanceToStop:27&uAUG_gained_KozakContext:GCGATGC&uAUG_gained_KozakStrength:Moderate&uAUG_gained_type:uORF|uAUG_gained|15.55|1.471728||||||-27|11|-27|-25|0.01|0.00|0.00|0.00|DDX3X +chrX 41334585 . GGACGCGCATGCGC G . . CSQ=-|intron_variant|MODIFIER|DDX3X|ENSG00000215301|Transcript|ENST00000457138|protein_coding||1/15|ENST00000457138.7:c.45+291_45+303del||||||||1||1||deletion|HGNC|HGNC:2745|||2||CCDS55404.1|ENSP00000392494|O00571.228||UPI00017A8655|O00571-2|1||||||||||||||||||||||||||||||||||||||||||||||||||||||||,-|intron_variant&NMD_transcript_variant|MODIFIER|DDX3X|ENSG00000215301|Transcript|ENST00000478993|nonsense_mediated_decay||1/18|ENST00000478993.5:c.45+291_45+303del||||||||1||1||deletion|HGNC|HGNC:2745|||1||CCDS43931.1|ENSP00000478443|O00571.228||UPI000013CB6D|O00571-1|1|||||||||||||||||||||||||||||||||||||||||||||||||||||||| diff --git a/test/split-vep.mixed-list.txt b/test/split-vep.mixed-list.txt new file mode 100644 index 000000000..aafe13f1b --- /dev/null +++ b/test/split-vep.mixed-list.txt @@ -0,0 +1,3 @@ +ENST00000344229 +ENST00000317578 +RPL10 diff --git a/test/stats.counts.2.chk b/test/stats.counts.2.chk index e86630791..812772c90 100644 --- a/test/stats.counts.2.chk +++ b/test/stats.counts.2.chk @@ -26,7 +26,7 @@ ST 0 T>A 0 ST 0 T>C 0 ST 0 T>G 0 PSC 0 A 4 0 0 0 0 0 0.0 0 0 0 1 -PSC 0 B 0 1 2 2 1 0 0.0 0 2 0 0 +PSC 0 B 0 1 2 2 2 0 0.0 0 
2 0 0 PSC 0 C 0 3 0 2 1 0 0.0 0 0 1 1 PSI 0 A 0 0 0 0.00 0 0 0 0 PSI 0 B 0 0 0 0.00 0 0 0 0 diff --git a/test/stats.counts.chk b/test/stats.counts.chk index ee798a03b..348c81e1d 100644 --- a/test/stats.counts.chk +++ b/test/stats.counts.chk @@ -27,11 +27,11 @@ ST 0 T>A 0 ST 0 T>C 0 ST 0 T>G 0 PSC 0 A 11 0 0 0 0 0 0.0 0 0 0 1 -PSC 0 B 1 1 4 3 1 1 0.0 0 2 0 0 +PSC 0 B 1 1 4 3 2 1 0.0 0 2 0 0 PSC 0 C 1 5 0 3 1 0 0.0 0 0 1 1 PSI 0 A 0 0 0 0.00 0 0 0 0 PSI 0 B 0 0 0 0.00 1 0 0 0 PSI 0 C 0 0 0 0.00 0 0 0 0 -HWE 0 0.000000 3 0.000000 0.000000 0.000000 +HWE 0 0.000000 2 0.000000 0.000000 0.000000 HWE 0 0.330000 1 0.000000 0.000000 0.000000 HWE 0 0.490000 7 0.330000 0.330000 0.330000 diff --git a/test/stats.vaf.1.chk b/test/stats.vaf.1.chk new file mode 100644 index 000000000..4310c8566 --- /dev/null +++ b/test/stats.vaf.1.chk @@ -0,0 +1,45 @@ +SN 0 number of samples: 2 +SN 0 number of records: 10 +SN 0 number of no-ALTs: 0 +SN 0 number of SNPs: 9 +SN 0 number of MNPs: 0 +SN 0 number of indels: 1 +SN 0 number of others: 0 +SN 0 number of multiallelic sites: 3 +SN 0 number of multiallelic SNP sites: 2 +TSTV 0 1 10 0.10 0 9 0.00 +SiS 0 1 4 1 3 2 0 0 2 +AF 0 0.000000 4 1 3 3 0 0 3 +AF 0 0.490000 7 0 7 0 0 0 0 +QUAL 0 . 9 0 9 1 +IDD 0 -5 1 0 . 
+IDD 0 -4 1 1 0.43 +IDD 0 -3 1 1 0.29 +ST 0 A>C 0 +ST 0 A>G 0 +ST 0 A>T 0 +ST 0 C>A 8 +ST 0 C>G 1 +ST 0 C>T 1 +ST 0 G>A 0 +ST 0 G>C 0 +ST 0 G>T 1 +ST 0 T>A 0 +ST 0 T>C 0 +ST 0 T>G 0 +DP 0 1 8 40.000000 0 0.000000 +DP 0 2 3 15.000000 0 0.000000 +DP 0 3 2 10.000000 0 0.000000 +DP 0 5 2 10.000000 0 0.000000 +DP 0 6 2 10.000000 0 0.000000 +DP 0 7 1 5.000000 0 0.000000 +DP 0 8 1 5.000000 0 0.000000 +DP 0 10 1 5.000000 0 0.000000 +PSC 0 a 0 0 9 0 10 1 3.7 2 0 0 0 +PSC 0 b 2 0 8 1 7 0 3.0 0 0 0 0 +PSI 0 a 0 0 0 0.00 0 1 0 0 +PSI 0 b 0 0 0 0.00 0 0 0 0 +HWE 0 0.000000 3 0.000000 0.000000 0.990000 +HWE 0 0.490000 7 0.990000 0.990000 0.990000 +VAF 0 a 0,0,0,1,2,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,4 0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0 +VAF 0 b 0,0,0,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,4 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/test/stats.vaf.vcf b/test/stats.vaf.vcf new file mode 100644 index 000000000..fb3c1c431 --- /dev/null +++ b/test/stats.vaf.vcf @@ -0,0 +1,16 @@ +##fileformat=VCFv4.2 +##reference=ref.fa +##contig= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT a b +chr1 603697 . C A,T . . . GT:AD 0/1:0,1,0 0/2:0,0,1 +chr1 847172 . C A . . . GT:AD 0/1:1,1 0/1:1,1 +chr1 895916 . C A,G . . . GT:AD 1/2:8,2,0 0/0:8,0,0 +chr1 940526 . CTTTTT C,CTT,CT . . . GT:AD 2/3:2,0,2,3 0/0:2,0,0,0 +chr1 1054587 . C A . . . GT:AD 0/1:5,1 0/1:5,1 +chr1 1123455 . C A . . . GT:AD 0/1:0,1 0/1:0,1 +chr1 1130454 . C A . . . GT:AD 0/1:4,1 0/1:4,1 +chr1 1145661 . G T . . . GT:AD 0/1:0,1 0/1:0,1 +chr1 1198538 . C A . . . GT:AD 0/1:0,1 0/1:0,1 +chr1 1349818 . C A . . . 
GT:AD 0/1:2,1 0/1:2,1 diff --git a/test/test.pl b/test/test.pl index 37cb1a8d3..e2178a33b 100755 --- a/test/test.pl +++ b/test/test.pl @@ -48,6 +48,7 @@ run_test(\&test_vcf_stats,$opts,in=>['stats.a','stats.b'],out=>'stats.B.chk',args=>'-s B'); run_test(\&test_vcf_stats,$opts,in=>['stats.counts'],out=>'stats.counts.chk',args=>'-s -'); run_test(\&test_vcf_stats,$opts,in=>['stats.counts'],out=>'stats.counts.2.chk',args=>q[-s - -i 'type="snp"']); +run_test(\&test_vcf_stats,$opts,in=>['stats.vaf'],out=>'stats.vaf.1.chk',args=>q[-s -]); run_test(\&test_vcf_isec,$opts,in=>['isec.a','isec.b'],out=>'isec.ab.out',args=>'-n =2'); run_test(\&test_vcf_isec,$opts,in=>['isec.a','isec.b'],out=>'isec.ab.flt.out',args=>'-n =2 -i"STRLEN(REF)==2"'); run_test(\&test_vcf_isec,$opts,in=>['isec.a','isec.b'],out=>'isec.ab.both.out',args=>'-n =2 -c both'); @@ -90,8 +91,6 @@ run_test(\&test_vcf_merge,$opts,in=>['merge.gvcf.10.a','merge.gvcf.10.b'],out=>'merge.gvcf.10.2.out',args=>'-m none'); run_test(\&test_vcf_merge,$opts,in=>['merge.gvcf.10.a','merge.gvcf.10.b'],out=>'merge.gvcf.10.3.out',args=>'-g {PATH}/merge.gvcf.10.fa'); run_test(\&test_vcf_merge,$opts,in=>['merge.gvcf.10.a','merge.gvcf.10.b'],out=>'merge.gvcf.10.4.out',args=>'-g {PATH}/merge.gvcf.10.fa -m none'); -run_test(\&test_vcf_merge,$opts,in=>['merge.gvcf.10.b','merge.gvcf.10.a'],out=>'merge.gvcf.10.5.out',args=>'-g {PATH}/merge.gvcf.10.fa'); -run_test(\&test_vcf_merge,$opts,in=>['merge.gvcf.10.b','merge.gvcf.10.a'],out=>'merge.gvcf.10.4.out',args=>'-g {PATH}/merge.gvcf.10.fa -m none'); run_test(\&test_vcf_merge,$opts,in=>['merge.noidx.a','merge.noidx.b','merge.noidx.c'],out=>'merge.noidx.abc.out',args=>''); run_test(\&test_vcf_merge,$opts,in=>['merge.noidx.a','merge.noidx.b','merge.noidx.c'],out=>'merge.noidx.abc.out',args=>'--no-index',noidx=>1); run_test(\&test_vcf_merge,$opts,in=>['merge.8.a','merge.8.b'],out=>'merge.8.out',args=>''); @@ -101,6 +100,11 @@ 
run_test(\&test_vcf_merge,$opts,in=>['merge.10.a','merge.10.b'],out=>'merge.10.1.out',args=>'-m none'); run_test(\&test_vcf_merge,$opts,in=>['merge.10.a','merge.10.b'],out=>'merge.10.2.out',args=>'-m both'); run_test(\&test_vcf_merge,$opts,in=>['merge.10.a','merge.10.b'],out=>'merge.10.3.out',args=>'-m snp-ins-del'); +run_test(\&test_vcf_merge,$opts,in=>['merge.mrules.1.a','merge.mrules.1.b'],out=>'merge.mrules.1.1.out',args=>'--gvcf -'); +run_test(\&test_vcf_merge,$opts,in=>['merge.mrules.1.a','merge.mrules.1.b'],out=>'merge.mrules.1.2.out',args=>'--gvcf - -M AD:.,PL:.'); +run_test(\&test_vcf_merge,$opts,in=>['merge.gvcf.5.a','merge.gvcf.5.b'],out=>'merge.gvcf.5.1.out',args=>'--gvcf -'); +run_test(\&test_vcf_merge,$opts,in=>['merge.gvcf.5.a','merge.gvcf.5.b'],out=>'merge.gvcf.5.1.out',args=>'--gvcf - --merge none'); +run_test(\&test_vcf_merge,$opts,in=>['merge.gvcf.11.a','merge.gvcf.11.b','merge.gvcf.11.c'],out=>'merge.gvcf.11.1.out',args=>'--gvcf -'); # run_test(\&test_vcf_merge_big,$opts,in=>'merge_big.1',out=>'merge_big.1.1',nsmpl=>79000,nfiles=>79,nalts=>486,args=>''); # commented out for speed run_test(\&test_vcf_query,$opts,in=>'query.string',out=>'query.string.1.out',args=>q[-f '%CHROM\\t%POS\\t%CLNREVSTAT\\n' -i'CLNREVSTAT="criteria_provided,_conflicting_interpretations"']); run_test(\&test_vcf_query,$opts,in=>'query.string',out=>'query.string.1.out',args=>q[-f '%CHROM\\t%POS\\t%CLNREVSTAT\\n' -i'CLNREVSTAT="criteria_provided" || CLNREVSTAT="_conflicting_interpretations"']); @@ -233,6 +237,13 @@ run_test(\&test_vcf_query,$opts,in=>'filter.12',out=>'query.89.out',args=>q[-i'FILTER~"A;B"' -f'%FILTER\\n']); run_test(\&test_vcf_query,$opts,in=>'filter.12',out=>'query.90.out',args=>q[-i'FILTER!~"A;B"' -f'%FILTER\\n']); run_test(\&test_vcf_query,$opts,in=>'filter.10',out=>'query.91.out',args=>q[-i'DP%10==2' -f'[ %DP]\\n']); +run_test(\&test_vcf_query,$opts,in=>'query.header',out=>'query.95.out',args=>q[-H -f'[%CHROM %POS %SAMPLE %DP %GT\\n]']); 
+run_test(\&test_vcf_query,$opts,in=>'query.header',out=>'query.95.out',args=>q[-H -f'[%CHROM %POS %SAMPLE %DP %GT]']); +run_test(\&test_vcf_query,$opts,in=>'query.header',out=>'query.96.out',args=>q[-H -f'[%CHROM %POS %SAMPLE %DP %GT]\\n']); +run_test(\&test_vcf_query,$opts,in=>'query.header',out=>'query.97.out',args=>q[-H -f'%CHROM %POS[ %SAMPLE %DP %GT]\\n']); +run_test(\&test_vcf_query,$opts,in=>'query.header',out=>'query.97.out',args=>q[-H -f'%CHROM %POS[ %SAMPLE %DP %GT]']); +run_test(\&test_vcf_query,$opts,in=>'query.header',out=>'query.98.out',args=>q[-H -f'%CHROM %POS[ %SAMPLE][ %DP][ %GT]\\n']); +run_test(\&test_vcf_query,$opts,in=>'query.header',out=>'query.98.out',args=>q[-H -f'%CHROM %POS[ %SAMPLE][ %DP][ %GT]']); run_test(\&test_vcf_norm,$opts,in=>'norm',out=>'norm.out',fai=>'norm',args=>'-cx'); run_test(\&test_vcf_norm,$opts,in=>'norm.split',out=>'norm.split.out',args=>'-m-'); run_test(\&test_vcf_norm,$opts,in=>'norm.split.2',out=>'norm.split.2.out',args=>'-m-'); @@ -272,6 +283,24 @@ run_test(\&test_vcf_norm,$opts,in=>'norm.5',out=>'norm.5.1.out',args=>'-m - --multi-overlaps 0'); run_test(\&test_vcf_norm,$opts,in=>'norm.5',out=>'norm.5.2.out',args=>'-m - --multi-overlaps .'); run_test(\&test_vcf_norm,$opts,in=>'norm.m-any',out=>'norm.m-any.1.out',args=>'-m -any'); +run_test(\&test_vcf_norm,$opts,in=>'norm.phased-split',out=>'norm.phased-split.1.out',args=>'-m -any'); +run_test(\&test_vcf_norm,$opts,in=>'norm.phased-join',out=>'norm.phased-join.1.out',args=>'-m +any'); +run_test(\&test_vcf_norm,$opts,in=>'norm.symbolic',fai=>'norm.symbolic',out=>'norm.symbolic.1.out',args=>'--old-rec-tag ORI'); +run_test(\&test_vcf_norm,$opts,in=>'norm.right-align',fai=>'norm.right-align',out=>'norm.right-align.1.out',args=>'--old-rec-tag ORI'); +run_test(\&test_vcf_norm,$opts,in=>'norm.right-align',fai=>'norm.right-align',out=>'norm.right-align.2.out',args=>'--old-rec-tag ORI -g {PATH}/norm.right-align.gff'); 
+run_test(\&test_vcf_view,$opts,in=>'weird-chr-names',out=>'weird-chr-names.1.out',args=>'',reg=>'-r 1'); +run_test(\&test_vcf_view,$opts,in=>'weird-chr-names',out=>'weird-chr-names.1.out',args=>'',reg=>'-r 1:1-2'); +run_test(\&test_vcf_view,$opts,in=>'weird-chr-names',out=>'weird-chr-names.1.out',args=>'',reg=>'-r 1:1,1:2'); +run_test(\&test_vcf_view,$opts,in=>'weird-chr-names',out=>'weird-chr-names.2.out',args=>'',reg=>'-r 1:1-1'); +run_test(\&test_vcf_view,$opts,in=>'weird-chr-names',out=>'weird-chr-names.3.out',args=>'',reg=>'-r {1:1}'); +run_test(\&test_vcf_view,$opts,in=>'weird-chr-names',out=>'weird-chr-names.3.out',args=>'',reg=>'-r {1:1}:1-2'); +run_test(\&test_vcf_view,$opts,in=>'weird-chr-names',out=>'weird-chr-names.3.out',args=>'',reg=>'-r {1:1}:1,{1:1}:2'); +run_test(\&test_vcf_view,$opts,in=>'weird-chr-names',out=>'weird-chr-names.4.out',args=>'',reg=>'-r {1:1}:1-1'); +run_test(\&test_vcf_view,$opts,in=>'weird-chr-names',out=>'weird-chr-names.5.out',args=>'',reg=>'-r {1:1-1}'); +run_test(\&test_vcf_view,$opts,in=>'weird-chr-names',out=>'weird-chr-names.5.out',args=>'',reg=>'-r {1:1-1}:1-2'); +run_test(\&test_vcf_view,$opts,in=>'weird-chr-names',out=>'weird-chr-names.5.out',args=>'',reg=>'-r {1:1-1}:1,{1:1-1}:2'); +run_test(\&test_vcf_view,$opts,in=>'weird-chr-names',out=>'weird-chr-names.6.out',args=>'',reg=>'-r {1:1-1}:1-1'); +run_test(\&test_vcf_view,$opts,in=>'weird-chr-names',args=>'',reg=>'-r {1:1-1}-2',expected_failure=>1); run_test(\&test_vcf_view,$opts,in=>'view',out=>'view.1.out',args=>'-aUc1 -C1 -s NA00002 -v snps',reg=>''); run_test(\&test_vcf_view,$opts,in=>'view',out=>'view.2.out',args=>'-f PASS -Xks NA00003',reg=>'-r20,Y'); run_test(\&test_vcf_view,$opts,in=>'view',out=>'view.3.out',args=>'-xs NA00003',reg=>''); @@ -498,7 +527,7 @@ run_test(\&test_vcf_annotate,$opts,in=>'annotate.missing',tab=>'annotate.missing',out=>'annotate.missing.5.out',args=>'-c CHROM,POS,REF,ALT,.=TSTR,.=TFLT,.=TINT'); 
run_test(\&test_vcf_annotate,$opts,in=>'annotate.missing',tab=>'annotate.missing',out=>'annotate.missing.6.out',args=>'-c CHROM,POS,REF,ALT,=TSTR,=TFLT,=TINT'); run_test(\&test_vcf_annotate,$opts,in=>'annotate.olap',tab=>'annots.olap',out=>'annotate.olap.1.out',args=>'-c CHROM,BEG,END,DB -l DB:unique'); -run_test(\&test_vcf_annotate,$opts,in=>'annotate.olap',tab=>'annots.olap',out=>'annotate.olap.2.out',args=>'-c CHROM,BEG,END,DB -l DB:unique --min-overlap 0.4:0.5'); +run_test(\&test_vcf_annotate,$opts,in=>'annotate.olap',tab=>'annots.olap',out=>'annotate.olap.2.out',args=>'-c CHROM,BEG,END,DB -l DB:unique --min-overlap 0.4:0.5 -m XXX'); run_test(\&test_vcf_annotate,$opts,in=>'annotate.id',vcf=>'annots.id',out=>'annotate.id.1.out',args=>'-c ALT'); run_test(\&test_vcf_annotate,$opts,in=>'annotate.id',vcf=>'annots.id',out=>'annotate.id.2.out',args=>'-c +ALT'); run_test(\&test_vcf_annotate,$opts,in=>'annotate.id.2',vcf=>'annots.id.2',out=>'annotate.id.2.1.out',args=>'--pair-logic some -c +ID'); @@ -508,6 +537,8 @@ run_test(\&test_vcf_annotate,$opts,in=>'annotate28',tab=>'annots28',out=>'annotate28.2.out',args=>'-c CHROM,POS,REF,ALT,FMT/TEST -s smpl2,smpl1'); run_test(\&test_vcf_annotate,$opts,in=>'annotate28',tab=>'annots28',out=>'annotate28.3.out',args=>'-c CHROM,POS,REF,ALT,FMT/TEST -s smpl1'); run_test(\&test_vcf_annotate,$opts,in=>'annotate28',tab=>'annots28',out=>'annotate28.4.out',args=>'-c CHROM,POS,REF,ALT,FMT/TEST -s smpl2'); +run_test(\&test_vcf_annotate,$opts,in=>'annotate',out=>'annotate.33.out',args=>'-m XXX'); +run_test(\&test_vcf_annotate,$opts,in=>'annotate34',tab=>'annots34',out=>'annotate34.out',args=>q[-c CHROM,FROM,TO,INFO/END -H '##INFO=']); run_test(\&test_vcf_plugin,$opts,in=>'checkploidy',out=>'checkploidy.out',cmd=>'+check-ploidy --no-version'); run_test(\&test_vcf_plugin,$opts,in=>'checkploidy.2',out=>'checkploidy.2.out',cmd=>'+check-ploidy --no-version'); 
run_test(\&test_vcf_plugin,$opts,in=>'checkploidy.2',out=>'checkploidy.3.out',cmd=>'+check-ploidy --no-version',args=>'-- -m'); @@ -588,11 +619,6 @@ run_test(\&test_vcf_plugin,$opts,in=>'view',out=>'view.GTsubset.NA1.out',cmd=>'+GTsubset --no-version',args=>'-- -s NA00001'); run_test(\&test_vcf_plugin,$opts,in=>'view',out=>'view.GTsubset.NA1NA2.out',cmd=>'+GTsubset --no-version',args=>'-- -s NA00001,NA00002'); run_test(\&test_vcf_plugin,$opts,in=>'view',out=>'view.GTsubset.NA1NA2NA3.out',cmd=>'+GTsubset --no-version',args=>'-- -s NA00001,NA00002,NA00003'); -run_test(\&test_vcf_plugin,$opts,in=>'mendelian',out=>'mendelian.1.out',cmd=>'+mendelian',args=>'-t mom1,dad1,child1 -md'); -run_test(\&test_vcf_plugin,$opts,in=>'mendelian',out=>'mendelian.2.out',cmd=>'+mendelian',args=>'-t mom1,dad1,child1 -m+'); -run_test(\&test_vcf_plugin,$opts,in=>'mendelian',out=>'mendelian.3.out',cmd=>'+mendelian',args=>'-t mom1,dad1,child1 -mx'); -run_test(\&test_vcf_plugin,$opts,in=>'mendelian',out=>'mendelian.4.out',cmd=>'+mendelian',args=>'-t mom1,dad1,child1 -ma'); -run_test(\&test_vcf_plugin,$opts,in=>'mendelian',out=>'mendelian.5.out',cmd=>'+mendelian',args=>'-t mom1,dad1,child1 -mu'); run_test(\&test_vcf_plugin,$opts,in=>'mendelian',out=>'mendelian.1.out',cmd=>'+mendelian2',args=>'-p child1,dad1,mom1 -md'); run_test(\&test_vcf_plugin,$opts,in=>'mendelian',out=>'mendelian.6.out',cmd=>'+mendelian2',args=>'-p child1,dad1,mom1 -mg'); run_test(\&test_vcf_plugin,$opts,in=>'mendelian',out=>'mendelian.3.out',cmd=>'+mendelian2',args=>'-p child1,dad1,mom1 -me'); @@ -658,9 +684,14 @@ run_test(\&test_vcf_plugin,$opts,in=>'split-vep.10',out=>'split-vep.25.out',cmd=>'+split-vep',args=>qq[-a CSQ -f '%xM_CAP_pred %xM_CAP_score\\n' -p x | grep -v ^#]); run_test(\&test_vcf_plugin,$opts,in=>'split-vep.gene-list',out=>'split-vep.gene-list.1.out',cmd=>'+split-vep',args=>qq[-d -f '%CHROM:%POS %Gene %Consequence\\n']); 
run_test(\&test_vcf_plugin,$opts,in=>'split-vep.gene-list',out=>'split-vep.gene-list.2.out',cmd=>'+split-vep',args=>qq[-d -f '%CHROM:%POS %Gene %Consequence\\n' -g {PATH}/split-vep.gene-list.txt]); +run_test(\&test_vcf_plugin,$opts,in=>'split-vep.gene-list',out=>'split-vep.gene-list.2.out',cmd=>'+split-vep',args=>qq[-d -f '%CHROM:%POS %Gene %Consequence\\n' -g {PATH}/split-vep.mixed-list.txt --gene-list-fields Feature,SYMBOL]); run_test(\&test_vcf_plugin,$opts,in=>'split-vep.gene-list',out=>'split-vep.gene-list.3.out',cmd=>'+split-vep',args=>qq[-d -f '%CHROM:%POS %Gene %Consequence\\n' -g +{PATH}/split-vep.gene-list.txt]); +run_test(\&test_vcf_plugin,$opts,in=>'split-vep.gene-list',out=>'split-vep.gene-list.3.out',cmd=>'+split-vep',args=>qq[-d -f '%CHROM:%POS %Gene %Consequence\\n' -g +{PATH}/split-vep.mixed-list.txt --gene-list-fields Feature,SYMBOL]); run_test(\&test_vcf_plugin,$opts,in=>'split-vep.broken-LoF',out=>'split-vep.broken-LoF.out',cmd=>'+split-vep',args=>qq[-d -f '%CHROM:%POS %Consequence %LoF_info\\n' -a vep]); run_test(\&test_vcf_plugin,$opts,in=>'split-vep.broken-LoF',out=>'split-vep.broken-LoF.2.out',cmd=>'+split-vep',args=>qq[-d -f '%CHROM:%POS %LoF_info\\n' -a vep -i 'Consequence=="frameshift_variant"']); +run_test(\&test_vcf_plugin,$opts,in=>'split-vep',out=>'split-vep.26.out',cmd=>'+split-vep',args=>qq[-f'%POS\\n' -i'SYMBOL~"SAMD11"']); +run_test(\&test_vcf_plugin,$opts,in=>'split-vep.filter',out=>'split-vep.filter.1.out',cmd=>'+split-vep',args=>qq[-s worst -i'CSQ~"nonsense"' -f '%POS %Consequence %Feature %BIOTYPE']); +run_test(\&test_vcf_plugin,$opts,in=>'split-vep.filter',out=>'split-vep.filter.2.out',cmd=>'+split-vep',args=>qq[-s worst -i'CSQ~"nonsense"' -f '%POS %Consequence %Feature %BIOTYPE %CSQ']); run_test(\&test_vcf_plugin,$opts,in=>'parental-origin',out=>'parental-origin.1.out',cmd=>'+parental-origin',args=>qq[-r 20:100 -p proband,father,mother -t del | grep -v ^#]); 
run_test(\&test_vcf_plugin,$opts,in=>'parental-origin',out=>'parental-origin.2.out',cmd=>'+parental-origin',args=>qq[-r 20:101 -p proband,father,mother -t del | grep -v ^#]); run_test(\&test_vcf_plugin,$opts,in=>'parental-origin',out=>'parental-origin.3.out',cmd=>'+parental-origin',args=>qq[-r 20:102 -p proband,father,mother -t del | grep -v ^#]); @@ -704,6 +735,8 @@ run_test(\&test_vcf_concat,$opts,in=>['concat.5.a','concat.5.b','concat.5.c'],out=>'concat.5.1.out',do_bcf=>0,args=>'-l --ligate-warn'); run_test(\&test_vcf_concat,$opts,in=>['concat.5.a','concat.5.b','concat.5.c'],out=>'concat.5.1.out',do_bcf=>1,args=>'-l --ligate-warn'); run_test(\&test_vcf_concat,$opts,in=>['concat.5.a','concat.5.b','concat.5.c'],out=>'concat.5.2.out',do_bcf=>1,args=>'-l --ligate-force'); +run_test(\&test_vcf_concat,$opts,in=>['concat.5.a','concat.5.b','concat.5.c'],out=>'concat.5.3.out',do_bcf=>0,args=>'-G -a -D'); +run_test(\&test_vcf_concat,$opts,in=>['concat.5.a','concat.5.b','concat.5.c'],out=>'concat.5.3.out',do_bcf=>1,args=>'-G -a -D'); run_test(\&test_vcf_reheader,$opts,in=>'reheader',out=>'reheader.1.out',header=>'reheader.hdr'); run_test(\&test_vcf_reheader,$opts,in=>'reheader',out=>'reheader.2.out',samples=>'reheader.samples'); run_test(\&test_vcf_reheader,$opts,in=>'reheader',out=>'reheader.2.out',samples=>'reheader.samples2'); @@ -712,6 +745,7 @@ run_test(\&test_vcf_reheader,$opts,in=>'empty',out=>'reheader.empty.out',header=>'reheader.empty.hdr'); run_test(\&test_vcf_reheader,$opts,in=>'reheader.2',out=>'reheader.5.out',args=>'-f {PATH}/reheader.fai',nostdin=>1); run_test(\&test_vcf_reheader,$opts,in=>'reheader.2',out=>'reheader.5.out',args=>'-h {PATH}/reheader.2.hdr -f {PATH}/reheader.fai',nostdin=>1); +run_test(\&test_vcf_reheader,$opts,in=>'reheader.3',out=>'reheader.6.out',args=>'-f {PATH}/reheader.3.fai',nostdin=>1); run_test(\&test_rename_chrs,$opts,in=>'annotate'); run_test(\&test_vcf_convert,$opts,in=>'convert',out=>'convert.gs.gt.gen',args=>'-g -,.'); 
run_test(\&test_vcf_convert,$opts,in=>'convert',out=>'convert.gs.gt.ids.gen',args=>'-g -,. --vcf-ids'); @@ -743,6 +777,7 @@ run_test(\&test_vcf_convert_hs2vcf,$opts,h=>'convert.hs.gt.ids.hap',s=>'convert.hs.gt.samples',out=>'convert.gt.noHead.ids.vcf',args=>'--vcf-ids --hapsample2vcf'); run_test(\&test_vcf_convert_gvcf,$opts,in=>'convert.gvcf',out=>'convert.gvcf.out',fa=>'gvcf.fa',args=>'--gvcf2vcf -i\'FILTER="PASS"\''); run_test(\&test_vcf_convert_tsv2vcf,$opts,in=>'convert.23andme',out=>'convert.23andme.vcf',args=>'-c ID,CHROM,POS,AA -s SAMPLE1',fai=>'23andme'); +run_test(\&test_vcf_convert_tsv2vcf,$opts,in=>'convert.tsv',out=>'convert.tsv.vcf',args=>'-c -,CHROM,POS,REF,ALT',fai=>'23andme'); run_test(\&test_vcf_consensus,$opts,in=>'consensus',out=>'consensus.1.out',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-s -'); run_test(\&test_vcf_consensus_chain,$opts,in=>'consensus',out=>'consensus.1.chain',chain=>'consensus.1.chain',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-s -'); run_test(\&test_vcf_consensus,$opts,in=>'consensus',out=>'consensus.2.out',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-H 1'); @@ -761,9 +796,13 @@ run_test(\&test_vcf_consensus,$opts,in=>'consensus5',out=>'consensus5.out',fa=>'consensus5.fa',args=>'--haplotype LA'); run_test(\&test_vcf_consensus,$opts,in=>'consensus6',out=>'consensus6.out',fa=>'consensus6.fa',args=>'-s -'); run_test(\&test_vcf_consensus,$opts,in=>'consensus7',out=>'consensus7a.out',fa=>'consensus7.fa',args=>'-H 2'); +run_test(\&test_vcf_consensus,$opts,in=>'consensus7',out=>'consensus7a.out',fa=>'consensus7.fa',args=>'-H 4'); run_test(\&test_vcf_consensus,$opts,in=>'consensus7',out=>'consensus7b.out',fa=>'consensus7.fa',args=>'-H 2pIu'); +run_test(\&test_vcf_consensus,$opts,in=>'consensus7',out=>'consensus7b.out',fa=>'consensus7.fa',args=>'-H 4pIu'); run_test(\&test_vcf_consensus,$opts,in=>'consensus7',out=>'consensus7c.out',fa=>'consensus7.fa',args=>'-H 1'); 
+run_test(\&test_vcf_consensus,$opts,in=>'consensus7',out=>'consensus7c.out',fa=>'consensus7.fa',args=>'-H 3'); run_test(\&test_vcf_consensus,$opts,in=>'consensus7',out=>'consensus7d.out',fa=>'consensus7.fa',args=>'-H 1pIu'); +run_test(\&test_vcf_consensus,$opts,in=>'consensus7',out=>'consensus7d.out',fa=>'consensus7.fa',args=>'-H 3pIu'); run_test(\&test_vcf_consensus,$opts,in=>'consensus8',out=>'consensus.8a.out',fa=>'consensus.fa',args=>'-s -'); run_test(\&test_vcf_consensus,$opts,in=>'consensus8',out=>'consensus.8b.out',fa=>'consensus.fa',args=>'-s - -a .'); run_test(\&test_vcf_consensus,$opts,in=>'consensus8',out=>'consensus.8c.out',fa=>'consensus.fa',args=>q[-s - -a . -i 'type="snp" || type="ref"']); @@ -780,6 +819,7 @@ run_test(\&test_vcf_consensus,$opts,in=>'consensus.13',out=>'consensus.13.out',fa=>'consensus.13.fa',args=>'-s -'); run_test(\&test_vcf_consensus,$opts,in=>'consensus.14',out=>'consensus.14.out',fa=>'consensus.14.fa',args=>'-s -'); run_test(\&test_vcf_consensus,$opts,in=>'consensus.12',out=>'consensus.15.out',fa=>'consensus.12.fa',args=>'-s - --mark-del - --mark-ins uc --mark-snv uc'); +run_test(\&test_vcf_consensus,$opts,in=>'consensus.12',out=>'consensus.19.out',fa=>'consensus.12.fa',args=>'-s - --mark-del - --mark-ins + --mark-snv :'); run_test(\&test_vcf_consensus,$opts,in=>'consensus.15',out=>'consensus.17.out',fa=>'consensus.15.fa',args=>'-H I --mark-ins lc --mark-snv lc'); run_test(\&test_vcf_consensus,$opts,in=>'consensus.16',out=>'consensus.18.out',fa=>'consensus.fa',args=>'-s - -I'); run_test(\&test_vcf_consensus,$opts,in=>'consensus.16',out=>'consensus.18.out',fa=>'consensus.fa',args=>'-H I'); @@ -790,6 +830,7 @@ run_test(\&test_vcf_consensus,$opts,in=>'consensus.20',out=>'consensus20.2.out',fa=>'consensus.20.fa',args=>''); run_test(\&test_vcf_consensus,$opts,in=>'consensus.20',out=>'consensus20.3.out',fa=>'consensus.20.fa',args=>'-M . 
-s b'); run_test(\&test_vcf_consensus,$opts,in=>'consensus.20',out=>'consensus20.4.out',fa=>'consensus.20.fa',args=>'-M . -s a'); +run_test(\&test_vcf_consensus,$opts,in=>'consensus.21',out=>'consensus21.1.out',fa=>'consensus.21.fa',args=>''); run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.1.out',args=>q[-r17:100-150],test_list=>1); run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.2.out',args=>q[-a DP,DV -r17:100-600]); # test files from samtools mpileup test suite run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1)],out=>'mpileup/mpileup.3.out',args=>q[-B --ff 0x14 -r17:1050-1060]); # test file converted to vcf from samtools mpileup test suite @@ -824,6 +865,14 @@ run_test(\&test_csq,$opts,in=>'csq',out=>'csq.1.out',cmd=>'-f {PATH}/csq.fa -g {PATH}/csq.chr.gff3'); run_test(\&test_csq,$opts,in=>'csq.2',out=>'csq.2.out',cmd=>'-f {PATH}/csq.fa -g {PATH}/csq.2.gff',tbcsq=>1); run_test(\&test_csq,$opts,in=>'csq.2',out=>'csq.3.out',cmd=>'-f {PATH}/csq.fa -g {PATH}/csq.2.gff --ncsq 64',tbcsq=>1); +run_test(\&test_csq,$opts,in=>'csq.nchr',out=>'csq.chr.out',cmd=>'-f {PATH}/csq.nchr.fa -g {PATH}/csq.nchr.gff',tbcsq=>1); +run_test(\&test_csq,$opts,in=>'csq.nchr',out=>'csq.chr.out',cmd=>'-f {PATH}/csq.ychr.fa -g {PATH}/csq.nchr.gff',tbcsq=>1); +run_test(\&test_csq,$opts,in=>'csq.nchr',out=>'csq.chr.out',cmd=>'-f {PATH}/csq.nchr.fa -g {PATH}/csq.ychr.gff',tbcsq=>1); +run_test(\&test_csq,$opts,in=>'csq.nchr',out=>'csq.chr.out',cmd=>'-f {PATH}/csq.ychr.fa -g {PATH}/csq.ychr.gff',tbcsq=>1); +run_test(\&test_csq,$opts,in=>'csq.ychr',out=>'csq.chr.out',cmd=>'-f {PATH}/csq.ychr.fa -g {PATH}/csq.ychr.gff',tbcsq=>1); +run_test(\&test_csq,$opts,in=>'csq.ychr',out=>'csq.chr.out',cmd=>'-f {PATH}/csq.ychr.fa -g {PATH}/csq.nchr.gff',tbcsq=>1); +run_test(\&test_csq,$opts,in=>'csq.ychr',out=>'csq.chr.out',cmd=>'-f {PATH}/csq.nchr.fa -g {PATH}/csq.ychr.gff',tbcsq=>1); 
+run_test(\&test_csq,$opts,in=>'csq.ychr',out=>'csq.chr.out',cmd=>'-f {PATH}/csq.nchr.fa -g {PATH}/csq.nchr.gff',tbcsq=>1); run_test(\&test_csq_real,$opts,in=>'csq'); run_test(\&test_roh,$opts,in=>'roh.1',out=>'roh.1.1.out',args=>q[-Or -G30 --AF-dflt 0.4]); run_test(\&test_roh,$opts,in=>'roh.1',out=>'roh.1.1.out',args=>q[-Or -G30 --AF-file {PATH}/roh.1.tab.gz]); @@ -955,11 +1004,12 @@ sub run_test my %args = @args; my $run = 0; if ( exists($$opts{run_function}{$name}) ) { $run = 1; } - if ( !$run && exists($args{cmd}) ) + if ( !$run ) { for my $func (keys %{$$opts{run_function}}) { - if ( $args{cmd}=~/$func/ ) { $run = 1; last; } + if ( exists($args{cmd}) && $args{cmd}=~/$func/ ) { $run = 1; last; } + if ( $name=~/$func/ ) { $run = 1; last; } } } if ( !$run ) { return; } @@ -1041,7 +1091,13 @@ sub test_cmd my ($ret,$out,$err) = _cmd3("$args{cmd}"); if ( length($err) ) { $err =~ s/\n/\n\t\t/gs; $err = "\n\n\t\t$err\n"; } - if ( $ret ) { failed($opts,$test,"Non-zero status $ret$err"); return; } + if ( $ret && !$args{expected_failure} ) { failed($opts,$test,"Non-zero status $ret$err"); return; } + if ( $args{expected_failure} ) + { + if ( !$ret ) { failed($opts,$test,"Expected failure but the test returned $ret$err"); } + else { passed($opts,$test,"ok, expected non-zero status"); } + return; + } if ( $$opts{redo_outputs} && -e "$$opts{path}/$args{out}" ) { rename("$$opts{path}/$args{out}","$$opts{path}/$args{out}.old"); @@ -1113,9 +1169,10 @@ sub failed } sub passed { - my ($opts,$test) = @_; + my ($opts,$test,$reason) = @_; $$opts{nok}++; - print ".. ok\n\n"; + if ( !defined $reason ) { $reason = 'ok'; } + print ".. 
$reason\n\n"; } sub is_file_newer { @@ -1144,6 +1201,18 @@ sub bgzip_tabix_vcf my ($opts,$file) = @_; bgzip_tabix($opts,file=>$file,suffix=>'vcf',args=>'-p vcf'); } +sub bgzip_index_bcf +{ + my ($opts,$file) = @_; + if ( !-e "$$opts{tmp}/$file.bcf" or is_file_newer("$$opts{path}/$file.vcf","$$opts{tmp}/$file.bcf") ) + { + cmd("$$opts{bin}/bcftools view -Ob $$opts{path}/$file.vcf -o $$opts{tmp}/$file.bcf"); + } + if ( !-e "$$opts{tmp}/$file.bcf.csi" or is_file_newer("$$opts{tmp}/$file.bcf","$$opts{tmp}/$file.bcf.csi") ) + { + cmd("$$opts{bin}/bcftools index -f $$opts{tmp}/$file.bcf"); + } +} # The tests -------------------------- @@ -1278,7 +1347,7 @@ sub test_vcf_merge $args =~ s/{PATH}/$$opts{path}/g; my $files = join(' ',@files); test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools merge --no-version $args $files", exp_fix=>1); - test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools merge -Ob $args $files | $$opts{bin}/bcftools view | grep -v ^##bcftools_", exp_fix => 1); + test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools merge --no-version -Ob $args $files | $$opts{bin}/bcftools view --no-version | grep -v ^##bcftools_", exp_fix => 1); } } sub test_vcf_isec @@ -1362,6 +1431,7 @@ sub test_vcf_norm my ($opts,%args) = @_; bgzip_tabix_vcf($opts,$args{in}); my $params = ''; + $args{args} =~ s/{PATH}/$$opts{path}/g; if ( exists($args{args}) ) { $params .= " $args{args}"; } if ( exists($args{fai} ) ) { $params .= " -f $$opts{path}/$args{fai}.fa"; } test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools norm --no-version $params $$opts{tmp}/$args{in}.vcf.gz",exp_fix=>1); @@ -1725,10 +1795,12 @@ sub test_vcf_consensus { my ($opts,%args) = @_; bgzip_tabix_vcf($opts,$args{in}); + bgzip_index_bcf($opts,$args{in}); $args{args} =~ s/{PATH}/$$opts{path}/g; my $mask = $args{mask} ? "-m $$opts{path}/$args{mask}" : ''; my $chain = $args{chain} ? 
"-c $$opts{tmp}/$args{chain}" : ''; test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools consensus $$opts{tmp}/$args{in}.vcf.gz -f $$opts{path}/$args{fa} $args{args} $mask $chain"); + test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools consensus $$opts{tmp}/$args{in}.bcf -f $$opts{path}/$args{fa} $args{args} $mask $chain"); } sub test_vcf_consensus_chain { diff --git a/test/weird-chr-names.1.out b/test/weird-chr-names.1.out new file mode 100644 index 000000000..5705c7575 --- /dev/null +++ b/test/weird-chr-names.1.out @@ -0,0 +1,9 @@ +##fileformat=VCFv4.3 +##FILTER= +##reference=ref.fa +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 1 . C T . . . +1 2 . C T . . . diff --git a/test/weird-chr-names.2.out b/test/weird-chr-names.2.out new file mode 100644 index 000000000..980818a71 --- /dev/null +++ b/test/weird-chr-names.2.out @@ -0,0 +1,8 @@ +##fileformat=VCFv4.3 +##FILTER= +##reference=ref.fa +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 1 . C T . . . diff --git a/test/weird-chr-names.3.out b/test/weird-chr-names.3.out new file mode 100644 index 000000000..5b3ac8e18 --- /dev/null +++ b/test/weird-chr-names.3.out @@ -0,0 +1,9 @@ +##fileformat=VCFv4.3 +##FILTER= +##reference=ref.fa +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1:1 1 . C T . . . +1:1 2 . C T . . . diff --git a/test/weird-chr-names.4.out b/test/weird-chr-names.4.out new file mode 100644 index 000000000..0d9e274ab --- /dev/null +++ b/test/weird-chr-names.4.out @@ -0,0 +1,8 @@ +##fileformat=VCFv4.3 +##FILTER= +##reference=ref.fa +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1:1 1 . C T . . . diff --git a/test/weird-chr-names.5.out b/test/weird-chr-names.5.out new file mode 100644 index 000000000..6cb41e14f --- /dev/null +++ b/test/weird-chr-names.5.out @@ -0,0 +1,9 @@ +##fileformat=VCFv4.3 +##FILTER= +##reference=ref.fa +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1:1-1 1 . 
C T . . . +1:1-1 2 . C T . . . diff --git a/test/weird-chr-names.6.out b/test/weird-chr-names.6.out new file mode 100644 index 000000000..a707ed85c --- /dev/null +++ b/test/weird-chr-names.6.out @@ -0,0 +1,8 @@ +##fileformat=VCFv4.3 +##FILTER= +##reference=ref.fa +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1:1-1 1 . C T . . . diff --git a/test/weird-chr-names.vcf b/test/weird-chr-names.vcf new file mode 100644 index 000000000..c367be477 --- /dev/null +++ b/test/weird-chr-names.vcf @@ -0,0 +1,12 @@ +##fileformat=VCFv4.3 +##reference=ref.fa +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 1 . C T . . . +1 2 . C T . . . +1:1 1 . C T . . . +1:1 2 . C T . . . +1:1-1 1 . C T . . . +1:1-1 2 . C T . . . diff --git a/tsv2vcf.c b/tsv2vcf.c index 596e75a0a..22dec3065 100644 --- a/tsv2vcf.c +++ b/tsv2vcf.c @@ -10,10 +10,10 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE diff --git a/variantkey.h b/variantkey.h index ccd4d8dd0..a74935fb7 100644 --- a/variantkey.h +++ b/variantkey.h @@ -3,14 +3,15 @@ // variantkey.h // // @category Libraries -// @author Nicola Asuni -// @copyright 2017-2018 GENOMICS plc -// @license MIT (see LICENSE) -// @link https://github.com/genomicsplc/variantkey +// @author Nicola Asuni +// @link https://github.com/tecnickcom/variantkey +// @license MIT [LICENSE](https://raw.githubusercontent.com/tecnickcom/variantkey/main/LICENSE) +// @copyright 2017-2018 GENOMICS plc, 2018-2023 Nicola Asuni - Tecnick.com // // LICENSE // // Copyright (c) 2017-2018 GENOMICS plc +// Copyright (c) 2018-2023 Nicola Asuni - Tecnick.com // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -54,6 +55,7 @@ #define VKMASK_REFALT 0x000000007FFFFFFF //!< VariantKey binary mask for REF+ALT [ 00000000 00000000 00000000 00000000 01111111 11111111 11111111 11111111 ] #define VKSHIFT_CHROM 59 //!< CHROM LSB position from the VariantKey LSB #define VKSHIFT_POS 31 //!< POS LSB position from the VariantKey LSB +#define MAXUINT32 0xFFFFFFFF //!< Maximum value for uint32_t /** * VariantKey struct. @@ -75,16 +77,54 @@ typedef struct vkrange_t uint64_t max; //!< Maximum VariantKey value for any given REF+ALT encoding } vkrange_t; -/** @brief Returns chromosome numerical encoding. +/** @brief Returns the encoding for a numerical chromosome input. * * @param chrom Chromosome. An identifier from the reference genome, no white-space permitted. * @param size Length of the chrom string, excluding the terminating null byte. * * @return CHROM code */ +static inline uint8_t encode_numeric_chrom(const char *chrom, size_t size) +{ + size_t i; + uint8_t v = (chrom[0] - '0'); + for (i = 1; i < size; i++) + { + if ((chrom[i] > '9') || (chrom[i] < '0')) + { + return 0; // NA: a character that is not a number was found.
+ } + v = ((v * 10) + (chrom[i] - '0')); + } + return v; +} + + +/** @brief Returns a true value (1) if the input chrom has 'chr' prefix (case insensitive). + * + * @param chrom Chromosome. An identifier from the reference genome, no white-space permitted. + * @param size Length of the chrom string, excluding the terminating null byte. + * + * @return True (1) if the chr prefix is present. + */ +static inline int has_chrom_chr_prefix(const char *chrom, size_t size) +{ + return ((size > 3) + && ((chrom[0] == 'c') || (chrom[0] == 'C')) + && ((chrom[1] == 'h') || (chrom[1] == 'H')) + && ((chrom[2] == 'r') || (chrom[2] == 'R'))); +} + +/** @brief Returns chromosome numerical encoding. + * + * @param chrom Chromosome. An identifier from the reference genome, no white-space permitted. + * @param size Length of the chrom string, excluding the terminating null byte. + * + * @return CHROM code or 0 in case of invalid input. + */ static inline uint8_t encode_chrom(const char *chrom, size_t size) { - // X > 23 ; Y > 24 ; M > 25 + // X = 23; Y = 24; M = 25; any other letter is mapped to 0: static const uint8_t onecharmap[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -98,12 +138,9 @@ static inline uint8_t encode_chrom(const char *chrom, size_t size) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; - // remove "chr" prefix - if ((size > 3) - && ((chrom[0] == 'c') || (chrom[0] == 'C')) - && ((chrom[1] == 'h') || (chrom[1] == 'H')) - && ((chrom[2] == 'r') || (chrom[2] == 'R'))) + if (has_chrom_chr_prefix(chrom, size)) { + // remove "chr" prefix chrom += 3; size -= 3; } @@ -111,19 +148,9 @@ static inline uint8_t encode_chrom(const char *chrom, size_t size) { return 0; } - if ((chrom[0] <= '9') && (chrom[0] >= '0')) // Number + if ((chrom[0] <= '9') && (chrom[0] >= '0')) { - size_t 
i; - uint8_t v = (chrom[0] - '0'); - for (i = 1; i < size; i++) - { - if ((chrom[i] > '9') || (chrom[i] < '0')) - { - return 0; // NA - } - v = ((v * 10) + (chrom[i] - '0')); - } - return v; + return encode_numeric_chrom(chrom, size); } if ((size == 1) || ((size == 2) && ((chrom[1] == 'T') || (chrom[1] == 't')))) { @@ -159,10 +186,10 @@ static inline uint32_t encode_base(const uint8_t c) { /* Encode base: - A > 0 - C > 1 - G > 2 - T > 3 + A = 0 + C = 1 + G = 2 + T = 3 */ static const uint32_t map[] = { @@ -205,7 +232,7 @@ static inline uint32_t encode_refalt_rev(const char *ref, size_t sizeref, const uint8_t bitpos = 23; if ((encode_allele(&h, &bitpos, ref, sizeref) < 0) || (encode_allele(&h, &bitpos, alt, sizealt) < 0)) { - return 0; // error code + return MAXUINT32; // error code } return h; } @@ -318,7 +345,7 @@ static inline uint32_t encode_refalt(const char *ref, size_t sizeref, const char if ((sizeref + sizealt) <= 11) { uint32_t h = encode_refalt_rev(ref, sizeref, alt, sizealt); - if (h != 0) + if (h != MAXUINT32) { return h; } @@ -486,7 +513,9 @@ static inline void decode_variantkey(uint64_t code, variantkey_t *vk) vk->refalt = extract_variantkey_refalt(code); } -/** @brief Returns a 64 bit variant key based on CHROM, POS (0-based), REF, ALT. +/** + * Returns a 64 bit variant key based on CHROM, POS (0-based), REF, ALT. + * The variant should be already normalized (see normalize_variant or use normalized_variantkey). * * @param chrom Chromosome. An identifier from the reference genome, no white-space or leading zeros permitted. * @param sizechrom Length of the chrom string, excluding the terminating null byte. diff --git a/vcfannotate.c b/vcfannotate.c index 495d2b5a3..b2e39ef7b 100644 --- a/vcfannotate.c +++ b/vcfannotate.c @@ -1,6 +1,6 @@ /* vcfannotate.c -- Annotate and edit VCF/BCF files. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. 
Author: Petr Danecek @@ -118,6 +118,8 @@ typedef struct _args_t htsFile *out_fh; int output_type, n_threads, clevel; bcf_sr_regions_t *tgts; + char *index_fn; + int write_index; regidx_t *tgt_idx; // keep everything in memory only with .tab annotation file and -c BEG,END columns regitr_t *tgt_itr; @@ -2863,9 +2865,16 @@ static void init_data(args_t *args) if ( args->mark_sites ) { - if ( !args->targets_fname ) error("The -a option not given\n"); - bcf_hdr_printf(args->hdr_out,"##INFO=", - args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites); + if ( !args->targets_fname ) + { + if ( args->mark_sites_logic!=MARK_LISTED ) error("The -a option not given but -%s logic was requested\n",args->mark_sites); + fprintf(stderr,"Note: The -a option not given, all sites will be annotated with INFO/%s\n",args->mark_sites); + bcf_hdr_printf(args->hdr_out,"##INFO=", + args->mark_sites,args->mark_sites); + } + else + bcf_hdr_printf(args->hdr_out,"##INFO=", + args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites); } if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate"); @@ -2881,6 +2890,7 @@ static void init_data(args_t *args) if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: failed to write the header to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); } } @@ -2943,7 +2953,19 @@ static void destroy_data(args_t *args) convert_destroy(args->set_ids); if ( args->filter ) filter_destroy(args->filter); - if (args->out_fh) hts_close(args->out_fh); + if (args->out_fh) + { + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. 
%s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + } free(args->sample_map); free(args->merge_method_str.s); } @@ -3072,6 +3094,7 @@ static void annotate(args_t *args, bcf1_t *line) for (j=0; jncols; j++) args->cols[j].done = 0; if ( regidx_overlap(args->tgt_idx, bcf_seqname(args->hdr,line),line->pos,line->pos+line->rlen-1, args->tgt_itr) ) { + hts_pos_t vcf_end = line->pos + line->rlen - 1; while ( regitr_overlap(args->tgt_itr) ) { annot_line_t *tmp = &args->alines[0]; @@ -3082,7 +3105,7 @@ static void annotate(args_t *args, bcf1_t *line) // Check min overlap int len_ann = tmp->end - tmp->start + 1; int len_vcf = line->rlen; - int isec = (tmp->end < line->pos+line->rlen-1 ? tmp->end : line->pos+line->rlen-1) - (tmp->start > line->pos ? tmp->start : line->pos) + 1; + int isec = (tmp->end < vcf_end ? tmp->end : vcf_end) - (tmp->start > line->pos ? 
tmp->start : line->pos) + 1; assert( isec > 0 ); if ( args->min_overlap_ann && args->min_overlap_ann > (float)isec/len_ann ) continue; if ( args->min_overlap_vcf && args->min_overlap_vcf > (float)isec/len_vcf ) continue; @@ -3096,9 +3119,9 @@ static void annotate(args_t *args, bcf1_t *line) error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); if ( ret==0 ) args->cols[j].done = 1; + has_overlap = 1; } } - has_overlap = 1; } for (j=0; jncols; j++) { @@ -3273,6 +3296,8 @@ static void annotate(args_t *args, bcf1_t *line) if ( args->mark_sites ) { + if ( !args->targets_fname ) has_overlap = 1; + // ideally, we'd like to be far more general than this in future, see https://github.com/samtools/bcftools/issues/87 if ( args->mark_sites_logic==MARK_LISTED ) bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,has_overlap?1:0); @@ -3315,6 +3340,7 @@ static void usage(args_t *args) fprintf(stderr, " --single-overlaps Keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n"); fprintf(stderr, " -x, --remove LIST List of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). 
See man page for details\n"); fprintf(stderr, " --threads INT Number of extra output compression threads [0]\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Examples:\n"); fprintf(stderr, " http://samtools.github.io/bcftools/howtos/annotate.html\n"); @@ -3371,6 +3397,7 @@ int main_vcfannotate(int argc, char *argv[]) {"min-overlap",required_argument,NULL,12}, {"no-version",no_argument,NULL,8}, {"force",no_argument,NULL,'f'}, + {"write-index",no_argument,NULL,13}, {NULL,0,NULL,0} }; char *tmp; @@ -3447,6 +3474,7 @@ int main_vcfannotate(int argc, char *argv[]) case 10 : args->single_overlaps = 1; break; case 11 : args->rename_annots = optarg; break; case 12 : args->min_overlap_str = optarg; break; + case 13 : args->write_index = 1; break; case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); } diff --git a/vcfcall.c b/vcfcall.c index 1cd6f504c..d2f6e2c5f 100644 --- a/vcfcall.c +++ b/vcfcall.c @@ -1,6 +1,6 @@ /* vcfcall.c -- SNP/indel variant calling from VCF/BCF. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. 
Author: Petr Danecek @@ -97,6 +97,8 @@ typedef struct int argc; char **argv; + char *index_fn; + int write_index; // int flag, prior_type, n1, n_sub, *sublist, n_perm; // uint32_t *trio_aux; @@ -715,6 +717,7 @@ static void init_data(args_t *args) if (args->record_cmd_line) bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call"); if ( bcf_hdr_write(args->out_fh, args->aux.hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->aux.hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); if ( args->flag&CF_INS_MISSED ) init_missed_line(args); } @@ -753,6 +756,15 @@ static void destroy_data(args_t *args) free(args->str.s); if ( args->gvcf ) gvcf_destroy(args->gvcf); bcf_hdr_destroy(args->aux.hdr); + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->output_fname); bcf_sr_destroy(args->aux.srs); } @@ -908,6 +920,7 @@ static void usage(args_t *args) fprintf(stderr, " -M, --keep-masked-ref Keep sites with masked reference allele (REF=N)\n"); fprintf(stderr, " -V, --skip-variants TYPE Skip indels/snps\n"); fprintf(stderr, " -v, --variants-only Output variant sites only\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Consensus/variant calling options:\n"); fprintf(stderr, " -c, --consensus-caller The original calling method (conflicts with -m)\n"); @@ -990,6 +1003,7 @@ int main_vcfcall(int argc, char *argv[]) {"chromosome-X",no_argument,NULL,'X'}, {"chromosome-Y",no_argument,NULL,'Y'}, {"no-version",no_argument,NULL,8}, + {"write-index",no_argument,NULL,10}, {NULL,0,NULL,0} }; @@ -1076,6 +1090,7 @@ int main_vcfcall(int argc, char *argv[]) args.regions_overlap = parse_overlap_option(optarg); if ( args.regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; + case 10: args.write_index = 1; break; default: usage(&args); } } diff --git a/vcfconcat.c b/vcfconcat.c index 74fd036b8..8e25cc590 100644 --- a/vcfconcat.c +++ b/vcfconcat.c @@ -1,6 +1,6 @@ /* vcfconcat.c -- Concatenate or combine VCF/BCF files. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. 
Author: Petr Danecek @@ -46,6 +46,8 @@ typedef struct _args_t int output_type, n_threads, record_cmd_line, clevel; bcf_hdr_t *out_hdr; int *seen_seq; + char *index_fn; + int write_index; // phasing int *start_pos, start_tid, ifname; @@ -59,10 +61,21 @@ typedef struct _args_t int argc, nfnames, allow_overlaps, phased_concat, regions_is_file, regions_overlap; int compact_PS, phase_set_changed, naive_concat, naive_concat_trust_headers; int verbose, explicit_output_type, ligate_force, ligate_warn; + int sites_only; htsThreadPool *tpool; } args_t; +static bcf_hdr_t *drop_hdr_genotypes(args_t *args, bcf_hdr_t *hdr) +{ + if ( !args->sites_only ) return hdr; + bcf_hdr_t *rmme = hdr; + hdr = bcf_hdr_subset(rmme, 0, 0, 0); + bcf_hdr_remove(hdr, BCF_HL_FMT, NULL); + bcf_hdr_destroy(rmme); + return hdr; +} + static void init_data(args_t *args) { bcf1_t *line = NULL; @@ -83,6 +96,8 @@ static void init_data(args_t *args) { htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]); bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]); + hdr = drop_hdr_genotypes(args, hdr); + args->out_hdr = bcf_hdr_merge(args->out_hdr,hdr); if ( bcf_hdr_nsamples(hdr) != bcf_hdr_nsamples(args->out_hdr) ) error("Different number of samples in %s. 
Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]); @@ -142,6 +157,7 @@ static void init_data(args_t *args) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->tpool); } if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); if ( args->allow_overlaps ) { @@ -203,7 +219,16 @@ static void destroy_data(args_t *args) int i; if ( args->out_fh ) { - if ( hts_close(args->out_fh)!=0 ) error("hts_close error\n"); + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. 
%s\n",args->output_fname?args->output_fname:"stdout"); } if ( args->tpool && !args->files ) { @@ -264,7 +289,7 @@ static void phased_flush(args_t *args) bcf1_t *brec = args->buf[i+1]; int nGTs = bcf_get_genotypes(ahdr, arec, &args->GTa, &args->mGTa); - if ( nGTs < 0 ) + if ( nGTs < 0 ) { if ( !gt_absent_warned ) { @@ -359,7 +384,7 @@ static void phased_flush(args_t *args) bcf_update_format_int32(args->out_hdr,rec,"PQ",args->phase_qual,nsmpl); PQ_printed = 1; for (j=0; j<nsmpl; j++) - if ( args->phase_qual[j] < args->min_PQ ) + if ( args->phase_qual[j] < args->min_PQ ) { args->phase_set[j] = rec->pos+1; args->phase_set_changed = 1; @@ -582,13 +607,14 @@ static void concat(args_t *args) { bcf1_t *line = bcf_sr_get_line(args->files,i); if ( !line ) continue; + if ( args->sites_only ) bcf_subset(args->out_hdr, line, 0, 0); bcf_translate(args->out_hdr, args->files->readers[i].header, line); if ( bcf_write1(args->out_fh, args->out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); if ( args->remove_dups ) break; } } } - else // concatenating + else // concatenate as is { struct timeval t0, t1; kstring_t tmp = {0,0,0}; @@ -604,6 +630,13 @@ static void concat(args_t *args) htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("\nFailed to open: %s\n", args->fnames[i]); if ( args->n_threads ) hts_set_opt(fp, HTS_OPT_THREAD_POOL, args->tpool); bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("\nFailed to parse header: %s\n", args->fnames[i]); + if ( args->sites_only ) + { + bcf_hdr_t *hdr_ori = hdr; + hdr = bcf_hdr_subset(hdr_ori, 0, 0, 0); + bcf_hdr_remove(hdr, BCF_HL_FMT, NULL); + bcf_hdr_destroy(hdr_ori); + } if ( !fp->is_bin && args->output_type&FT_VCF ) { line->max_unpack = BCF_UN_STR; @@ -611,6 +644,22 @@ static void concat(args_t *args) while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 ) { char *str = fp->line.s; + + // remove genotypes + if ( args->sites_only ) + { + int ntab = 0; + while ( *str ) + { + if ( *str == '\t' && ++ntab==8 ) + { + 
*str = 0; + break; + } + str++; + } + str = fp->line.s; + } while ( *str && *str!='\t' ) str++; tmp.l = 0; kputsn(fp->line.s,str-fp->line.s,&tmp); @@ -639,6 +688,7 @@ static void concat(args_t *args) line->max_unpack = 0; while ( bcf_read(fp, hdr, line)==0 ) { + if ( args->sites_only ) bcf_subset(args->out_hdr, line, 0, 0); bcf_translate(args->out_hdr, hdr, line); if ( prev_chr_id!=line->rid ) @@ -917,6 +967,7 @@ static void usage(args_t *args) fprintf(stderr, " -d, --rm-dups STRING Output duplicate records present in multiple files only once: \n"); fprintf(stderr, " -D, --remove-duplicates Alias for -d exact\n"); fprintf(stderr, " -f, --file-list FILE Read the list of files from a file.\n"); + fprintf(stderr, " -G, --drop-genotypes Drop individual genotype information.\n"); fprintf(stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n"); fprintf(stderr, " --ligate-force Ligate even non-overlapping chunks, keep all sites\n"); fprintf(stderr, " --ligate-warn Drop sites in imperfect overlaps\n"); @@ -931,6 +982,7 @@ static void usage(args_t *args) fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); fprintf(stderr, " -v, --verbose 0|1 Set verbosity level [1]\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); exit(1); } @@ -969,10 +1021,12 @@ int main_vcfconcat(int argc, char *argv[]) {"file-list",required_argument,NULL,'f'}, {"min-PQ",required_argument,NULL,'q'}, {"no-version",no_argument,NULL,8}, + {"write-index",no_argument,NULL,13}, + {"drop-genotypes",no_argument,NULL,'G'}, {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:cnv:",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:Gr:R:cnv:",loptions,NULL)) >= 0) { switch (c) { case 'c': args->compact_PS = 1; 
break; @@ -980,7 +1034,7 @@ int main_vcfconcat(int argc, char *argv[]) case 'R': args->regions_list = optarg; args->regions_is_file = 1; break; case 'd': args->remove_dups = optarg; break; case 'D': args->remove_dups = "exact"; break; - case 'q': + case 'q': args->min_PQ = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --min-PQ %s\n", optarg); break; @@ -988,6 +1042,7 @@ int main_vcfconcat(int argc, char *argv[]) case 'a': args->allow_overlaps = 1; break; case 'l': args->phased_concat = 1; break; case 'f': args->file_list = optarg; break; + case 'G': args->sites_only = 1; break; case 'o': args->output_fname = optarg; break; case 'O': args->explicit_output_type = 1; @@ -1021,6 +1076,7 @@ int main_vcfconcat(int argc, char *argv[]) args->verbose = strtol(optarg, &tmp, 0); if ( *tmp || args->verbose<0 || args->verbose>1 ) error("Error: currently only --verbose 0 or --verbose 1 is supported\n"); break; + case 13 : args->write_index = 1; break; case 'h': case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); @@ -1035,6 +1091,7 @@ int main_vcfconcat(int argc, char *argv[]) } if ( args->ligate_force && args->ligate_warn ) error("The options cannot be combined: --ligate-force and --ligate-warn\n"); if ( args->allow_overlaps && args->phased_concat ) error("The options -a and -l should not be combined. 
Please run with -l only.\n"); + if ( args->sites_only && args->phased_concat ) error("The options --drop-genotypes and --ligate cannot be combined\n"); if ( args->compact_PS && !args->phased_concat ) error("The -c option is intended only with -l\n"); if ( args->file_list ) { @@ -1049,6 +1106,7 @@ int main_vcfconcat(int argc, char *argv[]) { if ( args->allow_overlaps ) error("The option --naive cannot be combined with --allow-overlaps\n"); if ( args->phased_concat ) error("The option --naive cannot be combined with --ligate\n"); + if ( args->sites_only ) error("The option --naive cannot be combined with --drop-genotypes\n"); naive_concat(args); destroy_data(args); free(args); diff --git a/vcfconvert.c b/vcfconvert.c index ce5ed9981..76c4a325a 100644 --- a/vcfconvert.c +++ b/vcfconvert.c @@ -1,6 +1,6 @@ /* vcfconvert.c -- convert between VCF/BCF and related formats. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. Author: Petr Danecek @@ -59,7 +59,7 @@ struct _args_t bcf_hdr_t *header; void (*convert_func)(struct _args_t *); struct { - int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing; + int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing, written; } n; kstring_t str; int32_t *gts; @@ -70,6 +70,11 @@ struct _args_t char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns; char *outfname, *infname, *ref_fname, *sex_fname; int argc, n_threads, record_cmd_line, keep_duplicates, clevel; + char *index_fn; + int write_index; + struct { + kstring_t ref,alt,refalt; + } tsv; }; static void destroy_data(args_t *args) @@ -139,6 +144,36 @@ static void open_vcf(args_t *args, const char *format_str) free(samples); } +static int _set_ref_alt(args_t *args, bcf1_t *rec) +{ + args->tsv.refalt.l = 0; + kputs(args->tsv.ref.s, &args->tsv.refalt); + if ( strcmp(".",args->tsv.alt.s) && strcmp(args->tsv.ref.s,args->tsv.alt.s) ) + { + kputc(',', &args->tsv.refalt); + kputs(args->tsv.alt.s, &args->tsv.refalt); + } + 
bcf_update_alleles_str(args->header, rec, args->tsv.refalt.s); + args->tsv.ref.l = 0; + args->tsv.alt.l = 0; + args->tsv.refalt.l = 0; + return 0; +} +static int tsv_setter_ref(tsv_t *tsv, bcf1_t *rec, void *usr) +{ + args_t *args = (args_t*) usr; + kputsn(tsv->ss,tsv->se - tsv->ss,&args->tsv.ref); + if ( args->tsv.alt.l ) return _set_ref_alt(args,rec); + return 0; +} +static int tsv_setter_alt(tsv_t *tsv, bcf1_t *rec, void *usr) +{ + args_t *args = (args_t*) usr; + kputsn(tsv->ss,tsv->se - tsv->ss,&args->tsv.alt); + if ( args->tsv.ref.l ) return _set_ref_alt(args,rec); + return 0; +} + // Try to set CHROM:POS_REF_ALT[_END]. Return 0 on success, -1 on error static int _set_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr) { @@ -160,7 +195,7 @@ static int _set_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr) // REF,ALT args->str.l = 0; se = ++ss; - while ( se < tsv->se && *se!='_' ) se++; + while ( se < tsv->se && *se!='_' ) se++; if ( *se!='_' ) return -1; kputsn(ss,se-ss,&args->str); ss = ++se; @@ -269,12 +304,12 @@ static int tsv_setter_gt_gp(tsv_t *tsv, bcf1_t *rec, void *usr) if ( aa >= ab ) { if ( aa >= bb ) args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(0); - else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1); + else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1); } - else if ( ab >= bb ) + else if ( ab >= bb ) { args->gts[2*i+0] = bcf_gt_unphased(0); - args->gts[2*i+1] = bcf_gt_unphased(1); + args->gts[2*i+1] = bcf_gt_unphased(1); } else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1); } @@ -293,7 +328,7 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr) else { a0 = bcf_gt_phased(0); a1 = bcf_gt_phased(1); } // up is short for "unphased" - int nup = 0; + int nup = 0; for (i=0; iss + 4*i + nup; @@ -324,11 +359,11 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr) break; default : fprintf(stderr,"Could not parse: [%c][%s]\n", ss[all*2+up],tsv->ss); - return -1; + return -1; } if( 
ss[all*2+up+1]=='*' ) up = up + 1; } - + if(up && up != 2) { fprintf(stderr,"Missing unphased marker '*': [%c][%s]", ss[2+up], tsv->ss); @@ -356,13 +391,13 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr) static void gensample_to_vcf(args_t *args) { /* - * Inpute: IMPUTE2 output (indentation changed here for clarity): + * Inpute: IMPUTE2 output (indentation changed here for clarity): * * 20:62116619_C_T 20:62116619 62116619 C T 0.969 0.031 0 ... * --- 20:62116698_C_A 62116698 C A 1 0 0 ... * * Second column is expected in the form of CHROM:POS_REF_ALT. We use second - * column because the first can be empty ("--") when filling sites from reference + * column because the first can be empty ("--") when filling sites from reference * panel. When the option --vcf-ids is given, the first column is used to set the * VCF ID. * @@ -455,6 +490,7 @@ static void gensample_to_vcf(args_t *args) if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname); + if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname); bcf1_t *rec = bcf_init(); nsamples -= 2; @@ -474,6 +510,15 @@ static void gensample_to_vcf(args_t *args) } while ( hts_getline(gen_fh, KS_SEP_LINE, &line)>0 ); + if ( args->write_index ) + { + if ( bcf_idx_save(out_fh)<0 ) + { + if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); if ( hts_close(gen_fh) ) error("Close failed: %s\n", gen_fname); bcf_hdr_destroy(args->header); @@ -589,6 +634,7 @@ static void haplegendsample_to_vcf(args_t *args) if ( out_fh == 
NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname); + if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname); bcf1_t *rec = bcf_init(); args->gts = (int32_t *) malloc(sizeof(int32_t)*nsamples*2); @@ -616,6 +662,15 @@ static void haplegendsample_to_vcf(args_t *args) } } + if ( args->write_index ) + { + if ( bcf_idx_save(out_fh)<0 ) + { + if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); if ( hts_close(hap_fh) ) error("Close failed: %s\n", hap_fname); if ( hts_close(leg_fh) ) error("Close failed: %s\n", leg_fname); @@ -731,6 +786,7 @@ static void hapsample_to_vcf(args_t *args) if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); + if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname); bcf1_t *rec = bcf_init(); nsamples -= 2; @@ -749,6 +805,15 @@ static void hapsample_to_vcf(args_t *args) } while ( hts_getline(hap_fh, KS_SEP_LINE, &line)>0 ); + if ( args->write_index ) + { + if ( bcf_idx_save(out_fh)<0 ) + { + if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); if ( 
hts_close(hap_fh) ) error("Close failed: %s\n", hap_fname); bcf_hdr_destroy(args->header); @@ -784,7 +849,7 @@ char *init_sample2sex(bcf_hdr_t *hdr, char *sex_fname) } for (i=0; isex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname); @@ -915,7 +980,7 @@ static void vcf_to_gensample(args_t *args) nok++; } } - fprintf(stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n", + fprintf(stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n", nok, no_alt+non_biallelic+filtered+ndup, no_alt, non_biallelic, filtered, ndup); if ( str.m ) free(str.s); @@ -976,7 +1041,7 @@ static void vcf_to_haplegendsample(args_t *args) { char *sample2sex = NULL; if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname); - + int i; BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu"); str.l = 0; @@ -1078,7 +1143,7 @@ static void vcf_to_hapsample(args_t *args) kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT %ID %POS %REF %FIRST_ALT ", &str); else kputs("%CHROM %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str); - + if ( args->hap2dip ) kputs("%_GT_TO_HAP2\n", &str); else @@ -1213,7 +1278,7 @@ static inline int tsv_setter_aa1(args_t *args, char *ss, char *se, int alleles[] { if ( se - ss > 2 ) return -1; // currently only SNPs - if ( ss[0]=='-' ) + if ( ss[0]=='-' || ss[0]=='.' ) { // missing GT gts[0] = bcf_gt_missing; @@ -1229,7 +1294,7 @@ static inline int tsv_setter_aa1(args_t *args, char *ss, char *se, int alleles[] if ( alleles[a0]<0 ) alleles[a0] = (*nals)++; if ( alleles[a1]<0 ) alleles[a1] = (*nals)++; - gts[0] = bcf_gt_unphased(alleles[a0]); + gts[0] = bcf_gt_unphased(alleles[a0]); gts[1] = ss[1] ? 
bcf_gt_unphased(alleles[a1]) : bcf_int32_vector_end; if ( ref==a0 && ref==a1 ) args->n.hom_rr++; // hom ref: RR @@ -1265,7 +1330,7 @@ static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr) } ret = tsv_setter_aa1(args, tsv->ss, tsv->se, alleles, &nals, iref, args->gts+i*2); if ( ret==-1 ) error("Error parsing the site %s:%"PRId64", expected two characters\n", bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1); - if ( ret==-2 ) + if ( ret==-2 ) { // something else than a SNP free(ref); @@ -1275,7 +1340,7 @@ static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr) args->str.l = 0; kputc(ref[0], &args->str); - for (i=0; i<5; i++) + for (i=0; i<5; i++) { if ( alleles[i]>0 ) { @@ -1293,7 +1358,6 @@ static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr) static void tsv_to_vcf(args_t *args) { if ( !args->ref_fname ) error("--tsv2vcf requires the --fasta-ref option\n"); - if ( !args->sample_list ) error("--tsv2vcf requires the --samples option\n"); args->ref = fai_load(args->ref_fname); if ( !args->ref ) error("Could not load the reference %s\n", args->ref_fname); @@ -1303,17 +1367,21 @@ static void tsv_to_vcf(args_t *args) bcf_hdr_append(args->header, "##FORMAT="); if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert"); - int i, n; - char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n); - if ( !smpls ) error("Could not parse %s\n", args->sample_list); - for (i=0; isample_list ) { - bcf_hdr_add_sample(args->header, smpls[i]); - free(smpls[i]); + smpl = hts_readlist(args->sample_list, args->sample_is_file, &nsmpl); + if ( !smpl ) error("Could not parse %s\n", args->sample_list); + for (i=0; iheader, smpl[i]); + free(smpl[i]); + } + free(smpl); + bcf_hdr_add_sample(args->header, NULL); + args->gts = (int32_t *) malloc(sizeof(int32_t)*nsmpl*2); } - free(smpls); - bcf_hdr_add_sample(args->header, NULL); - args->gts = (int32_t *) malloc(sizeof(int32_t)*n*2); char wmode[8]; 
set_wmode(wmode,args->output_type,args->outfname,args->clevel); @@ -1321,12 +1389,18 @@ static void tsv_to_vcf(args_t *args) if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); + if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname); tsv_t *tsv = tsv_init(args->columns ? args->columns : "ID,CHROM,POS,AA"); if ( tsv_register(tsv, "CHROM", tsv_setter_chrom, args->header) < 0 ) error("Expected CHROM column\n"); if ( tsv_register(tsv, "POS", tsv_setter_pos, NULL) < 0 ) error("Expected POS column\n"); if ( tsv_register(tsv, "ID", tsv_setter_id, args->header) < 0 && !args->columns ) error("Expected ID column\n"); - if ( tsv_register(tsv, "AA", tsv_setter_aa, args) < 0 ) error("Expected AA column\n"); + if ( tsv_register(tsv, "AA", tsv_setter_aa, args) < 0 ) + { + if ( args->sample_list ) error("Expected AA column with -s/-S\n"); + if ( tsv_register(tsv, "REF", tsv_setter_ref, args) < 0 || tsv_register(tsv, "ALT", tsv_setter_alt, args) < 0 ) + error("Expected REF and ALT columns when AA was not given\n"); + } bcf1_t *rec = bcf_init(); bcf_float_set_missing(rec->qual); @@ -1343,6 +1417,7 @@ static void tsv_to_vcf(args_t *args) if ( !tsv_parse(tsv, rec, line.s) ) { if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); + args->n.written++; } else args->n.skipped++; @@ -1350,20 +1425,36 @@ static void tsv_to_vcf(args_t *args) if ( hts_close(in_fh) ) error("Close failed: %s\n", args->infname); free(line.s); + if ( args->write_index ) + { + if ( bcf_idx_save(out_fh)<0 ) + { + if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); + error("Error: cannot write to index %s\n", 
args->index_fn); + } + free(args->index_fn); + } bcf_hdr_destroy(args->header); if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname); tsv_destroy(tsv); bcf_destroy(rec); free(args->str.s); free(args->gts); + free(args->tsv.ref.s); + free(args->tsv.alt.s); + free(args->tsv.refalt.s); fprintf(stderr,"Rows total: \t%d\n", args->n.total); fprintf(stderr,"Rows skipped: \t%d\n", args->n.skipped); - fprintf(stderr,"Missing GTs: \t%d\n", args->n.missing); - fprintf(stderr,"Hom RR: \t%d\n", args->n.hom_rr); - fprintf(stderr,"Het RA: \t%d\n", args->n.het_ra); - fprintf(stderr,"Hom AA: \t%d\n", args->n.hom_aa); - fprintf(stderr,"Het AA: \t%d\n", args->n.het_aa); + fprintf(stderr,"Sites written: \t%d\n", args->n.written); + if ( args->sample_list ) + { + fprintf(stderr,"Missing GTs: \t%d\n", args->n.missing); + fprintf(stderr,"Hom RR: \t%d\n", args->n.hom_rr); + fprintf(stderr,"Het RA: \t%d\n", args->n.het_ra); + fprintf(stderr,"Hom AA: \t%d\n", args->n.hom_aa); + fprintf(stderr,"Het AA: \t%d\n", args->n.het_aa); + } } static void vcf_to_vcf(args_t *args) @@ -1377,6 +1468,7 @@ static void vcf_to_vcf(args_t *args) bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0); if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); + if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname); while ( bcf_sr_next_line(args->files) ) { @@ -1389,6 +1481,15 @@ static void vcf_to_vcf(args_t *args) } if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); } + if ( args->write_index ) + { + if ( bcf_idx_save(out_fh)<0 ) + { + if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->outfname); } @@ -1409,6 +1510,7 @@ static void gvcf_to_vcf(args_t *args) bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0); if (args->record_cmd_line) bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert"); if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); + if ( args->write_index && init_index(out_fh,hdr,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname); int32_t *itmp = NULL, nitmp = 0; @@ -1419,7 +1521,7 @@ static void gvcf_to_vcf(args_t *args) { int pass = filter_test(args->filter, line, NULL); if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; - if ( !pass ) + if ( !pass ) { if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); continue; @@ -1469,6 +1571,15 @@ static void gvcf_to_vcf(args_t *args) } } free(itmp); + if ( args->write_index ) + { + if ( bcf_idx_save(out_fh)<0 ) + { + if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname); } @@ -1497,6 +1608,7 @@ static void usage(void) fprintf(stderr, " -o, --output FILE Output file name [stdout]\n"); fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); fprintf(stderr, " --threads INT Use multithreading with INT worker threads [0]\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); fprintf(stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n"); fprintf(stderr, " -G, --gensample2vcf ... 
|,\n"); @@ -1528,7 +1640,7 @@ static void usage(void) fprintf(stderr, "\n"); fprintf(stderr, "TSV conversion:\n"); fprintf(stderr, " --tsv2vcf FILE\n"); - fprintf(stderr, " -c, --columns STRING Columns of the input tsv file [ID,CHROM,POS,AA]\n"); + fprintf(stderr, " -c, --columns STRING Columns of the input tsv file, see man page for details [ID,CHROM,POS,AA]\n"); fprintf(stderr, " -f, --fasta-ref FILE Reference sequence in fasta format\n"); fprintf(stderr, " -s, --samples LIST List of sample names\n"); fprintf(stderr, " -S, --samples-file FILE File of sample names\n"); @@ -1590,6 +1702,7 @@ int main_vcfconvert(int argc, char *argv[]) {"fasta-ref",required_argument,NULL,'f'}, {"no-version",no_argument,NULL,10}, {"keep-duplicates",no_argument,NULL,12}, + {"write-index",no_argument,NULL,16}, {NULL,0,NULL,0} }; char *tmp; @@ -1618,6 +1731,7 @@ int main_vcfconvert(int argc, char *argv[]) case 7 : args->convert_func = vcf_to_hapsample; args->outfname = optarg; break; case 8 : error("The --chrom option has been deprecated, please use --3N6 instead\n"); break; case 15 : args->gen_3N6 = 1; break; + case 16 : args->write_index = 1; break; case 'H': args->convert_func = haplegendsample_to_vcf; args->infname = optarg; break; case 'f': args->ref_fname = optarg; break; case 'c': args->columns = optarg; break; @@ -1667,7 +1781,7 @@ int main_vcfconvert(int argc, char *argv[]) else args->infname = argv[optind]; } if ( !args->infname ) usage(); - + if ( args->convert_func ) args->convert_func(args); else vcf_to_vcf(args); diff --git a/vcffilter.c b/vcffilter.c index 68d867247..8665409d1 100644 --- a/vcffilter.c +++ b/vcffilter.c @@ -1,6 +1,6 @@ /* vcffilter.c -- Apply fixed-threshold filters. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. 
Author: Petr Danecek @@ -77,6 +77,8 @@ typedef struct _args_t char **argv, *output_fname, *targets_list, *regions_list, *mask_list; int argc, record_cmd_line, mask_is_file, mask_overlap, mask_negate; regidx_t *mask; + char *index_fn; + int write_index; } args_t; @@ -491,6 +493,7 @@ static void usage(args_t *args) fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); exit(1); } @@ -533,13 +536,14 @@ int main_vcffilter(int argc, char *argv[]) {"SnpGap",required_argument,NULL,'g'}, {"IndelGap",required_argument,NULL,'G'}, {"no-version",no_argument,NULL,8}, + {"write-index",no_argument,NULL,12}, {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:o:O:g:G:S:",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:M:o:O:g:G:S:",loptions,NULL)) >= 0) { switch (c) { case 'g': - args->snp_gap = strtol(optarg,&tmp,10); + args->snp_gap = strtol(optarg,&tmp,10); if ( *tmp && *tmp!=':' ) error("Could not parse argument: --SnpGap %s\n", optarg); if ( *tmp==':' ) { @@ -625,6 +629,7 @@ int main_vcffilter(int argc, char *argv[]) else if ( !strcasecmp(optarg,"2") ) args->mask_overlap = 2; else error("Could not parse: --mask-overlap %s\n",optarg); break; + case 12 : args->write_index = 1; break; case 'h': case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); @@ -672,6 +677,7 @@ int main_vcffilter(int argc, char *argv[]) init_data(args); if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname); + if ( args->write_index && 
init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); while ( bcf_sr_next_line(args->files) ) { bcf1_t *line = bcf_sr_get_line(args->files, 0); @@ -713,7 +719,15 @@ int main_vcffilter(int argc, char *argv[]) } } buffered_filters(args, NULL); - + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); destroy_data(args); bcf_sr_destroy(args->files); diff --git a/vcfgtcheck.c b/vcfgtcheck.c index f646e1f6d..561be62a5 100644 --- a/vcfgtcheck.c +++ b/vcfgtcheck.c @@ -1,6 +1,6 @@ /* vcfgtcheck.c -- Check sample identity. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. Author: Petr Danecek @@ -59,6 +59,7 @@ typedef struct int argc, gt_samples_is_file, qry_samples_is_file, regions_is_file, targets_is_file, pair_samples_is_file; int regions_overlap, targets_overlap; int qry_use_GT,gt_use_GT, nqry_smpl,ngt_smpl, *qry_smpl,*gt_smpl; + int nused[2][2]; double *pdiff, *qry_prob, *gt_prob; uint32_t *ndiff,*ncnt,ncmp, npairs; int32_t *qry_arr,*gt_arr, nqry_arr,ngt_arr; @@ -309,7 +310,7 @@ static void init_data(args_t *args) init_samples(args->qry_samples, args->qry_samples_is_file, &args->qry_smpl, &args->nqry_smpl, args->qry_hdr, args->qry_fname); } if ( args->gt_samples ) - { + { init_samples(args->gt_samples, args->gt_samples_is_file, &args->gt_smpl, &args->ngt_smpl, args->gt_hdr ? args->gt_hdr : args->qry_hdr, args->gt_fname ? args->gt_fname : args->qry_fname); @@ -377,7 +378,7 @@ static void init_data(args_t *args) args->gt_prob = args->cross_check ? 
args->qry_prob : (double*) malloc(3*args->ngt_smpl*sizeof(*args->gt_prob)); // dsg2prob: the first index is bitmask of 8 possible dsg combinations (only 1<<0,1<<2,1<<3 are set, accessing - // anything else indicated an error, this is just to reuse gt_to_dsg()); the second index are the corresponding + // anything else indicated an error, this is just to reuse gt_to_dsg()); the second index are the corresponding // probabilities of 0/0, 0/1, and 1/1 genotypes for (i=0; i<8; i++) for (j=0; j<3; j++) @@ -555,7 +556,9 @@ static void process_line(args_t *args) args->gt_arr = args->qry_arr; } + // stats: number of compared sites, and used tags args->ncmp++; + args->nused[qry_use_GT][gt_use_GT]++; double af,hwe_dsg[8]; if ( args->calc_hwe_prob ) @@ -636,7 +639,7 @@ static void process_line(args_t *args) gt_dsg = gt_use_GT ? gt_to_prob(args,ptr,gt_prob) : pl_to_prob(args,ptr,gt_prob); if ( !gt_dsg ) continue; // missing value if ( args->hom_only && !(gt_dsg&5) ) continue; // not a hom - + ptr = args->qry_arr + args->pairs[i].iqry*nqry1; qry_dsg = qry_use_GT ? 
gt_to_prob(args,ptr,qry_prob) : pl_to_prob(args,ptr,qry_prob); if ( !qry_dsg ) continue; // missing value @@ -797,11 +800,15 @@ static void report(args_t *args) fprintf(args->fp,"INFO\tsites-skipped-no-data\t%u\n",args->nskip_no_data); fprintf(args->fp,"INFO\tsites-skipped-GT-not-diploid\t%u\n",args->nskip_dip_GT); fprintf(args->fp,"INFO\tsites-skipped-PL-not-diploid\t%u\n",args->nskip_dip_PL); + fprintf(args->fp,"INFO\tsites-used-PL-vs-PL\t%u\n",args->nused[0][0]); + fprintf(args->fp,"INFO\tsites-used-PL-vs-GT\t%u\n",args->nused[0][1]); + fprintf(args->fp,"INFO\tsites-used-GT-vs-PL\t%u\n",args->nused[1][0]); + fprintf(args->fp,"INFO\tsites-used-GT-vs-GT\t%u\n",args->nused[1][1]); fprintf(args->fp,"# DC, discordance:\n"); fprintf(args->fp,"# - query sample\n"); fprintf(args->fp,"# - genotyped sample\n"); - fprintf(args->fp,"# - discordance (number of mismatches; smaller is better)\n"); - fprintf(args->fp,"# - negative log of HWE probability at matching sites (rare genotypes mataches are more informative, bigger is better)\n"); + fprintf(args->fp,"# - discordance (either an abstract score or number of mismatches, see -e/-u in the man page for details; smaller is better)\n"); + fprintf(args->fp,"# - negative log of HWE probability at matching sites (rare genotypes matches are more informative, bigger is better)\n"); fprintf(args->fp,"# - number of sites compared (bigger is better)\n"); fprintf(args->fp,"#DC\t[2]Query Sample\t[3]Genotyped Sample\t[4]Discordance\t[5]-log P(HWE)\t[6]Number of sites compared\n"); @@ -1023,7 +1030,7 @@ static int is_input_okay(args_t *args, int nmatch) return 1; not_okay: - fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", %s. (This is printed only once.)\n", + fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", %s. 
(This is printed only once.)\n", bcf_seqname(hdr,rec),rec->pos+1,msg); return 0; } @@ -1097,7 +1104,7 @@ int main_vcfgtcheck(int argc, char *argv[]) args->es_max_mem = strdup("500M"); // In simulated sample swaps the minimum error was 0.3 and maximum intra-sample error was 0.23 - // - min_inter: pairs with smaller err value will be considered identical + // - min_inter: pairs with smaller err value will be considered identical // - max_intra: pairs with err value bigger than abs(max_intra_err) will be considered // different. If negative, the cutoff may be heuristically lowered args->min_inter_err = 0.23; @@ -1169,7 +1176,7 @@ int main_vcfgtcheck(int argc, char *argv[]) case 3 : args->calc_hwe_prob = 0; break; case 4 : error("The option -S, --target-sample has been deprecated\n"); break; case 5 : args->dry_run = 1; break; - case 6 : + case 6 : args->distinctive_sites = strtod(optarg,&tmp); if ( *tmp ) { @@ -1202,7 +1209,7 @@ int main_vcfgtcheck(int argc, char *argv[]) else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4; else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg); break; - case 'S': + case 'S': if ( !strncasecmp("gt:",optarg,3) ) args->gt_samples = optarg+3, args->gt_samples_is_file = 1; else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4, args->qry_samples_is_file = 1; else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg); diff --git a/vcfisec.c b/vcfisec.c index a755a85b4..4ee29b4c8 100644 --- a/vcfisec.c +++ b/vcfisec.c @@ -1,6 +1,6 @@ /* vcfisec.c -- Create intersections, unions and complements of VCF files. - Copyright (C) 2012-2022 Genome Research Ltd. + Copyright (C) 2012-2023 Genome Research Ltd. 
Author: Petr Danecek @@ -60,6 +60,8 @@ typedef struct char **argv, *prefix, *output_fname, **fnames, *write_files, *targets_list, *regions_list; char *isec_exact; int argc, record_cmd_line; + char *index_fn; + int write_index; } args_t; @@ -148,6 +150,8 @@ void isec_vcf(args_t *args) if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec"); if ( bcf_hdr_write(out_fh, files->readers[args->iwrite].header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output"); + if ( args->write_index && init_index(out_fh,files->readers[args->iwrite].header,args->output_fname,&args->index_fn)<0 ) + error("Error: failed to initialise index for %s\n",args->output_fname?args->output_fname:"standard output"); } if ( !args->nwrite && !out_std && !args->prefix ) fprintf(stderr,"Note: -w option not given, printing list of sites...\n"); @@ -253,7 +257,19 @@ void isec_vcf(args_t *args) } } if ( str.s ) free(str.s); - if ( out_fh && hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname? args->output_fname : "-"); + if ( out_fh ) + { + if ( args->write_index ) + { + if ( bcf_idx_save(out_fh)<0 ) + { + if ( hts_close(out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } + if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname? 
args->output_fname : "-"); + } } static void add_filter(args_t *args, char *expr, int logic) @@ -481,6 +497,7 @@ static void usage(void) fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); fprintf(stderr, " -w, --write LIST List of files to write with -p given as 1-based indexes. By default, all files are written\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Examples:\n"); fprintf(stderr, " # Create intersection and complements of two sets saving the output in dir/*\n"); @@ -537,6 +554,7 @@ int main_vcfisec(int argc, char *argv[]) {"output-type",required_argument,NULL,'O'}, {"threads",required_argument,NULL,9}, {"no-version",no_argument,NULL,8}, + {"write-index",no_argument,NULL,10}, {NULL,0,NULL,0} }; char *tmp; @@ -608,6 +626,7 @@ int main_vcfisec(int argc, char *argv[]) break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; + case 10 : args->write_index = 1; break; case 'h': case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); diff --git a/vcfmerge.c b/vcfmerge.c index 621f4102c..87b6b8a39 100644 --- a/vcfmerge.c +++ b/vcfmerge.c @@ -1,6 +1,6 @@ /* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file. - Copyright (C) 2012-2022 Genome Research Ltd. + Copyright (C) 2012-2023 Genome Research Ltd. Author: Petr Danecek @@ -63,6 +63,19 @@ typedef khash_t(strdict) strdict_t; #define PL2PROB_MAX 1024 +// Rules for merging FORMAT Number=A,G,R vectors with missing values +#define MERGE_MISSING_DOT 0 // leave as is, i.e. use a missing value "." 
+#define MERGE_MISSING_CONST 1 // use a constant value +#define MERGE_MISSING_MAX 2 // use the existing maximum value + +typedef struct _missing_rule_t +{ + char *hdr_tag; + int type; + float value; +} +missing_rule_t; + // For merging INFO Number=A,G,R tags typedef struct { @@ -103,29 +116,37 @@ typedef struct int *map; // mapping from input alleles to the array of output alleles (set by merge_alleles) int mmap; // size of map array (only buffer[i].n_allele is actually used) int als_differ; + int var_types; // variant types in this record, shifted by <<1 to account for VCF_REF } maux1_t; + +// Buffered lines for a single reader typedef struct { int rid; // current rid int beg,end; // valid ranges in reader's buffer [beg,end). Maintained by maux_reset and gvcf_flush. + int unkn_allele;// the index of the unknown allele (<*>, ) int cur; // current line or -1 if none int mrec; // allocated size of buf maux1_t *rec; // buffer to keep reader's lines bcf1_t **lines; // source buffer: either gvcf or readers' buffer + int var_types; // reader's variant types in the active [beg,end] window } buffer_t; typedef struct { - int n, pos, var_types; // number of readers, current position, currently available variant types + int n, pos, var_types; // number of readers; current position; variant types at this position across all available records + int *als_types, // allele type of each output allele + mals_types; char *chr; // current chromosome char **als, **out_als; // merged alleles (temp, may contain empty records) and merged alleles ready for output int nals, mals, nout_als, mout_als; // size of the output array int *cnt, ncnt; // number of records that refer to the alleles int *smpl_ploidy, *smpl_nGsize; // ploidy and derived number of values in Number=G tags, updated for each line (todo: cache for missing cases) + const char **fmt_key;// temporary short-lived array to store output tag names bcf_fmt_t **fmt_map; // i-th output FORMAT field corresponds in j-th reader to 
i*nreader+j, first row is reserved for GT int nfmt_map; // number of rows in the fmt_map array - int *agr_map, nagr_map, magr_map; // mapping between Number=AGR element indexes + int *agr_map, nagr_map, magr_map; // mapping between Number=AGR element indexes, from src idxs to dst file idxs void *tmp_arr; size_t ntmp_arr; buffer_t *buf; @@ -156,6 +177,9 @@ typedef struct faidx_t *gvcf_fai; info_rule_t *rules; int nrules; + char *missing_rules_str; + missing_rule_t *missing_rules; // lookup for -M, --missing-rules + int nmissing_rules; strdict_t *tmph; kstring_t tmps; bcf_srs_t *files; @@ -166,6 +190,8 @@ typedef struct int argc, n_threads, record_cmd_line, clevel; int local_alleles; // the value of -L option int keep_AC_AN; + char *index_fn; + int write_index; } args_t; @@ -298,6 +324,89 @@ static void info_rules_merge_join(bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rul } } +static int missing_rules_comp_key2(const void *a, const void *b) +{ + missing_rule_t *rule1 = (missing_rule_t*) a; + missing_rule_t *rule2 = (missing_rule_t*) b; + return strcmp(rule1->hdr_tag, rule2->hdr_tag); +} +static int missing_rules_comp_key(const void *a, const void *b) +{ + char *key = (char*) a; + missing_rule_t *rule = (missing_rule_t*) b; + return strcmp(key, rule->hdr_tag); +} +static void missing_rules_init(args_t *args) +{ + kstring_t str = {0,0,0}; + if ( args->missing_rules_str ) + { + if ( !strcmp("-",args->missing_rules_str) ) kputs("PL:.,AD:.",&str); + else kputs(args->missing_rules_str,&str); + } + else if ( args->do_gvcf ) kputs("PL:max,AD:0",&str); + else return; + + args->nmissing_rules = 1; + char *ss = str.s, *tmp = ss; + int n = 0; + while ( *ss ) + { + if ( *ss==':' ) { *ss = 0; n++; if ( n%2==0 ) error("Could not parse --missing-rules: \"%s\"\n", args->missing_rules_str); } + else if ( *ss==',' ) { *ss = 0; args->nmissing_rules++; n++; if ( n%2==1 ) error("Could not parse --missing-rules: \"%s\"\n", args->missing_rules_str); } + ss++; + } + if ( n%2==0 ) error("Could 
not parse --missing-rules: \"%s\"\n", args->missing_rules_str); + args->missing_rules = (missing_rule_t*) calloc(args->nmissing_rules,sizeof(missing_rule_t)); + + n = args->nmissing_rules; + args->nmissing_rules = 0; + ss = tmp; + while ( args->nmissing_rules < n ) + { + missing_rule_t *rule = &args->missing_rules[args->nmissing_rules]; + rule->hdr_tag = strdup(ss); + int id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, rule->hdr_tag); + if ( !bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_FMT,id) ) + { + if ( args->missing_rules_str ) error("The FORMAT tag is not defined in the header: \"%s\"\n", rule->hdr_tag); + free(rule->hdr_tag); + n--; + ss = strchr(ss, '\0'); ss++; + if ( !*ss ) error("Could not parse --missing-rules, missing logic of \"%s\"\n", rule->hdr_tag); + ss = strchr(ss, '\0'); ss++; + continue; + } + + ss = strchr(ss, '\0'); ss++; + if ( !*ss ) error("Could not parse --missing-rules, missing logic of \"%s\"\n", rule->hdr_tag); + + if ( !strcasecmp(ss,".") ) rule->type = MERGE_MISSING_DOT; + else if ( !strcasecmp(ss,"max") ) rule->type = MERGE_MISSING_MAX; + else + { + char *tmp = ss; + rule->value = strtod(ss, &tmp); + if ( *tmp ) error("Could not parse --missing-rules: \"%s\"\n", args->missing_rules_str); + rule->type = MERGE_MISSING_CONST; + } + ss = strchr(ss, '\0'); ss++; + args->nmissing_rules++; + } + qsort(args->missing_rules, args->nmissing_rules, sizeof(*args->missing_rules), missing_rules_comp_key2); + free(str.s); +} +static void missing_rules_destroy(args_t *args) +{ + int i; + for (i=0; inmissing_rules; i++) + { + missing_rule_t *rule = &args->missing_rules[i]; + free(rule->hdr_tag); + } + free(args->missing_rules); +} + static int info_rules_comp_key2(const void *a, const void *b) { info_rule_t *rule1 = (info_rule_t*) a; @@ -770,6 +879,7 @@ void maux_destroy(maux_t *ma) int i,j; for (i=0; inout_smpl; i++) free(ma->str[i].s); free(ma->str); + free(ma->als_types); for (i=0; imals; i++) { free(ma->als[i]); @@ -793,6 +903,7 @@ void 
maux_destroy(maux_t *ma) free(ma->AGR_info); if (ma->ntmp_arr) free(ma->tmp_arr); if (ma->nfmt_map) free(ma->fmt_map); + free(ma->fmt_key); // ma->inf freed in bcf_destroy1 for (i=0; imals; i++) free(ma->als[i]); if (ma->mout_als) free(ma->out_als); @@ -820,7 +931,6 @@ void maux_reset(maux_t *ma, int *rid_tab) { int i,j; for (i=0; in; i++) maux_expand1(&ma->buf[i],ma->files->readers[i].nbuffer+1); - for (i=0; incnt; i++) ma->cnt[i] = 0; for (i=0; imals; i++) { free(ma->als[i]); @@ -856,6 +966,7 @@ void maux_reset(maux_t *ma, int *rid_tab) for (j=ma->buf[i].beg; j<=ma->files->readers[i].nbuffer; j++) { ma->buf[i].rec[j].skip = 0; + ma->buf[i].rec[j].var_types = 0; bcf1_t *line = ma->files->readers[i].buffer[j]; if ( line->rid!=ma->buf[i].rid || line->pos!=ma->pos ) break; } @@ -959,12 +1070,14 @@ void merge_chrom2qual(args_t *args, bcf1_t *out) int ir, j; for (ir=0; irnreaders; ir++) { + ma->buf[ir].unkn_allele = 0; bcf1_t *line = maux_get_line(args,ir); if ( !line ) continue; for (j=1; jn_allele; j++) { int irec = ma->buf[ir].cur; if ( ma->buf[ir].rec[irec].map[j]==i ) ma->buf[ir].rec[irec].map[j] = ma->nout_als; + if ( bcf_has_variant_type(line,j,VCF_REF) && line->d.allele[j][0]=='<' ) ma->buf[ir].unkn_allele = j; } } } @@ -1985,7 +2098,7 @@ void merge_localized_numberAR_format_field(args_t *args, bcf_fmt_t **fmt_map, bc bcf_update_format_int32(args->out_hdr, out, args->tmps.s, (int32_t*)ma->tmp_arr, nsamples*nsize); ma->laa_dirty = 1; } -void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) +void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule, bcf1_t *out) { bcf_srs_t *files = args->files; bcf_hdr_t *out_hdr = args->out_hdr; @@ -2135,12 +2248,32 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) for (l=1; lsmpl_ploidy[ismpl+j]==1 ? out->n_allele : out->n_allele*(out->n_allele + 1)/2; \ - for (l=0; lsmpl_ploidy[ismpl+j]==1 ? 1 : 0; \ + int ngsize = haploid ? 
out->n_allele : out->n_allele*(out->n_allele + 1)/2; \ + if ( ma->buf[i].unkn_allele ) /* Use value from the unknown allele when available */ \ + { \ + src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \ + int iunkn = haploid ? ma->buf[i].unkn_allele : (ma->buf[i].unkn_allele+1)*(ma->buf[i].unkn_allele + 2)/2 - 1; \ + for (l=0; ltype==MERGE_MISSING_CONST ) \ + { \ + for (l=0; lvalue; tgt++; } \ + } \ + else if ( mrule && mrule->type==MERGE_MISSING_MAX ) \ + { \ + src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \ + src_type_t max = src[0]; \ + for (l=1; ln; l++) if ( max < src[l] ) max = src[l]; \ + for (l=0; lsmpl_ploidy[ismpl+j]==1 ) \ + if ( haploid ) \ { \ - /* Haploid */ \ int iori, inew; \ for (iori=0; iorin_allele; iori++) \ { \ @@ -2194,7 +2327,26 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) continue; \ } \ src = (src_type_t*) (fmt_ori->p + j*fmt_ori->size); \ - for (l=0; lbuf[i].unkn_allele ) /* Use value from the unknown allele when available */ \ + { \ + int iunkn = ma->buf[i].unkn_allele; \ + for (l=0; ltype==MERGE_MISSING_CONST ) \ + { \ + for (l=0; lvalue; tgt++; } \ + } \ + else if ( mrule && mrule->type==MERGE_MISSING_MAX ) \ + { \ + src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \ + src_type_t max = src[0]; \ + for (l=1; ln; l++) if ( max < src[l] ) max = src[l]; \ + for (l=0; ln_allele; iori++) \ { \ @@ -2234,6 +2386,7 @@ void merge_format(args_t *args, bcf1_t *out) { ma->nfmt_map = 2; ma->fmt_map = (bcf_fmt_t**) calloc(ma->nfmt_map*files->nreaders, sizeof(bcf_fmt_t*)); + ma->fmt_key = (const char**) malloc(ma->nfmt_map*sizeof(*ma->fmt_key)); } else memset(ma->fmt_map, 0, ma->nfmt_map*files->nreaders*sizeof(bcf_fmt_t**)); @@ -2250,7 +2403,7 @@ void merge_format(args_t *args, bcf1_t *out) bcf_hdr_t *hdr = reader->header; for (j=0; jn_fmt; j++) { - // Wat this tag already seen? + // Was this tag already seen? 
bcf_fmt_t *fmt = &line->d.fmt[j]; const char *key = hdr->id[BCF_DT_ID][fmt->id].key; kitr = kh_get(strdict, tmph, key); @@ -2269,9 +2422,11 @@ void merge_format(args_t *args, bcf1_t *out) { ma->fmt_map = (bcf_fmt_t**) realloc(ma->fmt_map, sizeof(bcf_fmt_t*)*(max_ifmt+1)*files->nreaders); memset(ma->fmt_map+ma->nfmt_map*files->nreaders, 0, (max_ifmt-ma->nfmt_map+1)*files->nreaders*sizeof(bcf_fmt_t*)); + ma->fmt_key = (const char**) realloc(ma->fmt_key, sizeof(*ma->fmt_key)*(max_ifmt+1)); ma->nfmt_map = max_ifmt+1; } if ( key[0]=='P' && key[1]=='L' && key[2]==0 ) { has_PL = ifmt; } + ma->fmt_key[max_ifmt] = key; } kitr = kh_put(strdict, tmph, key, &ret); kh_value(tmph, kitr) = ifmt; @@ -2298,7 +2453,10 @@ void merge_format(args_t *args, bcf1_t *out) update_AN_AC(out_hdr, out); for (i=1; i<=max_ifmt; i++) - merge_format_field(args, &ma->fmt_map[i*files->nreaders], out); + { + missing_rule_t *rule = (missing_rule_t*) bsearch(ma->fmt_key[i], args->missing_rules, args->nmissing_rules, sizeof(*args->missing_rules), missing_rules_comp_key); + merge_format_field(args, &ma->fmt_map[i*files->nreaders], rule, out); + } if ( ma->laa_dirty ) update_local_alleles(args, out); @@ -2406,6 +2564,9 @@ void gvcf_write_block(args_t *args, int start, int end) { int slen = 0; char *seq = faidx_fetch_seq(args->gvcf_fai,maux->chr,out->pos,out->pos,&slen); + if (!seq) + exit(1); // faidx_fetch_seq has already reported the error. + if (slen) { out->d.allele[0][0] = seq[0]; @@ -2520,16 +2681,6 @@ static inline int is_gvcf_block(bcf1_t *line) return 0; } -// Lines can come with any combination of variant types. 
We use a subset of types defined in vcf.h -// but shift by two bits to account for VCF_REF defined as 0 (design flaw in vcf.h, my fault) and -// to accommodate for VCF_GVCF_REF defined below -static const int - snp_mask = (VCF_SNP<<2)|(VCF_MNP<<2), - indel_mask = VCF_INDEL<<2, - ins_mask = VCF_INS<<2, - del_mask = VCF_DEL<<2, - ref_mask = 2; - /* Check incoming lines for new gVCF blocks, set pointer to the current source buffer (gvcf or readers). In contrast to gvcf_flush, this function can be @@ -2629,7 +2780,7 @@ void clean_buffer(args_t *args) { if ( ma->gvcf[ir].active ) { - if ( ma->pos >= ma->gvcf[ir].end ) ma->gvcf[ir].active = 0; + if ( ma->pos > ma->gvcf[ir].end ) ma->gvcf[ir].active = 0; else if ( ma->buf[ir].cur==-1 ) ma->buf[ir].cur = ma->buf[ir].beg; // re-activate interrupted gVCF block } if ( !ma->gvcf[ir].active ) ma->buf[ir].cur = -1; @@ -2664,13 +2815,16 @@ void debug_maux(args_t *args) { bcf_sr_t *reader = &files->readers[j]; buffer_t *buf = &maux->buf[j]; - fprintf(stderr," reader %d: ", j); + fprintf(stderr," reader %d (k=%d-%d): ", j,buf->beg,buf->end); for (k=buf->beg; kend; k++) { - if ( buf->rec[k].skip & SKIP_DONE ) continue; - bcf1_t *line = reader->buffer[k]; + if ( buf->rec[k].skip & SKIP_DONE ) { fprintf(stderr," DONE"); continue; } + bcf1_t *line = reader->buffer[k]; // selected for merging by can_merge fprintf(stderr,"\t"); - if ( buf->rec[k].skip ) fprintf(stderr,"["); // this record will not be merged in this round + if ( buf->cur==k ) fprintf(stderr,"!"); // selected for merging by stage_line + if ( buf->rec[k].skip ) fprintf(stderr,"["); // this record cannot be merged in this round + if ( !line->n_allele && maux->gvcf[j].active ) + fprintf(stderr,"<*>"); for (l=0; ln_allele; l++) fprintf(stderr,"%s%s", l==0?"":",", line->d.allele[l]); if ( buf->rec[k].skip ) fprintf(stderr,"]"); @@ -2686,9 +2840,10 @@ void debug_state(args_t *args) { maux_t *maux = args->maux; int i,j; + fprintf(stderr,"State after position=%d 
done:\n",maux->pos+1); for (i=0; ifiles->nreaders; i++) { - fprintf(stderr,"reader %d:\tcur,beg,end=% d,%d,%d", i,maux->buf[i].cur,maux->buf[i].beg,maux->buf[i].end); + fprintf(stderr,"\treader %d:\tcur,beg,end=% d,%d,%d", i,maux->buf[i].cur,maux->buf[i].beg,maux->buf[i].end); if ( maux->buf[i].cur >=0 ) { bcf_hdr_t *hdr = bcf_sr_get_header(args->files,i); @@ -2698,20 +2853,136 @@ void debug_state(args_t *args) } fprintf(stderr,"\n"); } - fprintf(stderr,"gvcf_min=%d\n", args->maux->gvcf_min); + fprintf(stderr,"\tgvcf_min=%d\n", args->maux->gvcf_min); for (i=0; ifiles->nreaders; i++) { - fprintf(stderr,"reader %d:\tgvcf_active=%d", i,maux->gvcf[i].active); + fprintf(stderr,"\t\treader %d:\tgvcf_active=%d", i,maux->gvcf[i].active); if ( maux->gvcf[i].active ) fprintf(stderr,"\tpos,end=%"PRId64",%"PRId64, (int64_t) maux->gvcf[i].line->pos+1,(int64_t) maux->gvcf[i].end+1); fprintf(stderr,"\n"); } fprintf(stderr,"\n"); } + +// Lines can come with any combination of variant types. We use a subset of types defined in vcf.h +// but shift by two bits to account for VCF_REF defined as 0 (design flaw in vcf.h, my fault) +static const int + snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), + indel_mask = (VCF_INDEL<<1), + ins_mask = VCF_INS<<1, + del_mask = VCF_DEL<<1, + ref_mask = 1; + +// Can these types be merged given the -m settings? 
Despite the function's name, its focus is on +// excluding incompatible records, there will be a finer matching later in stage_line() +static inline int types_compatible(args_t *args, int selected_types, buffer_t *buf, int irec) +{ + int k; + maux_t *maux = args->maux; + bcf1_t *rec = buf->lines[irec]; + int rec_types = buf->rec[irec].var_types; + + assert( selected_types ); // this is trivially true, set in can_merge() + + if ( args->collapse & COLLAPSE_ANY ) return 1; // can merge anything with anything + + // REF and gVCF_REF with no other alleles present can be merged with anything + if ( (selected_types&ref_mask) && !(selected_types&(~ref_mask)) ) return 1; + if ( (rec_types&ref_mask) && !(rec_types&(~ref_mask)) ) return 1; + + if ( args->collapse!=COLLAPSE_NONE ) + { + // If we are here, one the following modes must have been set: both,snps,indels,snp-ins-del + // Include the new record if + // - rec has SNV, we already have SNV, and -m is both,snps,snp-ins-del + // - rec has indel, we already have an indel, and -m both,indels,snp-ins-del + if ( args->collapse&(COLLAPSE_SNPS|COLLAPSE_SNP_INS_DEL) ) + { + if ( (rec_types&snp_mask) && (selected_types&snp_mask) ) return 1; + } + if ( args->collapse&COLLAPSE_INDELS ) + { + if ( (rec_types&indel_mask) && (selected_types&indel_mask) ) return 1; + } + if ( args->collapse&COLLAPSE_SNP_INS_DEL ) + { + if ( (rec_types&ins_mask) && (selected_types&ins_mask) ) return 1; + if ( (rec_types&del_mask) && (selected_types&del_mask) ) return 1; + } + // Whatever is left, allow to match if the alleles match exactly + } + + // The -m none mode or exact matching requested + // Simple test first: are the variants of the same type? 
+ int x = selected_types >> 1; // remove REF + int y = rec_types >> 1; // remove REF + while ( x && y ) { x>>=1; y>>=1; } + if ( x || y ) return 0; // the types differ + + if ( vcmp_set_ref(args->vcmp,maux->als[0],rec->d.allele[0]) < 0 ) return 0; // refs are not compatible + for (k=1; kn_allele; k++) + { + if ( bcf_has_variant_type(rec,k,VCF_REF) ) continue; // this must be gVCF_REF (<*> or ) + if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,rec->d.allele[k])>=0 ) break; + } + if ( k==rec->n_allele ) return 0; // this record has a new allele rec->d.allele[k] + return 1; // all alleles in rec are also in the records selected thus far, perhaps save for gVCF_REF +} + +static void maux_update_alleles(args_t *args, int ireader, int irec) +{ + int k; + bcf_sr_t *reader = &args->files->readers[ireader]; + maux_t *maux = args->maux; + buffer_t *buf = &maux->buf[ireader]; + maux1_t *ma1 = &buf->rec[irec]; + bcf1_t *line = buf->lines[irec]; + hts_expand(int, line->n_allele, ma1->mmap, ma1->map); + if ( !maux->nals ) // first record to be merged, copy the alleles to the output + { + maux->nals = line->n_allele; + hts_expand0(char*, maux->nals, maux->mals, maux->als); + hts_expand0(int, maux->nals, maux->ncnt, maux->cnt); + hts_expand0(int, maux->nals, maux->mals_types, maux->als_types); + for (k=0; knals; k++) + { + free(maux->als[k]); + maux->als[k] = strdup(line->d.allele[k]); + ma1->map[k] = k; + maux->cnt[k] = 1; + int var_type = bcf_has_variant_type(line, k, VCF_ANY); + if ( args->collapse==COLLAPSE_SNP_INS_DEL ) var_type &= ~VCF_INDEL; + maux->als_types[k] = var_type ? 
var_type<<1 : ref_mask; + } + return; + } + // normalize alleles + maux->als = merge_alleles(line->d.allele, line->n_allele, ma1->map, maux->als, &maux->nals, &maux->mals); + if ( !maux->als ) error("Failed to merge alleles at %s:%"PRId64" in %s\n",maux->chr,(int64_t) line->pos+1,reader->fname); + hts_expand0(int, maux->nals, maux->ncnt, maux->cnt); + hts_expand0(int, maux->nals, maux->mals_types, maux->als_types); + for (k=1; kn_allele; k++) + { + int ik = ma1->map[k]; + int var_type = bcf_has_variant_type(line, k, VCF_ANY); + if ( args->collapse==COLLAPSE_SNP_INS_DEL ) var_type &= ~VCF_INDEL; + maux->als_types[ik] = var_type ? var_type<<1 : ref_mask; + maux->cnt[ik]++; // how many times an allele appears in the files + } + maux->cnt[0]++; +} + /* - Determine which line should be merged from which reader: go through all - readers and all buffered lines, expand REF,ALT and try to match lines with - the same ALTs. + Determine which lines remain to be merged across readers at the current position and + are compatible given the -m criteria. This is indicated by maux1_t.skip: 0=compatible, + SKIP_DONE=the record is done, SKIP_DIFF=not compatible and will be included next time. + + At the same time count how many times is each allele present across the readers and records + so that we can prioritize the records with the same alleles to come first. In the end maximum + one record at a time can be selected from each reader and that witll be done in stage_line(). + + The function maux_reset already initialized structures for this position, so here each + reader comes with the beg,end indexes that point to records with the same maux_t.pos position. 
*/ int can_merge(args_t *args) { @@ -2719,28 +2990,39 @@ int can_merge(args_t *args) maux_t *maux = args->maux; gvcf_aux_t *gaux = maux->gvcf; char *id = NULL, ref = 'N'; - int i,j,k, ntodo = 0; + int i,j, ntodo = 0; for (i=0; inals; i++) { free(maux->als[i]); maux->als[i] = NULL; + maux->cnt[i] = 0; } maux->var_types = maux->nals = 0; - // this is only for the `-m none -g` mode, ensure that <*> lines come last - #define VCF_GVCF_REF 1 - + // In this loop we do the following: + // - remember the first encountered ID if matching by ID + // - count the number of unprocessed records at this position + // - collect all variant types at this position. This is to be able to perform -m matching and + // print SNVs first, then indels, then gVCF blocks + // - init the 'skip' variable to SKIP_DIFF for each record that has not been used yet for (i=0; inreaders; i++) { buffer_t *buf = &maux->buf[i]; + buf->var_types = 0; - if ( gaux && gaux[i].active ) + if ( gaux && gaux[i].active ) // active gvcf block { - // skip readers with active gvcf blocks buf->rec[buf->beg].skip = SKIP_DIFF; + maux->var_types |= ref_mask; + buf->var_types |= ref_mask; + buf->rec[buf->beg].var_types = ref_mask; continue; } + + // for gvcf: find out REF at this position + if ( buf->beg < buf->end && ref=='N' ) ref = buf->lines[buf->beg]->d.allele[0][0]; + for (j=buf->beg; jend; j++) { if ( buf->rec[j].skip & SKIP_DONE ) continue; @@ -2749,118 +3031,70 @@ int can_merge(args_t *args) ntodo++; bcf1_t *line = buf->lines[j]; - if ( args->merge_by_id ) - id = line->d.id; - else + if ( args->merge_by_id && !id ) { id = line->d.id; continue; } // set ID when merging by id + + if ( !buf->rec[j].var_types ) { int var_type = bcf_has_variant_types(line, VCF_ANY, bcf_match_overlap); - if (var_type < 0) error("bcf_has_variant_types() failed."); + if ( var_type < 0 ) error("bcf_has_variant_types() failed."); if ( args->collapse==COLLAPSE_SNP_INS_DEL ) { // need to distinguish between ins and del so strip the VCF_INDEL 
flag var_type &= ~VCF_INDEL; } - maux->var_types |= var_type ? var_type<<2 : 2; - - // for the `-m none -g` mode - if ( args->collapse==COLLAPSE_NONE && args->do_gvcf && is_gvcf_block(line) ) - maux->var_types |= VCF_GVCF_REF; + var_type = var_type ? var_type<<1 : ref_mask; + if ( args->do_gvcf && is_gvcf_block(line) ) var_type |= ref_mask; + buf->rec[j].var_types = var_type; } + maux->var_types |= buf->rec[j].var_types; + buf->var_types |= buf->rec[j].var_types; } - - // for gvcf: find out REF at this position - if ( buf->beg < buf->end && ref=='N' ) - ref = buf->lines[buf->beg]->d.allele[0][0]; } if ( !ntodo ) return 0; + int selected_types = 0; + // In this loop we select from each reader compatible candidate lines. // (i.e. SNPs or indels). Go through all files and all lines at this // position and normalize relevant alleles. // REF-only sites may be associated with both SNPs and indels. for (i=0; inreaders; i++) { - bcf_sr_t *reader = &files->readers[i]; buffer_t *buf = &maux->buf[i]; - if ( gaux && gaux[i].active ) { + // gVCF records inherited from an upstream gVCF block have incorrect or missing allele and position gaux[i].line->d.allele[0][0] = ref; gaux[i].line->pos = maux->pos; + maux_update_alleles(args, i, buf->beg); + selected_types |= ref_mask; + continue; } - for (j=buf->beg; jend; j++) { if ( buf->rec[j].skip & SKIP_DONE ) continue; bcf1_t *line = buf->lines[j]; // ptr to reader's buffer or gvcf buffer - - int line_type = bcf_has_variant_types(line, VCF_ANY, bcf_match_overlap); - if (line_type < 0) error("bcf_has_variant_types() failed."); - line_type = line_type ? 
line_type<<2 : 2; + int line_types = buf->rec[j].var_types; // select relevant lines if ( args->merge_by_id ) { - if ( strcmp(id,line->d.id) ) continue; + if ( strcmp(id,line->d.id) ) continue; // matching by ID and it does not match the selected record } + else if ( selected_types && !types_compatible(args,selected_types,buf,j) ) continue; else { - // when merging gVCF in -m none mode, make sure that gVCF blocks with the same POS as variant - // records come last, otherwise infinite loop is created (#1164) - if ( args->collapse==COLLAPSE_NONE && args->do_gvcf ) - { - if ( is_gvcf_block(line) && (maux->var_types & (~(VCF_GVCF_REF|2))) ) continue; - } - if ( args->collapse==COLLAPSE_NONE && maux->nals ) - { - // All alleles of the tested record must be present in the - // selected maux record plus variant types must be the same - if ( (maux->var_types & line_type) != line_type ) continue; - if ( vcmp_set_ref(args->vcmp,maux->als[0],line->d.allele[0]) < 0 ) continue; // refs not compatible - for (k=1; kn_allele; k++) - { - if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,line->d.allele[k])>=0 ) break; - } - if ( !(line_type&ref_mask) && k==line->n_allele ) continue; // not a REF-only site and there is no matching allele - } - if ( !(args->collapse&COLLAPSE_ANY) ) - { - // Merge: - // - SNPs+SNPs+MNPs+REF if -m both,snps - // - indels+indels+REF if -m both,indels, REF only if SNPs are not present - // - SNPs come first - if ( line_type & (indel_mask|ins_mask|del_mask) ) - { - if ( !(line_type&snp_mask) && maux->var_types&snp_mask ) continue; // SNPs come first - if ( args->do_gvcf && maux->var_types&ref_mask ) continue; // never merge indels with gVCF blocks - } - } + // First time here, choosing the first line: prioritize SNPs when available in the -m snps,both modes + if ( (args->collapse&COLLAPSE_SNPS || args->collapse==COLLAPSE_NONE) // asked to merge SNVs into multiallelics + && (maux->var_types&snp_mask) // there are SNVs at the current position + && 
!(buf->rec[j].var_types&(snp_mask|ref_mask)) // and this record is not a SNV nor ref + ) continue; } - buf->rec[j].skip = 0; + selected_types |= line_types; - hts_expand(int, line->n_allele, buf->rec[j].mmap, buf->rec[j].map); - if ( !maux->nals ) // first record, copy the alleles to the output - { - maux->nals = line->n_allele; - hts_expand0(char*, maux->nals, maux->mals, maux->als); - hts_expand0(int, maux->nals, maux->ncnt, maux->cnt); - for (k=0; knals; k++) - { - free(maux->als[k]); - maux->als[k] = strdup(line->d.allele[k]); - buf->rec[j].map[k] = k; - maux->cnt[k] = 1; - } - continue; - } - // normalize alleles - maux->als = merge_alleles(line->d.allele, line->n_allele, buf->rec[j].map, maux->als, &maux->nals, &maux->mals); - if ( !maux->als ) error("Failed to merge alleles at %s:%"PRId64" in %s\n",maux->chr,(int64_t) line->pos+1,reader->fname); - hts_expand0(int, maux->nals, maux->ncnt, maux->cnt); - for (k=1; kn_allele; k++) - maux->cnt[ buf->rec[j].map[k] ]++; // how many times an allele appears in the files - maux->cnt[0]++; + buf->rec[j].skip = 0; // the j-th record from i-th reader can be included. Final decision will be made in stage_line + maux_update_alleles(args, i, j); } } return 1; @@ -2878,48 +3112,61 @@ void stage_line(args_t *args) bcf_srs_t *files = args->files; maux_t *maux = args->maux; - // debug_maux(args); - - // take the most frequent allele present in multiple files, REF is skipped - int i,j,k,icnt = 1; - for (i=2; inals; i++) - if ( maux->cnt[i] > maux->cnt[icnt] ) icnt = i; + // Take the most frequent allele present in multiple files, REF and gVCF_REF is skipped. + int i,j,k,icnt = -1; + for (i=1; inals; i++) + { + if ( maux->als_types[i] & ref_mask ) continue; + if ( icnt==-1 || maux->cnt[icnt] < maux->cnt[i] ) icnt = i; + } + int selected_type = icnt>0 ? 
maux->als_types[icnt] : ref_mask; int nout = 0; for (i=0; inreaders; i++) { buffer_t *buf = &maux->buf[i]; buf->cur = -1; - if ( buf->beg >= buf->end ) continue; // no lines in the buffer + if ( buf->beg >= buf->end ) continue; // No lines in the buffer at this site // find lines with the same allele for (j=buf->beg; jend; j++) { - if ( buf->rec[j].skip ) continue; // done or not compatible - if ( args->merge_by_id ) break; - if ( maux->nals==1 && buf->lines[j]->n_allele==1 ) break; // REF-only record + if ( buf->rec[j].skip ) + { + int is_gvcf = maux->gvcf && maux->gvcf[i].active ? 1 : 0; + if ( !is_gvcf && is_gvcf_block(buf->lines[j]) ) is_gvcf = 1; + if ( !is_gvcf ) continue; // done or not compatible + } + if ( args->merge_by_id ) break; // if merging by ID and the line is compatible, the this is THE line + + // skip if the reader has a record that matches the most frequent allele and this record is not it + if ( (selected_type & buf->var_types) && !(selected_type & buf->rec[j].var_types) ) continue; + // if the reader does not have the most frequent allele type but is a ref, accept + if ( !(selected_type & buf->var_types) && (buf->rec[j].var_types & ref_mask) ) break; + if ( selected_type==ref_mask ) break; + + // accept if the record has the most frequent allele for (k=0; klines[j]->n_allele; k++) if ( icnt==buf->rec[j].map[k] ) break; - if ( klines[j]->n_allele ) break; } if ( j>=buf->end ) { // no matching allele found in this file - if ( args->collapse==COLLAPSE_NONE ) continue; + if ( args->collapse==COLLAPSE_NONE ) continue; // exact matching requested, skip + // choose something compatible to create a multiallelic site given the -m criteria for (j=buf->beg; jend; j++) { if ( buf->rec[j].skip ) continue; // done or not compatible if ( args->collapse&COLLAPSE_ANY ) break; // anything can be merged - int line_type = bcf_has_variant_types(buf->lines[j], VCF_ANY, bcf_match_overlap); - if (line_type < 0) error("bcf_has_variant_types() failed."); - if ( 
maux->var_types&snp_mask && line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break; - if ( maux->var_types&indel_mask && line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break; - if ( maux->var_types&ins_mask && line_type&VCF_INS && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break; - if ( maux->var_types&del_mask && line_type&VCF_DEL && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break; - if ( line_type==VCF_REF ) + int line_type = buf->rec[j].var_types; + if ( maux->var_types&snp_mask && line_type&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break; + if ( maux->var_types&indel_mask && line_type&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break; + if ( maux->var_types&ins_mask && line_type&ins_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break; + if ( maux->var_types&del_mask && line_type&del_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break; + if ( line_type&ref_mask ) { if ( maux->var_types&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break; if ( maux->var_types&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break; @@ -2940,12 +3187,21 @@ void stage_line(args_t *args) { // found a suitable line for merging buf->cur = j; - - // mark as finished so that it's ignored next time - buf->rec[j].skip = SKIP_DONE; - nout++; } } + + // debug_maux(args); + + // Mark lines staged for merging as finished so that they are ignored next time + for (i=0; inreaders; i++) + { + buffer_t *buf = &maux->buf[i]; + if ( buf->cur == -1 ) continue; + + buf->rec[buf->cur].skip = SKIP_DONE; + nout++; + } + assert( nout ); } @@ -3078,6 +3334,7 @@ void merge_vcf(args_t *args) error_errno("[%s] Failed to update header", __func__); } info_rules_init(args); + missing_rules_init(args); bcf_hdr_set_version(args->out_hdr, bcf_hdr_get_version(args->files->readers[0].header)); if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); @@ -3087,6 +3344,7 @@ void merge_vcf(args_t *args) if ( 
hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); return; } + else if ( args->write_index && init_index(args->out_fh,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); if ( args->collapse==COLLAPSE_NONE ) args->vcmp = vcmp_init(); args->maux = maux_init(args); @@ -3122,9 +3380,19 @@ void merge_vcf(args_t *args) gvcf_flush(args,1); info_rules_destroy(args); + missing_rules_destroy(args); maux_destroy(args->maux); bcf_hdr_destroy(args->out_hdr); - if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } + if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname?args->output_fname:"stdout"); bcf_destroy1(args->out_line); kh_destroy(strdict, args->tmph); if ( args->tmps.m ) free(args->tmps.s); @@ -3146,11 +3414,12 @@ static void usage(void) fprintf(stderr, " -0 --missing-to-ref Assume genotypes at missing sites are 0/0\n"); fprintf(stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n"); fprintf(stderr, " -F, --filter-logic x|+ Remove filters if some input is PASS (\"x\"), or apply all filters (\"+\") [+]\n"); - fprintf(stderr, " -g, --gvcf -|REF.FA Merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n"); + fprintf(stderr, " -g, --gvcf -|REF.FA Merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max -M PL:max,AD:0\n"); fprintf(stderr, " -i, --info-rules TAG:METHOD,.. 
Rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n"); fprintf(stderr, " -l, --file-list FILE Read file names from the file\n"); fprintf(stderr, " -L, --local-alleles INT EXPERIMENTAL: if more than ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n"); fprintf(stderr, " -m, --merge STRING Allow multiallelic records for , see man page for details [both]\n"); + fprintf(stderr, " -M, --missing-rules TAG:METHOD Rules for replacing missing values in numeric vectors (.,0,max) when unknown allele <*> is not present [.]\n"); fprintf(stderr, " --no-index Merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n"); fprintf(stderr, " --no-version Do not append version and command line to the header\n"); fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n"); @@ -3159,6 +3428,7 @@ static void usage(void) fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); exit(1); } @@ -3197,13 +3467,15 @@ int main_vcfmerge(int argc, char *argv[]) {"regions-file",required_argument,NULL,'R'}, {"regions-overlap",required_argument,NULL,4}, {"info-rules",required_argument,NULL,'i'}, + {"missing-rules",required_argument,NULL,'M'}, {"no-version",no_argument,NULL,8}, {"no-index",no_argument,NULL,10}, {"filter-logic",required_argument,NULL,'F'}, + {"write-index",no_argument,NULL,11}, {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0L:",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:M:l:g:F:0L:",loptions,NULL)) >= 0) { switch (c) { 
case 'L': args->local_alleles = strtol(optarg,&tmp,10); @@ -3227,6 +3499,7 @@ int main_vcfmerge(int argc, char *argv[]) break; case 'l': args->file_list = optarg; break; case 'i': args->info_rules = optarg; break; + case 'M': args->missing_rules_str = optarg; break; case 'o': args->output_fname = optarg; break; case 'O': switch (optarg[0]) { @@ -3254,7 +3527,7 @@ int main_vcfmerge(int argc, char *argv[]) else if ( !strcmp(optarg,"any") ) args->collapse |= COLLAPSE_ANY; else if ( !strcmp(optarg,"all") ) args->collapse |= COLLAPSE_ANY; else if ( !strcmp(optarg,"none") ) args->collapse = COLLAPSE_NONE; - else if ( !strcmp(optarg,"snp-ins-del") ) args->collapse = COLLAPSE_SNP_INS_DEL; + else if ( !strcmp(optarg,"snp-ins-del") ) args->collapse = COLLAPSE_SNP_INS_DEL|COLLAPSE_SNPS; else if ( !strcmp(optarg,"id") ) { args->collapse = COLLAPSE_NONE; args->merge_by_id = 1; } else error("The -m type \"%s\" is not recognised.\n", optarg); break; @@ -3271,6 +3544,7 @@ int main_vcfmerge(int argc, char *argv[]) case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case 10 : args->no_index = 1; break; + case 11 : args->write_index = 1; break; case 'h': case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); diff --git a/vcfnorm.c b/vcfnorm.c index 9538f8d01..02ad322d1 100644 --- a/vcfnorm.c +++ b/vcfnorm.c @@ -1,6 +1,6 @@ /* vcfnorm.c -- Left-align and normalize indels. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. Author: Petr Danecek @@ -40,6 +40,8 @@ THE SOFTWARE. 
*/ #include "bcftools.h" #include "rbuf.h" #include "abuf.h" +#include "gff.h" +#include "regidx.h" #define CHECK_REF_EXIT 1 #define CHECK_REF_WARN 2 @@ -86,8 +88,8 @@ typedef struct int32_t *int32_arr; int ntmp_arr1, ntmp_arr2, nint32_arr; kstring_t *tmp_str; - kstring_t *tmp_als, tmp_kstr; - int ntmp_als; + kstring_t *tmp_als, *tmp_del, tmp_kstr; + int ntmp_als, ntmp_del; rbuf_t rbuf; int buf_win; // maximum distance between two records to consider int aln_win; // the realignment window size (maximum repeat size) @@ -105,6 +107,13 @@ typedef struct int use_star_allele, ma_use_ref_allele; char *old_rec_tag; htsFile *out; + char *index_fn; + int write_index; + int right_align; + char *gff_fname; + gff_t *gff; + regidx_t *idx_tscript; + regitr_t *itr_tscript; } args_t; @@ -344,6 +353,157 @@ static void set_old_rec_tag(args_t *args, bcf1_t *dst, bcf1_t *src, int ialt) error("An error occurred while updating INFO/%s\n",args->old_rec_tag); } +static int is_left_align(args_t *args, bcf1_t *line) +{ + if ( args->right_align ) return 0; + if ( !args->gff ) return 1; + const char *chr = bcf_seqname(args->hdr,line); + if ( !strncasecmp("chr",chr,3) ) chr += 3; // strip 'chr' prefix, that's what we requested the GFF reader to do + if ( !regidx_overlap(args->idx_tscript,chr,line->pos,line->pos+line->rlen, args->itr_tscript) ) return 1; + + // if there are two conflicting overlapping transcripts, go with the default left-alignment + int has_fwd = 0; + while ( regitr_overlap(args->itr_tscript) ) + { + gf_tscript_t *tr = regitr_payload(args->itr_tscript, gf_tscript_t*); + if ( tr->strand==STRAND_FWD ) has_fwd = 1; + if ( tr->strand==STRAND_REV ) return 1; + } + // either no hit at all (then left-align) or everything was on fwd strand (then right-align) + return has_fwd ? 
0 : 1; +} +static hts_pos_t realign_left(args_t *args, bcf1_t *line) +{ + // trim from right + char *ref = NULL; + int i; + hts_pos_t nref=0, new_pos = line->pos; + kstring_t *als = args->tmp_als; + while (1) + { + // is the rightmost base identical in all alleles? + int min_len = als[0].l; + for (i=1; in_allele; i++) + { + if ( toupper(als[0].s[ als[0].l-1 ]) != toupper(als[i].s[ als[i].l-1 ]) ) break; + if ( als[i].l < min_len ) min_len = als[i].l; + } + if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed + if ( min_len<=1 && new_pos==0 ) break; + + int pad_from_left = 0; + for (i=0; in_allele; i++) // trim all alleles + { + als[i].l--; + if ( !als[i].l ) pad_from_left = 1; + } + if ( pad_from_left ) + { + // extend all alleles to the left by aln_win bases (unless close to the chr start). + // Extra bases will be trimmed from the left after this loop is done + int npad = new_pos >= args->aln_win ? args->aln_win : new_pos; + free(ref); + ref = faidx_fetch_seq64(args->fai, bcf_seqname(args->hdr,line), new_pos-npad, new_pos-1, &nref); + if ( !ref ) error("faidx_fetch_seq64 failed at %s:%"PRId64"\n", bcf_seqname(args->hdr,line), (int64_t) new_pos-npad+1); + replace_iupac_codes(ref,nref); + for (i=0; in_allele; i++) + { + ks_resize(&als[i], als[i].l + npad); + if ( als[i].l ) memmove(als[i].s+npad,als[i].s,als[i].l); + memcpy(als[i].s,ref,npad); + als[i].l += npad; + } + new_pos -= npad; + } + } + free(ref); + + // trim from left + int ntrim_left = 0; + while (1) + { + // is the first base identical in all alleles? 
+ int min_len = als[0].l - ntrim_left; + for (i=1; in_allele; i++) + { + if ( toupper(als[0].s[ntrim_left]) != toupper(als[i].s[ntrim_left]) ) break; + if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left; + } + if ( i!=line->n_allele || min_len<=1 ) break; // there are differences, cannot be trimmed + ntrim_left++; + } + if ( ntrim_left ) + { + for (i=0; in_allele; i++) + { + memmove(als[i].s,als[i].s+ntrim_left,als[i].l-ntrim_left); + als[i].l -= ntrim_left; + } + new_pos += ntrim_left; + } + return new_pos; +} + +static hts_pos_t realign_right(args_t *args, bcf1_t *line) +{ + char *ref = NULL; + int i; + hts_pos_t new_pos = line->pos, nref = 0; + kstring_t *als = args->tmp_als; + + // trim from left + int ntrim_left = 0, npad_right = line->rlen, has_indel = 0; + while (1) + { + // is the leftmost base identical in all alleles? + int min_len = als[0].l - ntrim_left; + for (i=1; in_allele; i++) + { + if ( als[0].l!=als[i].l ) has_indel = 1; + if ( toupper(als[0].s[ntrim_left]) != toupper(als[i].s[ntrim_left]) ) break; + if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left; + } + if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed further + + ntrim_left++; + if ( min_len<=1 ) // pad from the right + { + free(ref); + ref = faidx_fetch_seq64(args->fai, bcf_seqname(args->hdr,line), line->pos + npad_right, line->pos + npad_right + args->aln_win, &nref); + if ( !ref ) error("faidx_fetch_seq64 failed at %s:%"PRIhts_pos"\n",bcf_seqname(args->hdr,line), new_pos + ntrim_left); + npad_right += args->aln_win; + replace_iupac_codes(ref,nref); + for (i=0; in_allele; i++) kputs(ref, &als[i]); + } + } + ntrim_left -= has_indel; + if ( ntrim_left > 0 ) + { + for (i=0; in_allele; i++) + { + memmove(als[i].s, als[i].s + ntrim_left, als[i].l - ntrim_left); + als[i].l -= ntrim_left; + } + new_pos += ntrim_left; + } + free(ref); + + // trim from right + while (1) + { + // is the last base identical in all alleles? 
+ int min_len = als[0].l; + for (i=1; in_allele; i++) + { + if ( toupper(als[0].s[ als[0].l-1 ]) != toupper(als[i].s[ als[i].l-1 ]) ) break; + if ( min_len > als[i].l ) min_len = als[i].l; + } + if ( i!=line->n_allele || min_len<=1 ) break; // there are differences, cannot be trimmed more + for (i=0; in_allele; i++) { als[i].l--; als[i].s[als[i].l]=0; } + } + return new_pos; +} + #define ERR_DUP_ALLELE -2 #define ERR_REF_MISMATCH -1 #define ERR_OK 0 @@ -396,10 +556,32 @@ static int realign(args_t *args, bcf1_t *line) // make a copy of each allele for trimming hts_expand0(kstring_t,line->n_allele,args->ntmp_als,args->tmp_als); + hts_expand0(kstring_t,line->n_allele,args->ntmp_del,args->tmp_del); kstring_t *als = args->tmp_als; + kstring_t *del = args->tmp_del; for (i=0; in_allele; i++) { - if ( line->d.allele[i][0]=='<' ) return ERR_SYMBOLIC; // symbolic allele + del[i].l = 0; + if ( line->d.allele[i][0]=='<' ) + { + // symbolic allele, only will be realigned + if ( strncmp("d.allele[i],4) ) return ERR_SYMBOLIC; + if ( nref < line->rlen ) + { + free(ref); + reflen = line->rlen; + ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref); + if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1); + seq_to_upper(ref,0); + replace_iupac_codes(ref,nref); // any non-ACGT character in fasta ref is replaced with N + als[0].l = 0; + kputs(ref, &als[0]); + als[i].l = 0; + kputsn(ref,1,&als[i]); + kputs(line->d.allele[i],&del[i]); + continue; + } + } if ( line->d.allele[i][0]=='*' ) return ERR_SPANNING_DELETION; // spanning deletion if ( has_non_acgtn(line->d.allele[i],line->shared.l) ) { @@ -416,69 +598,17 @@ static int realign(args_t *args, bcf1_t *line) if ( i>0 && als[i].l==als[0].l && !strcasecmp(als[0].s,als[i].s) ) return ERR_DUP_ALLELE; } - - // trim from right - int new_pos = line->pos; - while (1) - { - // is the rightmost base 
identical in all alleles? - int min_len = als[0].l; - for (i=1; in_allele; i++) - { - if ( toupper(als[0].s[ als[0].l-1 ])!=toupper(als[i].s[ als[i].l-1 ]) ) break; - if ( als[i].l < min_len ) min_len = als[i].l; - } - if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed - if ( min_len<=1 && new_pos==0 ) break; - - int pad_from_left = 0; - for (i=0; in_allele; i++) // trim all alleles - { - als[i].l--; - if ( !als[i].l ) pad_from_left = 1; - } - if ( pad_from_left ) - { - int npad = new_pos >= args->aln_win ? args->aln_win : new_pos; - free(ref); - ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, new_pos-npad, new_pos-1, &nref); - if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) new_pos-npad+1); - replace_iupac_codes(ref,nref); - for (i=0; in_allele; i++) - { - ks_resize(&als[i], als[i].l + npad); - if ( als[i].l ) memmove(als[i].s+npad,als[i].s,als[i].l); - memcpy(als[i].s,ref,npad); - als[i].l += npad; - } - new_pos -= npad; - } - } free(ref); + ref = NULL; - // trim from left - int ntrim_left = 0; - while (1) - { - // is the first base identical in all alleles? - int min_len = als[0].l - ntrim_left; - for (i=1; in_allele; i++) - { - if ( als[0].s[ntrim_left]!=als[i].s[ntrim_left] ) break; - if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left; - } - if ( i!=line->n_allele || min_len<=1 ) break; // there are differences, cannot be trimmed - ntrim_left++; - } - if ( ntrim_left ) - { - for (i=0; in_allele; i++) - { - memmove(als[i].s,als[i].s+ntrim_left,als[i].l-ntrim_left); - als[i].l -= ntrim_left; - } - new_pos += ntrim_left; - } + // which direction are we aligning? + int left_align = is_left_align(args, line); + + hts_pos_t new_pos; + if ( left_align ) + new_pos = realign_left(args, line); + else + new_pos = realign_right(args, line); // Have the alleles changed? 
als[0].s[ als[0].l ] = 0; // in order for strcmp to work @@ -491,7 +621,8 @@ static int realign(args_t *args, bcf1_t *line) for (i=0; in_allele; i++) { if (i>0) kputc(',',&args->tmp_kstr); - kputsn(als[i].s,als[i].l,&args->tmp_kstr); + if ( del[i].l ) kputs(del[i].s,&args->tmp_kstr); + else kputsn(als[i].s,als[i].l,&args->tmp_kstr); } args->tmp_kstr.s[ args->tmp_kstr.l ] = 0; bcf_update_alleles_str(args->out_hdr,line,args->tmp_kstr.s); @@ -1281,10 +1412,12 @@ static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_ ngts2 /= nsmpl; if ( ngts!=ngts2 ) error("Error at %s:%"PRId64": cannot combine diploid with haploid genotype\n", bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); - int32_t *gt = (int32_t*) args->tmp_arr1; - int32_t *gt2 = (int32_t*) args->tmp_arr2; + int32_t *gt = (int32_t*) args->tmp_arr1; // the first, destination line + int32_t *gt2 = (int32_t*) args->tmp_arr2; // one of the subsequent lines, i.e. the source line for (j=0; j=args->maps[i].nals ) error("Error at %s:%"PRId64": incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1,ial2); + + // The destination allele int ial = args->maps[i].map[ial2]; - for (k=0; kabuf, const char*, INFO_TAG, args->old_rec_tag); abuf_set_opt(args->abuf, int, STAR_ALLELE, args->use_star_allele); } + if ( args->gff_fname ) + { + args->gff = gff_init(args->gff_fname); + gff_set(args->gff,verbosity,1); + gff_set(args->gff,strip_chr_names,1); + gff_parse(args->gff); + args->idx_tscript = gff_get(args->gff,idx_tscript); + args->itr_tscript = regitr_init(NULL); + } } static void destroy_data(args_t *args) { + if ( args->gff ) + { + gff_destroy(args->gff); + regitr_destroy(args->itr_tscript); + } cmpals_destroy(&args->cmpals_in); cmpals_destroy(&args->cmpals_out); int i; @@ -1929,7 +2082,10 @@ static void destroy_data(args_t *args) free(args->maps[i].map); for (i=0; intmp_als; i++) free(args->tmp_als[i].s); + for (i=0; intmp_del; i++) + 
free(args->tmp_del[i].s); free(args->tmp_als); + free(args->tmp_del); free(args->tmp_kstr.s); if ( args->tmp_str ) { @@ -2018,6 +2174,7 @@ static void normalize_vcf(args_t *args) hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p); if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_norm"); if ( bcf_hdr_write(args->out, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); bcf1_t *line; int prev_rid = -1, prev_pos = -1, prev_type = 0; @@ -2081,6 +2238,15 @@ static void normalize_vcf(args_t *args) if ( j>0 ) flush_buffer(args, args->out, j); } flush_buffer(args, args->out, args->rbuf.n); + if ( args->write_index ) + { + if ( bcf_idx_save(args->out)<0 ) + { + if ( hts_close(args->out)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(args->out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); fprintf(stderr,"Lines total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped); @@ -2104,6 +2270,7 @@ static void usage(void) fprintf(stderr, " -d, --rm-dup TYPE Remove duplicate snps|indels|both|all|exact\n"); fprintf(stderr, " -f, --fasta-ref FILE Reference sequence\n"); fprintf(stderr, " --force Try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n"); + fprintf(stderr, " -g, --gff-annot FILE Follow HGVS 3'rule and right-align variants in transcripts on the forward strand\n"); fprintf(stderr, " --keep-sum TAG,.. 
Keep vector sum constant when splitting multiallelics (see github issue #360)\n"); fprintf(stderr, " -m, --multiallelics -|+TYPE Split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n"); fprintf(stderr, " --multi-overlaps 0|. Fill in the reference (0) or missing (.) allele when splitting multiallelics [0]\n"); @@ -2121,6 +2288,7 @@ static void usage(void) fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); fprintf(stderr, " -w, --site-win INT Buffer for sorting lines which changed position during realignment [1000]\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Examples:\n"); fprintf(stderr, " # normalize and left-align indels\n"); @@ -2163,6 +2331,8 @@ int main_vcfnorm(int argc, char *argv[]) {"old-rec-tag",required_argument,NULL,12}, {"keep-sum",required_argument,NULL,10}, {"fasta-ref",required_argument,NULL,'f'}, + {"gff-annot",required_argument,NULL,'g'}, + {"right-align",no_argument,NULL,15}, // undocumented, only for debugging {"do-not-normalize",no_argument,NULL,'N'}, {"multiallelics",required_argument,NULL,'m'}, {"multi-overlaps",required_argument,NULL,13}, @@ -2181,10 +2351,11 @@ int main_vcfnorm(int argc, char *argv[]) {"check-ref",required_argument,NULL,'c'}, {"strict-filter",no_argument,NULL,'s'}, {"no-version",no_argument,NULL,8}, + {"write-index",no_argument,NULL,14}, {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNa",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNag:",loptions,NULL)) >= 0) { switch (c) { case 10: // possibly generalize this also to INFO/AD and other tags @@ -2192,6 +2363,7 @@ int main_vcfnorm(int argc, char *argv[]) error("Error: only --keep-sum AD is currently supported. 
See https://github.com/samtools/bcftools/issues/360 for more.\n"); args->keep_sum_ad = 1; // this will be set to the header id or -1 in init_data break; + case 'g': args->gff_fname = optarg; break; case 'a': args->atomize = SPLIT; break; case 11 : if ( optarg[0]=='*' ) args->use_star_allele = 1; @@ -2204,6 +2376,8 @@ int main_vcfnorm(int argc, char *argv[]) else if ( optarg[0]=='.' ) args->ma_use_ref_allele = 0; else error("Invalid argument to --multi-overlaps\n"); break; + case 14 : args->write_index = 1; break; + case 15 : args->right_align = 1; break; case 'N': args->do_indels = 0; break; case 'd': if ( !strcmp("snps",optarg) ) args->rmdup = BCF_SR_PAIR_SNPS; diff --git a/vcfplugin.c b/vcfplugin.c index 45686680a..687751961 100644 --- a/vcfplugin.c +++ b/vcfplugin.c @@ -1,6 +1,6 @@ /* vcfplugin.c -- plugin modules for operating on VCF/BCF files. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. Author: Petr Danecek @@ -149,6 +149,8 @@ typedef struct _args_t char **argv, *output_fname, *regions_list, *targets_list; int argc, drop_header, verbose, record_cmd_line, plist_only; + char *index_fn; + int write_index; } args_t; @@ -548,6 +550,7 @@ static void init_data(args_t *args) if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->hdr_out,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); } } @@ -569,7 +572,19 @@ static void destroy_data(args_t *args) } if ( args->filter ) filter_destroy(args->filter); - if (args->out_fh && hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->output_fname); + if (args->out_fh ) + { + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } + if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); + } } static void usage(args_t *args) @@ -598,6 +613,7 @@ static void usage(args_t *args) fprintf(stderr, " -l, --list-plugins List available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n"); fprintf(stderr, " -v, --verbose Print verbose information, -vv increases verbosity\n"); fprintf(stderr, " -V, --version Print version string and exit\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); exit(1); } @@ -643,9 +659,9 @@ int main_plugin(int argc, char *argv[]) if ( argv[1][0]!='-' ) { args->verbose = is_verbose(argc, argv); - plugin_name = argv[1]; - argc--; - argv++; + plugin_name = argv[1]; + argc--; + argv++; load_plugin(args, plugin_name, 1, &args->plugin); if ( args->plugin.run ) { @@ -675,6 +691,7 @@ int main_plugin(int argc, char *argv[]) {"targets-file",required_argument,NULL,'T'}, {"targets-overlap",required_argument,NULL,2}, {"no-version",no_argument,NULL,8}, + {"write-index",no_argument,NULL,10}, {NULL,0,NULL,0} }; char *tmp; @@ -723,6 +740,7 @@ int main_plugin(int argc, char *argv[]) break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; + case 10 : args->write_index = 1; break; case '?': case 'h': usage_only = 1; break; default: error("Unknown argument: %s\n", optarg); diff --git a/vcfquery.c b/vcfquery.c index 889f36324..5f4eb07c6 100644 --- a/vcfquery.c +++ b/vcfquery.c @@ -1,6 +1,6 @@ /* vcfquery.c -- Extracts fields from VCF/BCF file. 
- Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. Author: Petr Danecek @@ -94,6 +94,7 @@ static void init_data(args_t *args) smpl_ilist_destroy(ilist); } args->convert = convert_init(args->header, samples, nsamples, args->format_str); + convert_set_option(args->convert, force_newline, 1); convert_set_option(args->convert, subset_samples, &args->smpl_pass); if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1); free(samples); diff --git a/vcfsort.c b/vcfsort.c index 1de2b2867..3b208a0d3 100644 --- a/vcfsort.c +++ b/vcfsort.c @@ -1,6 +1,6 @@ /* vcfsort.c -- sort subcommand - Copyright (C) 2017-2022 Genome Research Ltd. + Copyright (C) 2017-2023 Genome Research Ltd. Author: Petr Danecek @@ -62,6 +62,8 @@ typedef struct _args_t uint8_t *mem_block; size_t nbuf, mbuf, nblk; blk_t *blk; + char *index_fn; + int write_index; } args_t; @@ -300,6 +302,7 @@ void merge_blocks(args_t *args) set_wmode(wmode,args->output_type,args->output_fname,args->clevel); htsFile *out = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(out,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); while ( bhp->ndat ) { blk_t *blk = bhp->dat[0]; @@ -307,6 +310,15 @@ void merge_blocks(args_t *args) khp_delete(blk, bhp); blk_read(args, bhp, args->hdr, blk); } + if ( args->write_index ) + { + if ( bcf_idx_save(out)<0 ) + { + if ( hts_close(out)!=0 ) error("Error: close failed .. 
%s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(out)!=0 ) clean_files_and_throw(args, "Close failed: %s\n", args->output_fname); clean_files(args); @@ -333,6 +345,7 @@ static void usage(args_t *args) #else fprintf(stderr, " -T, --temp-dir DIR temporary files [/tmp/bcftools.XXXXXX]\n"); #endif + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); exit(1); } @@ -395,6 +408,7 @@ int main_sort(int argc, char *argv[]) {"output-file",required_argument,NULL,'o'}, {"output",required_argument,NULL,'o'}, {"help",no_argument,NULL,'h'}, + {"write-index",no_argument,NULL,1}, {0,0,0,0} }; char *tmp; @@ -423,6 +437,7 @@ int main_sort(int argc, char *argv[]) if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); } break; + case 1 : args->write_index = 1; break; case 'h': case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); diff --git a/vcfstats.c b/vcfstats.c index 10189fef9..e2744ab3c 100644 --- a/vcfstats.c +++ b/vcfstats.c @@ -70,6 +70,13 @@ typedef struct } idist_t; +// variant allele frequency (fraction of alt allele in pileup as determined from AD) collected into 0.05 bins +typedef struct +{ + int snv[21], indel[21]; +} +vaf_t; + typedef struct { uint64_t n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts; @@ -93,7 +100,8 @@ typedef struct int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, *smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl; int *smpl_hapRef, *smpl_hapAlt, *smpl_missing; int *smpl_ins_hets, *smpl_del_hets, *smpl_ins_homs, *smpl_del_homs; - int *smpl_frm_shifts; // not-applicable, in-frame, out-frame + int *smpl_frm_shifts; // not-applicable, in-frame, out-frame + vaf_t vaf, *smpl_vaf; // total (INFO/AD) and per-sample (FMT/VAF) VAF distributions unsigned long int *smpl_dp; idist_t dp, 
dp_sites; int nusr; @@ -141,7 +149,9 @@ typedef struct gtcmp_t *af_gts_snps, *af_gts_indels; // first bin of af_* stats are singletons bin_t *af_bins; float *farr; - int mfarr; + int32_t *iarr; + int mfarr, miarr; + int nref_tot, nhet_tot, nalt_tot, n_nref, i_nref; // indel context indel_ctx_t *indel_ctx; @@ -447,6 +457,8 @@ static void init_stats(args_t *args) if ( args->af_tag && !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,bcf_hdr_id2int(hdr,BCF_DT_ID,args->af_tag)) ) error("No such INFO tag: %s\n", args->af_tag); + int id, has_fmt_ad = ((id=bcf_hdr_id2int(hdr,BCF_DT_ID,"AD"))>=0 && bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id)) ? 1 : 0; + #if QUAL_STATS args->m_qual = 999; #endif @@ -501,6 +513,8 @@ static void init_stats(args_t *args) stats->smpl_dp = (unsigned long int *) calloc(args->files->n_smpl,sizeof(unsigned long int)); stats->smpl_ndp = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_sngl = (int *) calloc(args->files->n_smpl,sizeof(int)); + if ( has_fmt_ad ) + stats->smpl_vaf = (vaf_t*) calloc(args->files->n_smpl,sizeof(vaf_t)); #if HWE_STATS stats->af_hwe = (int*) calloc(args->m_af*args->naf_hwe,sizeof(int)); #endif @@ -586,6 +600,7 @@ static void destroy_stats(args_t *args) free(stats->smpl_dp); free(stats->smpl_ndp); free(stats->smpl_sngl); + free(stats->smpl_vaf); idist_destroy(&stats->dp); idist_destroy(&stats->dp_sites); for (j=0; jnusr; j++) @@ -602,6 +617,7 @@ static void destroy_stats(args_t *args) for (j=0; jnusr; j++) free(args->usr[j].tag); if ( args->af_bins ) bin_destroy(args->af_bins); free(args->farr); + free(args->iarr); free(args->usr); free(args->tmp_frm); free(args->tmp_iaf); @@ -615,6 +631,8 @@ static void destroy_stats(args_t *args) if (args->filter[1]) filter_destroy(args->filter[1]); } +// The arary tmp_iaf keeps the index of AF bin for each allele, the first bin is for singletons. 
+// The number of bins, either m_af (101) or as given by the user in --af-bins static void init_iaf(args_t *args, bcf_sr_t *reader) { bcf1_t *line = reader->buffer[0]; @@ -869,205 +887,279 @@ static void do_snp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) } } -static inline void update_dvaf(stats_t *stats, bcf1_t *line, bcf_fmt_t *fmt, int ismpl, int ial, int jal) +// Returns the max non-ref AD value +static inline int get_ad(bcf1_t *line, bcf_fmt_t *ad_fmt_ptr, int ismpl, int *ial) { - if ( !fmt ) return; - - float dvaf; + int iv, ad = 0; + *ial = 0; #define BRANCH_INT(type_t,missing,vector_end) { \ - type_t *p = (type_t *) (fmt->p + fmt->size*ismpl); \ - if ( p[ial]==vector_end || p[jal]==vector_end ) return; \ - if ( p[ial]==missing || p[jal]==missing ) return; \ - if ( !p[ial] && !p[jal] ) return; \ - dvaf = (float)p[ial]/(p[ial]+p[jal]); \ + type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \ + for (iv=1; ivn; iv++) \ + { \ + if ( ptr[iv]==vector_end ) break; \ + if ( ptr[iv]==missing ) continue; \ + if ( ad < ptr[iv] ) { ad = ptr[iv]; *ial = iv; }\ + } \ } - switch (fmt->type) { + switch (ad_fmt_ptr->type) { case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; - default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, fmt->type); exit(1); break; + default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, ad_fmt_ptr->type); exit(1); break; } #undef BRANCH_INT - + return ad; +} +static inline int get_iad(bcf1_t *line, bcf_fmt_t *ad_fmt_ptr, int ismpl, int ial) +{ + #define BRANCH_INT(type_t,missing,vector_end) { \ + type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \ + if ( ptr[ial]==vector_end ) return 0; \ + if ( ptr[ial]==missing ) return 0; \ + return ptr[ial]; \ + } + switch (ad_fmt_ptr->type) { + case 
BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; + default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, ad_fmt_ptr->type); exit(1); break; + } + #undef BRANCH_INT +} +static inline void update_dvaf(stats_t *stats, bcf1_t *line, int ial, float vaf) +{ int len = line->d.var[ial].n; if ( len < -stats->m_indel ) len = -stats->m_indel; else if ( len > stats->m_indel ) len = stats->m_indel; int bin = stats->m_indel + len; stats->nvaf[bin]++; - stats->dvaf[bin] += dvaf; + stats->dvaf[bin] += vaf; +} +#define vaf2bin(vaf) ((int)nearbyintf((vaf)/0.05)) +static inline void update_vaf(vaf_t *smpl_vaf, bcf1_t *line, int ial, float vaf) +{ + int idx = vaf2bin(vaf); + if ( bcf_get_variant_type(line,ial)==VCF_SNP ) smpl_vaf->snv[idx]++; + else smpl_vaf->indel[idx]++; } -static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int matched) +static inline int calc_sample_depth(args_t *args, int ismpl, bcf_fmt_t *ad_fmt_ptr, bcf_fmt_t *dp_fmt_ptr) { - bcf_srs_t *files = args->files; - bcf1_t *line = reader->buffer[0]; - bcf_fmt_t *fmt_ptr; - int nref_tot = 0, nhet_tot = 0, nalt_tot = 0; - int line_type = bcf_get_variant_types(line); + if ( dp_fmt_ptr ) + { + #define BRANCH_INT(type_t,missing,vector_end) { \ + type_t *ptr = (type_t *) (dp_fmt_ptr->p + dp_fmt_ptr->size*ismpl); \ + if ( *ptr==missing || *ptr==vector_end ) return -1; \ + return *ptr; \ + } + switch (dp_fmt_ptr->type) { + case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; + default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, dp_fmt_ptr->type); exit(1); break; + } + #undef 
BRANCH_INT + } + if ( ad_fmt_ptr ) + { + int iv, dp = 0, has_value = 0; + #define BRANCH_INT(type_t,missing,vector_end) { \ + type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \ + for (iv=0; ivn; iv++) \ + { \ + if ( ptr[iv]==vector_end ) break; \ + if ( ptr[iv]==missing ) continue; \ + has_value = 1; \ + dp += ptr[iv]; \ + } \ + } + switch (ad_fmt_ptr->type) { + case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; + default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, ad_fmt_ptr->type); exit(1); break; + } + #undef BRANCH_INT + if ( !has_value ) return -1; + return dp; + } + return -1; +} +static inline void sample_gt_stats(args_t *args, stats_t *stats, bcf1_t *line, int ismpl, int gt, int ial, int jal) +{ + if ( gt==GT_UNKN ) + { + stats->smpl_missing[ismpl]++; + return; + } - if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"GT")) ) + int var_type = 0; + if ( ial>0 ) var_type |= bcf_get_variant_type(line,ial); + if ( jal>0 ) var_type |= bcf_get_variant_type(line,jal); + if ( gt==GT_HAPL_R || gt==GT_HAPL_A ) { - bcf_fmt_t *ad_fmt_ptr = bcf_get_variant_types(line)&VCF_INDEL ? 
bcf_get_fmt(reader->header,reader->buffer[0],"AD") : NULL; + if ( var_type&VCF_INDEL && stats->smpl_frm_shifts ) + { + assert( ialn_allele ); + stats->smpl_frm_shifts[ismpl*3 + args->tmp_frm[ial]]++; + } + if ( gt == GT_HAPL_R ) stats->smpl_hapRef[ismpl]++; + if ( gt == GT_HAPL_A ) stats->smpl_hapAlt[ismpl]++; + return; + } + if ( gt != GT_HOM_RR ) { args->n_nref++; args->i_nref = ismpl; } + #if HWE_STATS + switch (gt) + { + case GT_HOM_RR: args->nref_tot++; break; + case GT_HET_RA: args->nhet_tot++; break; + case GT_HET_AA: + case GT_HOM_AA: args->nalt_tot++; break; + } + #endif - int ref = bcf_acgt2int(*line->d.allele[0]); - int is, n_nref = 0, i_nref = 0; - for (is=0; isfiles->n_smpl; is++) + if ( var_type&VCF_SNP || var_type==VCF_REF ) // count ALT=. as SNP + { + if ( gt == GT_HET_RA ) stats->smpl_hets[ismpl]++; + else if ( gt == GT_HET_AA ) stats->smpl_hets[ismpl]++; + else if ( gt == GT_HOM_RR ) stats->smpl_homRR[ismpl]++; + else if ( gt == GT_HOM_AA ) stats->smpl_homAA[ismpl]++; + if ( gt != GT_HOM_RR && line->d.var[ial].type&VCF_SNP ) // this is safe, bcf_get_variant_types has been already called { - int ial, jal; - int gt = bcf_gt_type(fmt_ptr, reader->samples[is], &ial, &jal); - if ( gt==GT_UNKN ) - { - stats->smpl_missing[is]++; - continue; - } - if ( gt==GT_HAPL_R || gt==GT_HAPL_A ) + int ref = bcf_acgt2int(*line->d.allele[0]); + int alt = bcf_acgt2int(*line->d.allele[ial]); + if ( alt<0 ) return; + if ( abs(ref-alt)==2 ) + stats->smpl_ts[ismpl]++; + else + stats->smpl_tv[ismpl]++; + } + if ( gt != GT_HOM_RR && line->d.var[jal].type&VCF_SNP && ial!=jal ) + { + int ref = bcf_acgt2int(*line->d.allele[0]); + int alt = bcf_acgt2int(*line->d.allele[jal]); + if ( alt<0 ) return; + if ( abs(ref-alt)==2 ) + stats->smpl_ts[ismpl]++; + else + stats->smpl_tv[ismpl]++; + } + } + if ( var_type&VCF_INDEL ) + { + if ( gt != GT_HOM_RR ) + { + stats->smpl_indels[ismpl]++; + if ( gt==GT_HET_RA || gt==GT_HET_AA ) { - if ( line_type&VCF_INDEL && stats->smpl_frm_shifts ) + 
int is_ins = 0, is_del = 0; + if ( bcf_get_variant_type(line,ial)&VCF_INDEL ) { - assert( ialn_allele ); - stats->smpl_frm_shifts[is*3 + args->tmp_frm[ial]]++; + if ( line->d.var[ial].n < 0 ) is_del = 1; + else is_ins = 1; } - if ( gt == GT_HAPL_R ) stats->smpl_hapRef[is]++; - if ( gt == GT_HAPL_A ) stats->smpl_hapAlt[is]++; - continue; - } - if ( gt != GT_HOM_RR ) { n_nref++; i_nref = is; } - #if HWE_STATS - switch (gt) + if ( bcf_get_variant_type(line,jal)&VCF_INDEL ) { - case GT_HOM_RR: nref_tot++; break; - case GT_HET_RA: nhet_tot++; break; - case GT_HET_AA: - case GT_HOM_AA: nalt_tot++; break; - } - #endif - int var_type = 0; - if ( ial>0 ) var_type |= bcf_get_variant_type(line,ial); - if ( jal>0 ) var_type |= bcf_get_variant_type(line,jal); - if ( var_type&VCF_SNP || var_type==VCF_REF ) // count ALT=. as SNP - { - if ( gt == GT_HET_RA ) stats->smpl_hets[is]++; - else if ( gt == GT_HET_AA ) stats->smpl_hets[is]++; - else if ( gt == GT_HOM_RR ) stats->smpl_homRR[is]++; - else if ( gt == GT_HOM_AA ) stats->smpl_homAA[is]++; - if ( gt != GT_HOM_RR && line->d.var[ial].type&VCF_SNP ) // this is safe, bcf_get_variant_types has been already called - { - int alt = bcf_acgt2int(*line->d.allele[ial]); - if ( alt<0 ) continue; - if ( abs(ref-alt)==2 ) - stats->smpl_ts[is]++; - else - stats->smpl_tv[is]++; + if ( line->d.var[jal].n < 0 ) is_del = 1; + else is_ins = 1; } + // Note that alt-het genotypes with both ins and del allele are counted twice!! 
+ if ( is_del ) stats->smpl_del_hets[ismpl]++; + if ( is_ins ) stats->smpl_ins_hets[ismpl]++; } - if ( var_type&VCF_INDEL ) + else if ( gt==GT_HOM_AA ) { - if ( gt != GT_HOM_RR ) - { - stats->smpl_indels[is]++; - - if ( gt==GT_HET_RA || gt==GT_HET_AA ) - { - int is_ins = 0, is_del = 0; - if ( bcf_get_variant_type(line,ial)&VCF_INDEL ) - { - if ( line->d.var[ial].n < 0 ) is_del = 1; - else is_ins = 1; - update_dvaf(stats,line,ad_fmt_ptr,is,ial,jal); - } - if ( bcf_get_variant_type(line,jal)&VCF_INDEL ) - { - if ( line->d.var[jal].n < 0 ) is_del = 1; - else is_ins = 1; - update_dvaf(stats,line,ad_fmt_ptr,is,jal,ial); - } - // Note that alt-het genotypes with both ins and del allele are counted twice!! - if ( is_del ) stats->smpl_del_hets[is]++; - if ( is_ins ) stats->smpl_ins_hets[is]++; - } - else if ( gt==GT_HOM_AA ) - { - if ( line->d.var[ial].n < 0 ) stats->smpl_del_homs[is]++; - else stats->smpl_ins_homs[is]++; - } - } - if ( stats->smpl_frm_shifts ) - { - assert( ialn_allele && jaln_allele ); - stats->smpl_frm_shifts[is*3 + args->tmp_frm[ial]]++; - stats->smpl_frm_shifts[is*3 + args->tmp_frm[jal]]++; - } + if ( line->d.var[ial].n < 0 ) stats->smpl_del_homs[ismpl]++; + else stats->smpl_ins_homs[ismpl]++; } } - if ( n_nref==1 ) stats->smpl_sngl[i_nref]++; + if ( stats->smpl_frm_shifts ) + { + assert( ialn_allele && jaln_allele ); + stats->smpl_frm_shifts[ismpl*3 + args->tmp_frm[ial]]++; + stats->smpl_frm_shifts[ismpl*3 + args->tmp_frm[jal]]++; + } } +} +static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int matched) +{ + bcf_srs_t *files = args->files; + bcf1_t *line = reader->buffer[0]; - #if HWE_STATS - if ( nhet_tot + nref_tot + nalt_tot ) + args->nref_tot = 0; + args->nhet_tot = 0; + args->nalt_tot = 0; + args->n_nref = 0; + args->i_nref = 0; + + bcf_fmt_t *gt_fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"GT"); + bcf_fmt_t *ad_fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"AD"); + bcf_fmt_t *dp_fmt_ptr = 
bcf_get_fmt(reader->header,reader->buffer[0],"DP"); + + int is; + for (is=0; isfiles->n_smpl; is++) + { + // Determine depth + int dp = calc_sample_depth(args,is,ad_fmt_ptr,dp_fmt_ptr); + if ( dp>0 ) { - float het_frac = (float)nhet_tot/(nhet_tot + nref_tot + nalt_tot); - int idx = het_frac*(args->naf_hwe - 1); -//check me: what is this? - if ( line->n_allele>1 ) idx += args->naf_hwe*args->tmp_iaf[1]; - stats->af_hwe[idx]++; + (*idist(&stats->dp, dp))++; + stats->smpl_ndp[is]++; + stats->smpl_dp[is] += dp; } - #endif - if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"DP")) ) - { - #define BRANCH_INT(type_t,missing,vector_end) { \ - int is; \ - for (is=0; isfiles->n_smpl; is++) \ - { \ - type_t *p = (type_t *) (fmt_ptr->p + fmt_ptr->size*is); \ - if ( *p==vector_end ) continue; \ - if ( *p!=missing ) \ - { \ - (*idist(&stats->dp, *p))++; \ - stats->smpl_ndp[is]++; \ - stats->smpl_dp[is] += *p; \ - } \ - } \ + // Determine genotype + int ial, jal, gt=GT_UNKN; + if ( gt_fmt_ptr ) + { + gt = bcf_gt_type(gt_fmt_ptr, reader->samples[is], &ial, &jal); + sample_gt_stats(args,stats,line,is,gt,ial,jal); } - switch (fmt_ptr->type) { - case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; - case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; - case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; - default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); exit(1); break; + + // Determine variant allele frequency + if ( dp>0 && ad_fmt_ptr ) + { + float iad = 0, jad = 0; + if ( gt==GT_UNKN ) // GT not available + { + iad = get_ad(line,ad_fmt_ptr,is,&ial); + } + else if ( gt!=GT_UNKN ) + { + iad = ial==0 ? 0 : get_iad(line,ad_fmt_ptr,is,ial); + jad = jal==0 ? 
0 : get_iad(line,ad_fmt_ptr,is,jal); + } + if ( iad ) + { + update_dvaf(stats,line,ial,(float)iad/dp); + update_vaf(&stats->smpl_vaf[is],line,ial,(float)iad/dp); + } + if ( jad && iad!=jad ) + { + update_dvaf(stats,line,jal,(float)jad/dp); + update_vaf(&stats->smpl_vaf[is],line,jal,(float)jad/dp); + } } - #undef BRANCH_INT } - else if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"AD")) ) + if ( args->n_nref==1 ) stats->smpl_sngl[args->i_nref]++; + +#if HWE_STATS + if ( gt_fmt_ptr && line->n_allele > 1 && (args->nref_tot || args->nhet_tot || args->nalt_tot) ) { - #define BRANCH_INT(type_t,missing,vector_end) { \ - int is,iv; \ - for (is=0; isfiles->n_smpl; is++) \ - { \ - type_t *p = (type_t *) (fmt_ptr->p + fmt_ptr->size*is); \ - int dp = 0, has_value = 0; \ - for (iv=0; ivn; iv++) \ - { \ - if ( p[iv]==vector_end ) break; \ - if ( p[iv]==missing ) continue; \ - has_value = 1; \ - dp += p[iv]; \ - } \ - if ( has_value ) \ - { \ - (*idist(&stats->dp, dp))++; \ - stats->smpl_ndp[is]++; \ - stats->smpl_dp[is] += dp; \ - } \ - } \ - } - switch (fmt_ptr->type) { - case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; - case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; - case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; - default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); exit(1); break; - } - #undef BRANCH_INT + // Number of heterozygous genotypes observed for any given allele frequency. This is used + // by plot-vcfstats to show the observed vs expected number of hets. There the expected number + // of hets is calculated from the probability P(het) = 2*AF*(1-AF). + // The array af_hwe is organized as follows + // m_af .. number of allele frequency bins + // naf_hwe .. 
the number of het genotype frequency bins + // iallele_freq*naf_hwe + ihet_freq + // + float het_frac = (float)args->nhet_tot / (args->nref_tot + args->nhet_tot + args->nalt_tot); + int ihet_freq = het_frac * (args->naf_hwe - 1); + int idx = ihet_freq + args->tmp_iaf[1] * args->naf_hwe; + stats->af_hwe[idx]++; } +#endif if ( matched==3 ) { @@ -1200,8 +1292,8 @@ static void do_vcf_stats(args_t *args) if ( files->n_smpl ) do_sample_stats(args, stats, reader, ret); - if ( bcf_get_info_int32(reader->header,line,"DP",&args->tmp_iaf,&args->ntmp_iaf)==1 ) - (*idist(&stats->dp_sites, args->tmp_iaf[0]))++; + if ( bcf_get_info_int32(reader->header,line,"DP",&args->iarr,&args->miarr)==1 ) + (*idist(&stats->dp_sites, args->iarr[0]))++; } } @@ -1736,6 +1828,24 @@ static void print_stats(args_t *args) } #endif } + + if ( args->stats[0].smpl_vaf ) + { + printf("# VAF, Variant Allele Frequency determined as fraction of alternate reads in FORMAT/AD\n"); + printf("# VAF\t[2]id\t[3]sample\t[4]SNV VAF distribution\t[5]indel VAF distribution\n"); + for (id=0; idnstats; id++) + { + stats_t *stats = &args->stats[id]; + for (i=0; ifiles->n_smpl; i++) + { + printf("VAF\t%d\t%s\t", id,args->files->samples[i]); + for (j=0; j<21; j++) printf("%s%d",j?",":"",stats->smpl_vaf[i].snv[j]); + printf("\t"); + for (j=0; j<21; j++) printf("%s%d",j?",":"",stats->smpl_vaf[i].indel[j]); + printf("\n"); + } + } + } } static void usage(void) diff --git a/vcfview.c b/vcfview.c index 96dcbc7b5..e09efa0bc 100644 --- a/vcfview.c +++ b/vcfview.c @@ -1,6 +1,6 @@ /* vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. 
Author: Shane McCarthy @@ -76,6 +76,8 @@ typedef struct _args_t char *include_types, *exclude_types; int include, exclude; int record_cmd_line; + char *index_fn; + int write_index; htsFile *out; } args_t; @@ -532,6 +534,7 @@ static void usage(args_t *args) fprintf(stderr, " -u/U, --uncalled/--exclude-uncalled Select/exclude sites without a called genotype\n"); fprintf(stderr, " -v/V, --types/--exclude-types LIST Select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n"); fprintf(stderr, " -x/X, --private/--exclude-private Select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); exit(1); } @@ -548,6 +551,7 @@ int main_vcfview(int argc, char *argv[]) args->output_type = FT_VCF; args->n_threads = 0; args->record_cmd_line = 1; + args->write_index = 0; args->min_ac = args->max_ac = args->min_af = args->max_af = -1; args->regions_overlap = 1; args->targets_overlap = 0; @@ -596,6 +600,7 @@ int main_vcfview(int argc, char *argv[]) {"phased",no_argument,NULL,'p'}, {"exclude-phased",no_argument,NULL,'P'}, {"no-version",no_argument,NULL,8}, + {"write-index",no_argument,NULL,10}, {NULL,0,NULL,0} }; char *tmp; @@ -727,6 +732,7 @@ int main_vcfview(int argc, char *argv[]) break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; + case 10 : args->write_index = 1; break; case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); } @@ -783,6 +789,8 @@ int main_vcfview(int argc, char *argv[]) else if ( args->output_type & FT_BCF ) error("BCF output requires header, cannot proceed with -H\n"); + if ( args->write_index && init_index(args->out,out_hdr,args->fn_out,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->fn_out); + int ret = 0; if (!args->header_only) { @@ -795,7 +803,18 @@ int main_vcfview(int 
argc, char *argv[]) ret = args->files->errnum; if ( ret ) fprintf(stderr,"Error: %s\n", bcf_sr_strerror(args->files->errnum)); } - hts_close(args->out); + + if (args->write_index) + { + if (bcf_idx_save(args->out) < 0) + { + if ( hts_close(args->out)!=0 ) error("Error: close failed %s\n", args->fn_out?args->fn_out:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } + + if ( hts_close(args->out)!=0 ) error("Error: close failed %s\n", args->fn_out?args->fn_out:"stdout"); destroy_data(args); bcf_sr_destroy(args->files); free(args); diff --git a/version.c b/version.c index 4306d4011..38417a78b 100644 --- a/version.c +++ b/version.c @@ -1,6 +1,6 @@ /* version.c -- report version numbers for plugins. - Copyright (C) 2014-2021 Genome Research Ltd. + Copyright (C) 2014-2023 Genome Research Ltd. Author: Petr Danecek @@ -72,22 +72,26 @@ const char *hts_bcf_wmode(int file_type) const char *hts_bcf_wmode2(int file_type, const char *fname) { if ( !fname ) return hts_bcf_wmode(file_type); - int len = strlen(fname); - if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) return hts_bcf_wmode(FT_BCF|FT_GZ); - if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) return hts_bcf_wmode(FT_VCF); - if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) return hts_bcf_wmode(FT_VCF|FT_GZ); - if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) return hts_bcf_wmode(FT_VCF|FT_GZ); + const char *end = fname ? strstr(fname, HTS_IDX_DELIM) : NULL; + if ( !end ) end = fname ? 
fname + strlen(fname) : fname; + int len = end - fname; + if ( len >= 4 && !strncasecmp(".bcf",fname+len-4,4) ) return hts_bcf_wmode(FT_BCF|FT_GZ); + if ( len >= 4 && !strncasecmp(".vcf",fname+len-4,4) ) return hts_bcf_wmode(FT_VCF); + if ( len >= 7 && !strncasecmp(".vcf.gz",fname+len-7,7) ) return hts_bcf_wmode(FT_VCF|FT_GZ); + if ( len >= 8 && !strncasecmp(".vcf.bgz",fname+len-8,8) ) return hts_bcf_wmode(FT_VCF|FT_GZ); return hts_bcf_wmode(file_type); } void set_wmode(char dst[8], int file_type, const char *fname, int clevel) { const char *ret = NULL; - int len = fname ? strlen(fname) : 0; - if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) ret = hts_bcf_wmode(FT_BCF|FT_GZ); - else if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) ret = hts_bcf_wmode(FT_VCF); - else if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ); - else if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ); + const char *end = fname ? strstr(fname, HTS_IDX_DELIM) : NULL; + if ( !end ) end = fname ? 
fname + strlen(fname) : fname; + int len = end - fname; + if ( len >= 4 && !strncasecmp(".bcf",fname+len-4,4) ) ret = hts_bcf_wmode(FT_BCF|FT_GZ); + else if ( len >= 4 && !strncasecmp(".vcf",fname+len-4,4) ) ret = hts_bcf_wmode(FT_VCF); + else if ( len >= 7 && !strncasecmp(".vcf.gz",fname+len-7,7) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ); + else if ( len >= 8 && !strncasecmp(".vcf.bgz",fname+len-8,8) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ); else ret = hts_bcf_wmode(file_type); if ( clevel>=0 && clevel<=9 ) { @@ -107,3 +111,33 @@ int parse_overlap_option(const char *arg) else if ( strcasecmp(arg, "variant") == 0 || strcmp(arg, "2") == 0 ) return 2; else return -1; } + +// See also samtools/sam_utils.c auto_index() +int init_index(htsFile *fh, bcf_hdr_t *hdr, char *fname, char **idx_fname) +{ + int min_shift = 14; // CSI + + if ( !fname || !*fname || !strcmp(fname, "-") ) return -1; + + char *delim = strstr(fname, HTS_IDX_DELIM); + if (delim) + { + delim += strlen(HTS_IDX_DELIM); + *idx_fname = strdup(delim); + if ( !*idx_fname ) return -1; + + size_t l = strlen(*idx_fname); + if ( l >= 4 && strcmp(*idx_fname + l - 4, ".tbi")==0 ) min_shift = 0; + } + else + { + if ( !(*idx_fname = malloc(strlen(fname)+6)) ) return -1; + sprintf(*idx_fname, "%s.csi", fname); + } + + if ( bcf_idx_init(fh, hdr, min_shift, *idx_fname) < 0 ) return -1; + + return 0; +} + + diff --git a/version.sh b/version.sh index 55d804296..69bf963de 100755 --- a/version.sh +++ b/version.sh @@ -24,7 +24,7 @@ # DEALINGS IN THE SOFTWARE. # Master version, for use in tarballs or non-git source copies -VERSION=1.17 +VERSION=1.18 # If we have a git clone, then check against the current tag if [ -e .git ]