diff --git a/.cirrus.yml b/.cirrus.yml index a5b08ebce..06edba506 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -60,6 +60,8 @@ gcc_task: LC_ALL: C CIRRUS_CLONE_DEPTH: 1 HTSDIR: ./htslib + CFLAGS: -fsanitize=address + LDFLAGS: -fsanitize=address matrix: - environment: @@ -82,6 +84,7 @@ ubuntu_task: environment: CC: clang + CFLAGS: -g -O2 -Werror -Wall -Wformat -Wformat=2 LC_ALL: C CIRRUS_CLONE_DEPTH: 1 HTSDIR: ./htslib @@ -93,8 +96,8 @@ ubuntu_task: memory: 2G environment: USE_CONFIG: yes - CFLAGS: -g -Wall -O3 -fsanitize=address - LDFLAGS: -fsanitize=address -Wl,-rpath,`pwd`/inst/lib + CFLAGS: -g -Wall -O3 + LDFLAGS: -Wl,-rpath,`pwd`/inst/lib # NB: we could consider building a docker image with these # preinstalled and specifying that instead, to speed up testing. diff --git a/HMM.h b/HMM.h index 3a6cab30a..75d3f8b49 100644 --- a/HMM.h +++ b/HMM.h @@ -127,7 +127,7 @@ double *hmm_get_fwd_bwd_prob(hmm_t *hmm); * @sites: list of positions * * Same as hmm_run_fwd_bwd, in addition a pointer to a matrix with the new - * transition probabilities is returned. In this verison, emission + * transition probabilities is returned. In this version, emission * probabilities are not updated. 
*/ double *hmm_run_baum_welch(hmm_t *hmm, int nsites, double *eprob, uint32_t *sites); diff --git a/Makefile b/Makefile index 12fb185e7..f170d6bb6 100644 --- a/Makefile +++ b/Makefile @@ -41,7 +41,7 @@ OBJS = main.o vcfindex.o tabix.o \ vcfcnv.o vcfhead.o HMM.o consensus.o ploidy.o bin.o hclust.o version.o \ regidx.o smpl_ilist.o csq.o vcfbuf.o \ mpileup.o bam2bcf.o bam2bcf_indel.o bam2bcf_iaux.o bam2bcf_edlib.o \ - read_consensus.o bam_sample.o \ + read_consensus.o bam_sample.o \ vcfsort.o cols.o extsort.o dist.o abuf.o \ ccall.o em.o prob1.o kmin.o str_finder.o gff.o edlib.o PLUGIN_OBJS = vcfplugin.o @@ -105,7 +105,7 @@ endif include config.mk -PACKAGE_VERSION = 1.20 +PACKAGE_VERSION = 1.21 # If building from a Git repository, replace $(PACKAGE_VERSION) with the Git # description of the working tree: either a release tag with the same value @@ -235,7 +235,6 @@ vcfbuf_h = vcfbuf.h $(htslib_vcf_h) abuf_h = abuf.h $(htslib_vcf_h) dbuf_h = dbuf.h $(htslib_vcf_h) bam2bcf_h = bam2bcf.h $(htslib_hts_h) $(htslib_vcf_h) -edlib.h = edlib.h bam_sample_h = bam_sample.h $(htslib_sam_h) cigar_state_h = cigar_state.h $(htslib_hts_h) $(htslib_sam_h) read_consensus_h = read_consensus.h $(htslib_hts_h) $(htslib_sam_h) @@ -249,17 +248,17 @@ vcfcall.o: vcfcall.c $(htslib_vcf_h) $(htslib_kfunc_h) $(htslib_synced_bcf_reade vcfconcat.o: vcfconcat.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_thread_pool_h) $(htslib_hts_endian_h) $(bcftools_h) vcfconvert.o: vcfconvert.c $(htslib_faidx_h) $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_kseq_h) $(htslib_hts_endian_h) $(bcftools_h) $(filter_h) $(convert_h) $(tsv2vcf_h) vcffilter.o: vcffilter.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) rbuf.h regidx.h -vcfgtcheck.o: vcfgtcheck.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_kbitset_h) $(htslib_hts_os_h) 
$(htslib_bgzf_h) $(bcftools_h) extsort.h filter.h +vcfgtcheck.o: vcfgtcheck.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_kbitset_h) $(htslib_hts_os_h) $(htslib_bgzf_h) $(bcftools_h) extsort.h $(filter_h) vcfindex.o: vcfindex.c $(htslib_vcf_h) $(htslib_tbx_h) $(htslib_kstring_h) $(htslib_bgzf_h) $(bcftools_h) -vcfisec.o: vcfisec.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcftools_h) $(filter_h) -vcfmerge.o: vcfmerge.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) $(htslib_kbitset_h) $(htslib_hts_endian_h) $(bcftools_h) regidx.h vcmp.h $(htslib_khash_h) $(htslib_kbitset_h) -vcfnorm.o: vcfnorm.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_faidx_h) $(htslib_khash_str2int_h) $(bcftools_h) rbuf.h abuf.h gff.h regidx.h +vcfisec.o: vcfisec.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(htslib_hts_defs_h) $(bcftools_h) $(filter_h) +vcfmerge.o: vcfmerge.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) $(htslib_kbitset_h) $(htslib_hts_endian_h) $(bcftools_h) regidx.h vcmp.h $(htslib_khash_h) +vcfnorm.o: vcfnorm.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_faidx_h) $(htslib_khash_str2int_h) $(bcftools_h) rbuf.h abuf.h gff.h regidx.h $(filter_h) vcfquery.o: vcfquery.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_str2int_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(convert_h) $(smpl_ilist_h) vcfroh.o: vcfroh.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(bcftools_h) HMM.h $(smpl_ilist_h) $(filter_h) -vcfcnv.o: vcfcnv.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(htslib_khash_str2int_h) $(bcftools_h) HMM.h rbuf.h +vcfcnv.o: vcfcnv.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(htslib_khash_str2int_h) 
$(htslib_hts_defs_h) $(bcftools_h) HMM.h rbuf.h vcfhead.o: vcfhead.c $(htslib_kstring_h) $(htslib_vcf_h) $(bcftools_h) -vcfsom.o: vcfsom.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcftools_h) -vcfsort.o: vcfsort.c $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_hts_os_h) kheap.h $(bcftools_h) +vcfsom.o: vcfsom.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(htslib_hts_defs_h) $(bcftools_h) +vcfsort.o: vcfsort.c $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_hts_os_h) $(htslib_hts_defs_h) $(htslib_bgzf_h) kheap.h $(bcftools_h) vcfstats.o: vcfstats.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) $(bcftools_h) $(filter_h) bin.h dist.h vcfview.o: vcfview.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(htslib_khash_str2int_h) $(htslib_kbitset_h) reheader.o: reheader.c $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_kseq_h) $(htslib_thread_pool_h) $(htslib_faidx_h) $(htslib_khash_str2int_h) $(bcftools_h) $(khash_str2str_h) @@ -276,7 +275,7 @@ mcall.o: mcall.c $(htslib_kfunc_h) $(htslib_khash_str2int_h) $(call_h) $(prob1_h prob1.o: prob1.c $(prob1_h) vcmp.o: vcmp.c $(htslib_hts_h) $(htslib_vcf_h) vcmp.h ploidy.o: ploidy.c $(htslib_khash_str2int_h) $(htslib_kseq_h) $(htslib_hts_h) $(bcftools_h) $(ploidy_h) -polysomy.o: polysomy.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(bcftools_h) peakfit.h +polysomy.o: polysomy.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_hts_defs_h) $(bcftools_h) peakfit.h peakfit.o: peakfit.c peakfit.h $(htslib_hts_h) $(htslib_kstring_h) bin.o: bin.c $(bcftools_h) bin.h dist.o: dist.c dist.h @@ -287,14 +286,15 @@ mpileup.o: mpileup.c $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h) $(hts bam2bcf.o: bam2bcf.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(bam2bcf_h) mw.h bam2bcf_indel.o: bam2bcf_indel.c $(htslib_hts_h) 
$(htslib_sam_h) $(htslib_khash_str2int_h) $(bam2bcf_h) $(htslib_ksort_h) $(str_finder_h) bam2bcf_iaux.o: bam2bcf_iaux.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bcftools_h) $(bam2bcf_h) $(htslib_ksort_h) $(read_consensus_h) $(cigar_state_h) -bam2bcf_edlib.o: bam2bcf_edlib.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bcftools_h) $(bam2bcf_h) $(htslib_ksort_h) $(read_consensus_h) $(cigar_state_h) $(edlib.h) +bam2bcf_edlib.o: bam2bcf_edlib.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bam2bcf_h) $(str_finder_h) $(htslib_ksort_h) edlib.h read_consensus.o: read_consensus.c $(read_consensus_h) $(cigar_state_h) $(bcftools_h) kheap.h bam_sample.o: bam_sample.c $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_str2int_h) $(khash_str2str_h) $(bam_sample_h) $(bcftools_h) version.o: version.h version.c hclust.o: hclust.c $(htslib_hts_h) $(htslib_kstring_h) $(bcftools_h) hclust.h HMM.o: HMM.c $(htslib_hts_h) HMM.h -vcfbuf.o: vcfbuf.c $(htslib_vcf_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcftools_h) $(vcfbuf_h) rbuf.h +vcfbuf.o: vcfbuf.c $(htslib_vcf_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(htslib_kbitset_h) $(bcftools_h) $(vcfbuf_h) rbuf.h abuf.o: abuf.c $(htslib_vcf_h) $(bcftools_h) rbuf.h abuf.h +edlib.o: edlib.c edlib.h extsort.o: extsort.c $(bcftools_h) extsort.h kheap.h smpl_ilist.o: smpl_ilist.c $(bcftools_h) $(smpl_ilist_h) gff.o: gff.c $(htslib_hts_h) $(htslib_khash_h) $(htslib_khash_str2int_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(bcftools_h) gff.h regidx.h @@ -326,7 +326,7 @@ test/test-rbuf.o: test/test-rbuf.c rbuf.h test/test-rbuf: test/test-rbuf.o $(CC) $(LDFLAGS) -o $@ $^ $(ALL_LIBS) -test/test-regidx.o: test/test-regidx.c $(htslib_kstring_h) $(htslib_hts_os_h) regidx.h +test/test-regidx.o: test/test-regidx.c $(htslib_kstring_h) $(htslib_hts_os_h) $(htslib_hts_defs_h) regidx.h test/test-regidx: test/test-regidx.o regidx.o | $(HTSLIB) $(CC) $(ALL_LDFLAGS) -o $@ $^ $(HTSLIB_LIB) -lpthread $(ALL_LIBS) diff 
--git a/NEWS b/NEWS index 5621a8bd8..4066ceb37 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,124 @@ +## Release 1.21 (12th September 2024) + + + +Changes affecting the whole of bcftools, or multiple commands: + +* Support multiple semicolon-separated strings when filtering by ID using -i/-e (#2190). + For example, `-i 'ID="rs123"'` now correctly matches `rs123;rs456` + +* The filtering expression ILEN can be positive (insertion), negative (deletion), zero + (balanced substitutions), or set to missing value (symbolic alleles). + +* bcftools query +* bcftools +split-vep + + - The column indices printed by default with `-H` (e.g., "#[1]CHROM") can now be + suppressed by giving the option twice `-HH` (#2152) + + +Changes affecting specific commands: + +* bcftools annotate + + - Support dynamic variables read from a tab-delimited annotation file (#2151) + For example, in the two cases below the field 'STR' from the -a file is required to match + the INFO/TAG in VCF. In the first example the alleles REF,ALT must match, in the second + example they are ignored.
The option -k is required to output also records that were not + annotated: + + bcftools annotate -a ann.tsv.gz -c CHROM,POS,REF,ALT,SCORE,~STR -i'TAG={STR}' -k in.vcf + bcftools annotate -a ann.tsv.gz -c CHROM,POS,-,-,SCORE,~STR -i'TAG={STR}' -k in.vcf + + - When adding Type=String annotations from a tab-delimited file, encode characters with + special meaning using percent encoding (';', '=' in INFO and ':' in FORMAT) (#2202) + +* bcftools consensus + + - Allow to apply a reference allele which overlaps a previous deletion, there is no + need to complain about overlapping alleles in such case + + - Fix a bug which required `-s -` to be present even when there were no samples in the VCF + (#2260) + +* bcftools csq + + - Fix a rare bug where indel combined with a substitution ending at exon boundary is + incorrectly predicted to have 'inframe' rather than 'frameshift' consequence (#2212) + +* bcftools gtcheck + + - Fix a segfault with --no-HWE-prob. The bug was introduced with the output format change in + 1.19 which replaced the DC section with DCv2 (#2180) + + - The number of matching genotypes in the DCv2 output was not calculated correctly with + non-zero `-E, --error-probability`. Consequently, also the average HWE score was incorrect. + The main output, the discordance score, was not affected by the bug + +* bcftools +mendelian2 + + - Include the number of good cases where at least one of the trio genotypes has an alternate + allele (#2204) + + - Fix the error message which would report the wrong sample when non-existent sample is given. + Note that bug only affected the error message, the program otherwise assigns the family + members correctly (#2242) + +* bcftools merge + + - Fix a severe bug in merging of FORMAT fields with Number=R and Number=A values. For example, + rows with high-coverage FORMAT/AD values (bigger or equal to 128) could have been assigned + to incorrect samples. The bug was introduced in version 1.19. For details see #2244. 
+ +* bcftools mpileup + + - Return non-zero error code when the input BAM/CRAM file is truncated (#2177) + + - Add FORMAT/AD annotation by default, disable with `-a -AD` + +* bcftools norm + + - Support realignment of symbolic alleles, similarly to added previously + (#1919,#2145) + + - Fix in reporting reference allele genotypes with `--multi-overlaps .` (#2160) + + - Support of duplicate removal of symbolic alleles of the same type but different SVLEN (#2182) + + - New `-S, --sort` switch to optionally sort output records by allele (#1484) + + - Add the `-i/-e` filtering options to select records for normalization. Note duplicate + removal ignores this option. + + - Fix a bug where `--atomize` would not fill GT alleles for atomized SNVs followed by + an indel (#2239) + +* bcftools +remove-overlaps + + - Revamp the program to allow greater flexibility, with the following new options: + + -M, --mark-tag TAG Mark -m sites with INFO/TAG + -m, --mark EXPR Mark (if also -M is present) or remove sites [overlap] + dup .. all overlapping sites + overlap .. overlapping sites + min(QUAL) .. mark sites with lowest QUAL until overlaps are resolved + --missing EXPR Value to use for missing tags with -m 'min(QUAL)' + 0 .. the default + DP .. heuristics, scale maximum QUAL value proportionally to INFO/DP + --reverse Apply the reverse logic, for example preserve duplicates instead of removing + -O, --output-type t t: plain list of sites (chr,pos), tz: compressed list + +* bcftools +tag2tag + + - The conversions --LXX-to-XX, --XX-to-LXX were working but specific cases such as --LAD-to-AD were not. 
+ + - Print more informative error message when source tag type violates VCF specification + +* bcftools +trio-dnm2 + + - Better handling of the --strictly-novel functionality, especially with respect to chrX inheritance + + ## Release 1.20 (15th April 2024) @@ -716,7 +837,7 @@ Changes affecting specific commands: annotating from a tab-delimited text file, this feature can be invoked by using `-c INFO/END`. - - add a new '.' modifier to control wheter missing values should be carried + - add a new '.' modifier to control whether missing values should be carried over from a tab-delimited file or not. For example: -c TAG .. adds TAG if the source value is not missing. If TAG @@ -1068,7 +1189,7 @@ Changes affecting specific commands: * bcftools csq: - - Fix a bug wich caused incorrect FORMAT/BCSQ formatting at sites with too + - Fix a bug which caused incorrect FORMAT/BCSQ formatting at sites with too many per-sample consequences - Fix a bug which incorrectly handled the --ncsq parameter and could clash @@ -1785,7 +1906,7 @@ Updates, improvements and bugfixes for many other commands: * `roh`: Now possible to process multiple samples at once. This allows considerable speedups for files with thousands of samples where the cost of - HMM is neglibible compared to I/O and decompressing. In order to fit tens of + HMM is negligible compared to I/O and decompressing. In order to fit tens of thousands samples in memory, a sliding HMM can be used (new `--buffer-size` option). Viterbi training now uses Baum-Welch algorithm, and works much better. Support for gVCFs or FORMAT/PL tags.
Added `-o, output` and diff --git a/abuf.c b/abuf.c index 34abddd0e..b125679b9 100644 --- a/abuf.c +++ b/abuf.c @@ -43,6 +43,7 @@ typedef struct kstring_t ref, alt; int ial; // the index of the original ALT allele, 1-based int beg, end; // 0-based inclusive offsets to ref,alt + int plen; // the ref,alt prefix length, eg plen=1 for C>CA } atom_t; @@ -175,8 +176,9 @@ static void _atomize_allele(abuf_t *buf, bcf1_t *rec, int ial) atom->alt.l = 0; kputc(refb, &atom->ref); kputc(refb, &atom->alt); - atom->beg = atom->end = i; - atom->ial = ial; + atom->beg = atom->end = i; + atom->ial = ial; + atom->plen = 1; } continue; } @@ -202,6 +204,35 @@ static int _atoms_inconsistent(const atom_t *a, const atom_t *b) if ( rcmp ) return rcmp; return strcasecmp(a->alt.s,b->alt.s); } + +// returns +// 0 .. identical beg,ref,alt +// 1 .. non-overlapping variants, but record may overlap (A>AT vs A>C) +// 2 .. overlapping (conflicting) variants +static int _atoms_overlap(const atom_t *a, const atom_t *b) +{ + if ( a->beg < b->beg ) return 2; + if ( a->beg > b->beg ) return 2; + + // consider SNV followed by DEL as not overlapping + // CC > C a.plen=1 (ref,alt prefix len=1) + // C > T b.plen=0 (ref,alt prefix len=0) + if ( a->plen && a->plen >= b->ref.l ) return 1; + if ( b->plen && b->plen >= a->ref.l ) return 1; + + int rcmp = strcasecmp(a->ref.s,b->ref.s); + if ( rcmp ) return 2; + + // consider SNV followed by INS as not overlapping + // A > AT a.plen=1 (ref,alt prefix len=1) + // A > C b.plen=0 (ref,alt prefix len=0) + if ( a->plen && a->plen >= b->alt.l ) return 1; + if ( b->plen && b->plen >= a->alt.l ) return 1; + + rcmp = strcasecmp(a->alt.s,b->alt.s); + if ( rcmp ) return 2; + return 0; +} /* For reproducibility of tests on different platforms, we need to guarantee the same order of identical atoms originating from different source ALTs. 
Even though they are consistent, different values can be @@ -238,7 +269,14 @@ static void _split_table_new(abuf_t *buf, atom_t *atom) static void _split_table_overlap(abuf_t *buf, int iout, atom_t *atom) { uint8_t *ptr = buf->split.tbl + iout*buf->split.nori; - ptr[atom->ial-1] = _atoms_inconsistent(atom,buf->split.atoms[iout]) ? 2 : 1; + int olap = _atoms_overlap(atom,buf->split.atoms[iout]); + ptr[atom->ial-1] = olap > 1 ? 2 : 1; + + // The test test/atomize.split.5.vcf shows why we sometimes can and sometimes + // cannot remove the star allele like this + // buf->split.overlaps[iout] = olap > 1 ? 1 : 0; + // I forgot the details of the code, so don't immediately see + // if this could be made smarter buf->split.overlaps[iout] = 1; } #if 0 @@ -745,7 +783,7 @@ void _abuf_split(abuf_t *buf, bcf1_t *rec) _split_table_init(buf,rec,buf->natoms); for (i=0; inatoms; i++) { - if ( i && !_atoms_inconsistent(&buf->atoms[i-1],&buf->atoms[i]) ) continue; + if ( i && _atoms_inconsistent(&buf->atoms[i-1],&buf->atoms[i])==0 ) continue; _split_table_new(buf, &buf->atoms[i]); // add a new unique output atom } for (i=0; inatoms; i++) diff --git a/bam2bcf_edlib.c b/bam2bcf_edlib.c index 298781e8b..4e0a38c33 100644 --- a/bam2bcf_edlib.c +++ b/bam2bcf_edlib.c @@ -312,6 +312,7 @@ static int bcf_cgp_append_cons(str_freq *sf, char *str, int len, int freq) { * different locations. Ideally we'd like to consider these as all * the same insertion if the size is the same and it's comparable seq. 
*/ +#define MAX_INS 8192 static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, int ref_len, int left, int right, @@ -406,13 +407,13 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, local_band_max = local_band; } - char ins[1024]; + char ins[MAX_INS]; for (j = 0; j < len; j++, y++) { if (x < left) continue; if (x >= right) break; base = bam_seqi(seq, y); - if (j < 1024) + if (j < MAX_INS) ins[j] = seq_nt16_int[base]; } @@ -421,7 +422,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // {IIIII,M} M M M M choice. So we need to include the // next match in our sequence when choosing the consensus. if (x >= left && x < right) { - int ilen = j<1024?j:1024; + int ilen = jindel == type /*&& x == pos+1*/) { // Assume any ins of the same size is the same ins. // (This rescues misaligned insertions.) @@ -581,7 +582,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // instead of storing separately and merging later (here). // Ie str_freq.str is [NI][5] instead. for (i = 0; i < right-left; i++) { - int ins[1024][5]; + int ins[MAX_INS][5]; for (j = 0; j < NI; j++) { if (!cons_ins[i].str[j]) break; @@ -652,7 +653,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // Het call filled out in cnum==0 (+ve or -ve). // Used in cnum==1 to do the opposite of whichever way we did before. 
- int heti[1024] = {0}, hetd[1024] = {0}; + int heti[MAX_INS] = {0}, hetd[MAX_INS] = {0}; *cpos_pos = -1; for (cnum = 0; cnum < 2; cnum++) { @@ -684,7 +685,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, if (!cons_ins[i].str[j]) goto err; memset(cons_ins[i].str[j] + cons_ins[i].len[j], - 'N', type - cons_ins[i].len[j]); + 4, type - cons_ins[i].len[j]); cons_ins[i].len[j] = type; } if (!cons_ins[i].str[j]) @@ -709,12 +710,12 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // Candidate HET ins. if (cnum == 0) { het_ins = max_v_ins > CONS_CUTOFF_INC * tot_sum; - if (i < 1024) heti[i] = het_ins + if (i < MAX_INS) heti[i] = het_ins ? 1 : (max_v_ins > .3*tot_sum ? -1:0); } else { // HET but uncalled before - het_ins = i < 1024 ? (heti[i] == -1) : 0; + het_ins = i < MAX_INS ? (heti[i] == -1) : 0; } } @@ -748,7 +749,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, tot2 = total_span_str - type_depth; het_del = cons_base[i][5] >= CONS_CUTOFF_DEL * tot2; - if (i < 1024) { + if (i < MAX_INS) { if (i > pos-left && i <= pos-left-biggest_del) hetd[i] = 0; else @@ -758,7 +759,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, } } else { // HET del uncalled on cnum 0 - het_del = i < 1024 ? (hetd[i] == -1) : 0; + het_del = i < MAX_INS ? (hetd[i] == -1) : 0; if (max_j == 5 && het_del == 0) { max_v = max_v2; max_j = max_j2; diff --git a/bam2bcf_iaux.c b/bam2bcf_iaux.c index 2e0add15a..3fe4fdea7 100644 --- a/bam2bcf_iaux.c +++ b/bam2bcf_iaux.c @@ -396,7 +396,7 @@ static int iaux_set_consensus(indel_aux_t *iaux, int ismpl) // Finds the smallest index in the seq_pos array holding value equal to pos, or if there is no // such value, the largest index with value smaller than pos. Starts at initial guess ioff. // This could use a binary search but the assumption is that the initial guess is indel-size close -// to the actuall coordinate. +// to the actual coordinate. 
// // TODO: remove this function and seq_pos from cns creation as it seems unnecessary static int find_ref_offset(hts_pos_t pos, hts_pos_t *seq_pos, int nseq_pos, int ioff) diff --git a/bcftools.h b/bcftools.h index eedd4f72a..51c2d040f 100644 --- a/bcftools.h +++ b/bcftools.h @@ -29,6 +29,7 @@ THE SOFTWARE. */ #include #include #include +#include #include #define FT_TAB_TEXT 0 // custom tab-delimited text file @@ -50,10 +51,8 @@ void error(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2 void error_errno(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2); // For on the fly index creation with --write-index -int init_index2(htsFile *fh, bcf_hdr_t *hdr, const char *fname, - char **idx_fname, int idx_fmt); -int init_index(htsFile *fh, bcf_hdr_t *hdr, const char *fname, - char **idx_fname); +int init_index2(htsFile *fh, bcf_hdr_t *hdr, const char *fname, char **idx_fname, int idx_fmt); +int init_index(htsFile *fh, bcf_hdr_t *hdr, const char *fname, char **idx_fname); // Used to set args->write_index in CLI. // It will be true if set correctly. @@ -68,6 +67,10 @@ char *init_tmp_prefix(const char *prefix); int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq); int parse_overlap_option(const char *arg); +// Default sort order: chr,pos,alleles +int cmp_bcf_pos(const void *aptr, const void *bptr); +int cmp_bcf_pos_ref_alt(const void *aptr, const void *bptr); + static inline int iupac2bitmask(char iupac) { const int A = 1; @@ -129,6 +132,23 @@ static inline double phred_score(double prob) return prob>99 ? 99 : prob; } +static inline double calc_binom_two_sided(int na, int nb, double aprob) +{ + if ( !na && !nb ) return -1; + if ( na==nb ) return 1; + + // kfunc.h implements kf_betai, which is the regularized beta function P(X<=k/N;p) = I_{1-p}(N-k,k+1) + + double prob = na > nb ? 2 * kf_betai(na, nb+1, aprob) : 2 * kf_betai(nb, na+1, aprob); + + if ( prob > 1 ) prob = 1; // this can happen, machine precision error, eg. 
kf_betai(1,0,0.5) + return prob; +} +static inline double calc_binom_one_sided(int na, int nb, double aprob, int ge) +{ + return ge ? kf_betai(na, nb + 1, aprob) : kf_betai(nb, na + 1, 1 - aprob); +} + static const uint64_t bcf_double_missing = 0x7ff0000000000001; static const uint64_t bcf_double_vector_end = 0x7ff0000000000002; static inline void bcf_double_set(double *ptr, uint64_t value) diff --git a/consensus.c b/consensus.c index 66a65b329..54f17c221 100644 --- a/consensus.c +++ b/consensus.c @@ -229,7 +229,14 @@ static void init_data(args_t *args) args->hdr = args->files->readers[0].header; args->isample = -1; if ( !args->sample ) + { args->smpl = smpl_ilist_init(args->hdr,NULL,0,SMPL_NONE|SMPL_VERBOSE); + if ( !args->smpl->n ) + { + smpl_ilist_destroy(args->smpl); + args->smpl = NULL; + } + } else if ( args->sample && strcmp("-",args->sample) ) { args->smpl = smpl_ilist_init(args->hdr,args->sample,0,SMPL_NONE|SMPL_VERBOSE); @@ -244,12 +251,22 @@ static void init_data(args_t *args) { if ( args->haplotype || args->allele ) { - if ( args->smpl->n > 1 ) error("Too many samples, only one can be used with -H\n"); + if ( args->smpl->n > 1 ) error("Too many samples, only one can be used with -H; check the -s,-S options\n"); args->isample = args->smpl->idx[0]; } else + { args->iupac_GTs = 1; + if ( args->smpl->n==1 ) + fprintf(stderr,"Note: applying IUPAC codes based on FORMAT/GT in sample %s\n",bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,args->smpl->idx[0])); + else + fprintf(stderr,"Note: applying IUPAC codes based on FORMAT/GT in %d samples\n",args->smpl->n); + } } + else if ( args->output_iupac ) + fprintf(stderr,"Note: applying IUPAC codes based on REF,ALT%s\n",bcf_hdr_nsamples(args->hdr)?", ignoring samples":""); + else + fprintf(stderr,"Note: applying REF,ALT variants%s\n",bcf_hdr_nsamples(args->hdr)?", ignoring samples":""); int i; for (i=0; inmask; i++) { @@ -272,7 +289,6 @@ static void init_data(args_t *args) if ( ! 
args->fp_out ) error("Failed to create %s: %s\n", args->output_fname, strerror(errno)); } else args->fp_out = stdout; - if ( args->isample<0 && !args->iupac_GTs ) fprintf(stderr,"Note: the --samples option not given, applying all records regardless of the genotype\n"); if ( args->filter_str ) args->filter = filter_init(args->hdr, args->filter_str); args->rid = -1; @@ -781,6 +797,19 @@ static void apply_variant(args_t *args, bcf1_t *rec) else if ( (var_type & VCF_OTHER) && !strncasecmp(rec->d.allele[ialt],"pos <= args->fa_frz_pos && rec->pos + rec->rlen - 1 > args->fa_frz_pos ) + { + // Applying the reference allele which overlaps a previous deletion. If we are here, it + // means it goes beyond the freezed position, hence the record can be trimmed and moved + // forward + int ntrim = args->fa_frz_pos - rec->pos + 1; + int nref = strlen(rec->d.allele[0]); + assert( ntrim < nref ); + rec->pos += ntrim; + rec->rlen -= ntrim; + memmove(rec->d.allele[0],rec->d.allele[0]+ntrim,nref-ntrim); + rec->d.allele[0][nref-ntrim] = 0; + } if ( rec->pos <= args->fa_frz_pos ) { // Can be still OK iff this is an insertion (and which does not follow another insertion, see #888). @@ -795,7 +824,6 @@ static void apply_variant(args_t *args, bcf1_t *rec) fprintf(stderr,"The site %s:%"PRId64" overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); return; } - } char *ref_allele = rec->d.allele[0]; @@ -848,8 +876,9 @@ static void apply_variant(args_t *args, bcf1_t *rec) } } } - if ( idx>0 && idx>=args->fa_buf.l ) - error("FIXME: %s:%"PRId64" .. 
idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off); + + // the variant is beyond the available fasta sequence + if ( idx>0 && idx>=args->fa_buf.l ) return; // sanity check the reference base if ( alt_allele[0]=='<' ) @@ -983,7 +1012,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) ks_resize(&args->fa_buf, args->fa_buf.l + len_diff); memmove(args->fa_buf.s + idx + rec->rlen + len_diff, args->fa_buf.s + idx + rec->rlen, args->fa_buf.l - idx - rec->rlen); - // This can get tricky, make sure the bases unchanged by the insertion do not overwrite preceeding variants. + // This can get tricky, make sure the bases unchanged by the insertion do not overwrite preceding variants. // For example, here we want to get TAA: // POS REF ALT // 1 C T diff --git a/convert.c b/convert.c index 00fcb5ef6..c459c8387 100644 --- a/convert.c +++ b/convert.c @@ -1,6 +1,6 @@ /* convert.c -- functions for converting between VCF/BCF and related formats. - Copyright (C) 2013-2023 Genome Research Ltd. + Copyright (C) 2013-2024 Genome Research Ltd. Author: Petr Danecek @@ -111,6 +111,7 @@ struct _convert_t int allow_undef_tags; int force_newline; int header_samples; + int no_hdr_indices; uint8_t **subset_samples; }; @@ -1208,11 +1209,11 @@ static void process_pbinom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isa if ( n[0]==n[1] ) kputc(n[0]==0 ? '.':'0', str); else { - double pval = n[0] < n[1] ? kf_betai(n[1], n[0] + 1, 0.5) : kf_betai(n[0], n[1] + 1, 0.5); - pval *= 2; - if ( pval>=1 ) pval = 0; // this can happen, machine precision error, eg. 
kf_betai(1,0,0.5) - else - pval = -4.34294481903*log(pval); + double pval = calc_binom_two_sided(n[0],n[1],0.5); + + // convrt to phred + if ( pval>=1 ) pval = 0; + else pval = -4.34294481903*log(pval); kputd(pval, str); } return; @@ -1570,7 +1571,7 @@ int convert_header(convert_t *convert, kstring_t *str) int i, icol = 0, l_ori = str->l; bcf_hdr_t *hdr = convert->header; - // Supress the header output if LINE is present + // Suppress the header output if LINE is present for (i=0; infmt; i++) if ( convert->fmt[i].type == T_LINE ) break; if ( i!=convert->nfmt ) @@ -1609,9 +1610,17 @@ int convert_header(convert_t *convert, kstring_t *str) } } else if ( convert->header_samples ) - ksprintf(str, "[%d]%s:%s", ++icol, hdr->samples[ks], convert->fmt[k].key); + { + icol++; + if ( !convert->no_hdr_indices ) ksprintf(str,"[%d]",icol); + ksprintf(str,"%s:%s", hdr->samples[ks], convert->fmt[k].key); + } else - ksprintf(str, "[%d]%s", ++icol, convert->fmt[k].key); + { + icol++; + if ( !convert->no_hdr_indices ) ksprintf(str,"[%d]",icol); + ksprintf(str,"%s", convert->fmt[k].key); + } } if ( has_fmt_newline ) { @@ -1633,7 +1642,9 @@ int convert_header(convert_t *convert, kstring_t *str) if ( convert->fmt[i].key ) kputs(convert->fmt[i].key, str); continue; } - ksprintf(str, "[%d]%s", ++icol, convert->fmt[i].key); + icol++; + if ( !convert->no_hdr_indices ) ksprintf(str,"[%d]",icol); + ksprintf(str,"%s", convert->fmt[i].key); } if ( has_fmt_newline ) kputc('\n',str); return str->l - l_ori; @@ -1776,6 +1787,9 @@ int convert_set_option(convert_t *convert, enum convert_option opt, ...) convert->force_newline = va_arg(args, int); if ( convert->force_newline ) force_newline_(convert); break; + case no_hdr_indices: + convert->no_hdr_indices = va_arg(args, int); + break; default: ret = -1; } diff --git a/convert.h b/convert.h index 188b38124..150751481 100644 --- a/convert.h +++ b/convert.h @@ -1,6 +1,6 @@ /* convert.h -- functions for converting between VCF/BCF and related formats. 
- Copyright (C) 2014-2023 Genome Research Ltd. + Copyright (C) 2014-2024 Genome Research Ltd. Author: Petr Danecek @@ -35,6 +35,7 @@ enum convert_option header_samples, // include sample name in bracketed tags (e.g. SAMPLE1:GT SAMPLE2:GT for [ %GT]) force_newline, // automatically insert a newline when not part of the formatting expression print_filtered, // print the provided string instead of discarding samples not included in subset_samples + no_hdr_indices, // drop column indices when printing header, i.e. "#CHROM", not "#[1]CHROM" }; convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char *str); diff --git a/csq.c b/csq.c index 639194517..b38eba107 100644 --- a/csq.c +++ b/csq.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2016-2023 Genome Research Ltd. + Copyright (c) 2016-2024 Genome Research Ltd. Author: Petr Danecek @@ -34,7 +34,7 @@ Read about transcript types here http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html - http://www.ensembl.org/info/genome/variation/predicted_data.html + https://www.ensembl.org/info/genome/variation/prediction/predicted_data.html https://www.gencodegenes.org/pages/biotypes.html List of supported biotypes @@ -149,6 +149,7 @@ #include #include #include +#include #include "bcftools.h" #include "filter.h" #include "regidx.h" @@ -625,7 +626,7 @@ void destroy_data(args_t *args) } /* - The splice_* functions are for consquences around splice sites: start,stop,splice_* + The splice_* functions are for consequences around splice sites: start,stop,splice_* */ #define SPLICE_VAR_REF 0 // ref: ACGT>ACGT, csq not applicable, skip completely #define SPLICE_OUTSIDE 1 // splice acceptor or similar; csq set and is done, does not overlap the region @@ -793,7 +794,7 @@ static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32 static inline void csq_stage_splice(args_t *args, bcf1_t *rec, gf_tscript_t *tr, uint32_t type, int ial) { #if XDBG -fprintf(stderr,"csq_stage_splice %d: 
type=%d\n",rec->pos+1,type); +fprintf(stderr,"csq_stage_splice %d: type=%d\n",(int)rec->pos+1,type); #endif if ( !type ) return; csq_t csq; @@ -1181,7 +1182,9 @@ fprintf(stderr,"splice_csq_del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,% splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.alen, &splice->kalt); if ( (splice->ref_beg+1 < ex_beg && splice->ref_end >= ex_beg) || (splice->ref_beg+1 < ex_end && splice->ref_end >= ex_end) ) // ouch, ugly ENST00000409523/long-overlapping-del.vcf { - splice->csq |= (splice->ref_end - splice->ref_beg)%3 ? CSQ_FRAMESHIFT_VARIANT : CSQ_INFRAME_DELETION; + int ref_beg = splice->ref_beg + splice->kalt.l - 1; // 0 for AAA>A, 1 for AAA>AC + if ( ref_beg < splice->ref_end ) + splice->csq |= (splice->ref_end - ref_beg)%3 ? CSQ_FRAMESHIFT_VARIANT : CSQ_INFRAME_DELETION; return SPLICE_OVERLAP; } } @@ -1708,7 +1711,7 @@ void tscript_splice_ref(gf_tscript_t *tr) int csq_push(args_t *args, csq_t *csq, bcf1_t *rec) { #if XDBG -fprintf(stderr,"csq_push: %d .. %d\n",rec->pos+1,csq->type.type); +fprintf(stderr,"csq_push: %d .. %d\n",(int)rec->pos+1,csq->type.type); #endif khint_t k = kh_get(pos2vbuf, args->pos2vbuf, (int)csq->pos); vbuf_t *vbuf = (k == kh_end(args->pos2vbuf)) ? NULL : kh_val(args->pos2vbuf, k); @@ -3335,11 +3338,11 @@ static const char *usage(void) " -W, --write-index[=FMT] Automatically index the output files [off]\n" "\n" "Example:\n" - " bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n" + " bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.87.gff3.gz in.vcf\n" "\n" " # GFF3 annotation files can be downloaded from Ensembl. e.g. 
for human:\n" - " ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/\n" - " ftp://ftp.ensembl.org/pub/grch37/release-84/gff3/homo_sapiens/\n" + " http://ftp.ensembl.org/pub/current_gff3/homo_sapiens/\n" + " http://ftp.ensembl.org/pub/grch37/current/gff3/homo_sapiens/\n" "\n"; } diff --git a/doc/bcftools.1 b/doc/bcftools.1 index 60743b2a0..fcbbcf2f1 100644 --- a/doc/bcftools.1 +++ b/doc/bcftools.1 @@ -2,12 +2,12 @@ .\" Title: bcftools .\" Author: [see the "AUTHOR(S)" section] .\" Generator: Asciidoctor 2.0.16.dev -.\" Date: 2024-04-15 +.\" Date: 2024-09-12 .\" Manual: \ \& .\" Source: \ \& .\" Language: English .\" -.TH "BCFTOOLS" "1" "2024-04-15" "\ \&" "\ \&" +.TH "BCFTOOLS" "1" "2024-09-12" "\ \&" "\ \&" .ie \n(.g .ds Aq \(aq .el .ds Aq ' .ss \n[.ss] 0 @@ -51,7 +51,7 @@ standard input (stdin) and outputs to the standard output (stdout). Several commands can thus be combined with Unix pipes. .SS "VERSION" .sp -This manual page was last updated \fB2024\-04\-15\fP and refers to bcftools git version \fB1.20\fP. +This manual page was last updated \fB2024\-09\-12\fP and refers to bcftools git version \fB1.21\fP. .SS "BCF1" .sp The obsolete BCF1 format output by versions of samtools <= 0.1.19 is \fBnot\fP @@ -676,7 +676,7 @@ See also \fB\-c, \-\-columns\fP and \fB\-h, \-\-header\-lines\fP. Comma\-separated list of columns or tags to carry over from the annotation file (see also \fB\-a, \-\-annotations\fP). If the annotation file is not a VCF/BCF, \fIlist\fP describes the columns of the annotation file and must include CHROM, -POS (or, alternatively, FROM and TO), and optionally REF and ALT. Unused +POS (or, alternatively, FROM,TO or BEG,END), and optionally REF and ALT. Unused columns which should be ignored can be indicated by "\-". \~ .br @@ -706,8 +706,14 @@ exist in the target file; existing tags will not be overwritten. To append to existing values (rather than replacing or leaving untouched), use "=TAG" (instead of "TAG" or "+TAG"). 
To replace only existing values without modifying missing annotations, use "\-TAG". +As a special case of this, if position needs to be replaced, mark the column with the new coordinate as "\-POS". +(Note that in previous releases this used to be "~POS", now deprecated.) +\~ +.br +\~ +.br To match the record also by ID or INFO/END, in addition to REF and ALT, use "~ID" or "~INFO/END". -If position needs to be replaced, mark the column with the new position as "~POS". +Note that this works only for ID and POS, for other fields see the description of \fB\-i\fP below. \~ .br \~ @@ -719,8 +725,39 @@ defined via \fB\-h, \-\-header\-lines\fP. \~ .br See also the \fB\-l, \-\-merge\-logic\fP option. +\~ +.br +\~ +.br +\fBSummary of \f(CR\-c, \-\-columns\fP:\fP .RE .sp +.if n .RS 4 +.nf +.fam C + CHROM,POS,TAG .. match by chromosome and position, transfer annotation from TAG + CHROM,POS,\-,TAG .. same as above, but ignore the third column of the annotation file + CHROM,BEG,END,TAG .. match by region (BEG,END are synonymous to FROM,TO) + CHROM,POS,REF,ALT .. match by CHROM, POS, REF and ALT + + DST_TAG:=SRC_TAG .. transfer the SRC_TAG using the new name DST_TAG + INFO .. transfer all INFO annotations + ^INFO/TAG .. transfer all INFO annotations except "TAG" + + TAG .. add or overwrite existing target value if source is not "." and skip otherwise + +TAG .. add or overwrite existing target value only it is "." + .TAG .. add or overwrite existing target value even if source is "." + .+TAG .. add new but never overwrite existing tag, regardless of its value; can transfer "." if target does not exist + \-TAG .. overwrite existing value, never add new if target does not exist + =TAG .. do not overwrite but append value to existing tags + + ~FIELD .. use this column to match lines with \-i/\-e expression (see the description of \-i below) + ~ID .. in addition to CHROM,POS,REF,ALT match by also ID + ~INFO/END .. 
in addition to CHROM,POS,REF,ALT match by also INFO/END +.fam +.fi +.if n .RE +.sp \fB\-C, \-\-columns\-file\fP \fIfile\fP .RS 4 Read the list of columns from a file (normally given via the \fB\-c, \-\-columns\fP option). @@ -733,7 +770,7 @@ This is useful when many annotations are added at once. \fB\-e, \-\-exclude\fP \fIEXPRESSION\fP .RS 4 exclude sites for which \fIEXPRESSION\fP is true. For valid expressions see -\fBEXPRESSIONS\fP. +\fBEXPRESSIONS\fP and the extension described in \fB\-i, \-\-include\fP below. .RE .sp \fB\-\-force\fP @@ -777,8 +814,28 @@ one can use .RS 4 include only sites for which \fIEXPRESSION\fP is true. For valid expressions see \fBEXPRESSIONS\fP. +\~ +.br +\~ +.br +Additionally, the command \fBbcftools annotate\fP supports expressions updated from the annotation +file dynamically for each record: .RE .sp +.if n .RS 4 +.nf +.fam C + # The field \*(AqSTR\*(Aq from the \-a file is required to match INFO/TAG in VCF. In the first example + # the alleles REF,ALT must match, in the second example they are ignored. The option \-k is required + # to output also records that are not annotated. The third example shows the same concept with + # a numerical expression. + bcftools annotate \-a annots.tsv.gz \-c CHROM,POS,REF,ALT,SCORE,~STR \-i\*(AqTAG={STR}\*(Aq \-k input.vcf + bcftools annotate \-a annots.tsv.gz \-c CHROM,POS,\-,\-,SCORE,~STR \-i\*(AqTAG={STR}\*(Aq \-k input.vcf + bcftools annotate \-a annots.tsv.gz \-c CHROM,POS,\-,\-,SCORE,~INT \-i\*(AqTAG>{INT}\*(Aq \-k input.vcf +.fam +.fi +.if n .RE +.sp \fB\-k, \-\-keep\-sites\fP .RS 4 keep sites which do not pass \fB\-i\fP and \fB\-e\fP expressions instead of discarding them @@ -1109,11 +1166,15 @@ in low coverage data this inflates the rate of false positives.) The \fB\-G\fP o per\-sample FORMAT/QS or FORMAT/AD tag generated with \fBbcftools mpileup \-a QS\fP (or \fB\-a AD\fP). .RE .sp -\fB\-g, \-\-gvcf\fP \fIINT\fP +\fB\-g, \-\-gvcf\fP \fIINT\fP[,...] 
.RS 4 -output also gVCF blocks of homozygous REF calls. The parameter \fIINT\fP is the -minimum per\-sample depth required to include a site in the non\-variant -block. +output gVCF blocks of homozygous REF calls, with depth (DP) ranges +specified by the list of integers. For example, passing \fI5,15\fP will +group sites into two types of gVCF blocks, the first with minimum +per\-sample DP from the interval [5,15) and the latter with minimum +depth 15 or more. In this example, sites with minimum per\-sample +depth less than 5 will be printed as separate records, outside of +gVCF blocks. .RE .sp \fB\-i, \-\-insert\-missed\fP \fIINT\fP @@ -2083,7 +2144,7 @@ transcripts in malformatted GFFs with incorrect phase \fB\-g, \-\-gff\-annot\fP \fIFILE\fP .RS 4 GFF3 annotation file (required), such as \c -.URL "ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens" "" "." +.URL "http://ftp.ensembl.org/pub/current_gff3/homo_sapiens/" "" "." The script \fBgff2gff\fP can help with conversion from non\-standard GFF formats. An example of a minimal working GFF file: .RE @@ -2253,6 +2314,11 @@ and VCF, such as "chrX" vs "X". The chromosome names in the output VCF will matc that of the input VCF. The default is to attempt the automatic translation. .RE .sp +\fB\-v, \-\-verbose\fP \fIINT\fP +.RS 4 +verbosity level (0\-2) +.RE +.sp \fB\-W\fP[\fIFMT\fP]\fB, \-W\fP[=\fIFMT\fP]\fB, \-\-write\-index\fP[=\fIFMT\fP] .RS 4 Automatically index the output file. \fIFMT\fP is optional and can be @@ -2503,7 +2569,7 @@ Without the \fB\-g\fP option, multi\-sample cross\-check of samples in \fIquery. .sp Note that the interpretation of the discordance score depends on the options provided (specifically \fB\-e\fP and \fB\-u\fP) and on the available annotations (FORMAT/PL vs FORMAT/GT). -The discordance score can be interpreted as the number of mismatching genotypes if only GT\-vs\-GT matching is performed. 
+The discordance score can be interpreted as the number of mismatching genotypes only if GT\-vs\-GT matching is performed. .sp \fB\-\-distinctive\-sites\fP \fINUM[,MEM[,DIR]]\fP .RS 4 @@ -3138,7 +3204,8 @@ Assume the quality is in the Illumina 1.3+ encoding. .sp \fB\-A, \-\-count\-orphans\fP .RS 4 -Do not skip anomalous read pairs in variant calling. +Include anomalous read pairs in variant calling, i.e. reads with +flag PAIRED but not PROPER_PAIR set. By default such reads are discarded. .RE .sp \fB\-b, \-\-bam\-list\fP \fIFILE\fP @@ -3156,10 +3223,96 @@ by misalignments. .sp \fB\-C, \-\-adjust\-MQ\fP \fIINT\fP .RS 4 -Coefficient for downgrading mapping quality for reads containing -excessive mismatches. Given a read with a phred\-scaled probability q of -being generated from the mapped position, the new mapping quality is -about sqrt((INT\-q)/INT)*INT. A zero value (the default) disables this functionality. +Coefficient for downgrading mapping quality for reads containing +excessive mismatches. Mismatches are counted as a proportion of the +number of aligned bases ("M", "X" or "=" CIGAR operations), along with +their quality, to derive an upper\-bound of the mapping quality. +Original mapping qualities lower than this are left intact, while +higher ones are capped at the new adjusted score. +.sp +The exact formula is complex and likely tuned to specific instruments +and specific alignment tools, so this option is disabled by default +(indicated as having a zero value). Variables in the formulae and +their meaning are defined below. +.sp +.if n .RS 4 +.nf +.fam C +Variable Meaning / formula +M The number of matching CIGAR bases (operation "M", "X" or "="). +X The number of substitutions with quality >= 13. +SubQ The summed quality of substitution bases included in X, capped + at a maximum of quality 33 per mismatching base. +ClipQ The summed quality of soft\-clipped or hard\-clipped bases. This + has no minimum or maximum quality threshold per base. 
For + hard\-clipped bases the per\-base quality is taken as 13. + +T SubQ \- 10 * log10(M^X / X!) + ClipQ/5 +Cap MAX(0, INT * sqrt((INT \- T) / INT)) +.fam +.fi +.if n .RE +.sp +Some notes on the impact of this. +.sp +.RS 4 +.ie n \{\ +\h'-04'\(bu\h'+03'\c +.\} +.el \{\ +. sp -1 +. IP \(bu 2.3 +.\} +As the number of mismatches increases, the mapping quality cap +reduces, eventually resulting in discarded alignments. +.RE +.sp +.RS 4 +.ie n \{\ +\h'-04'\(bu\h'+03'\c +.\} +.el \{\ +. sp -1 +. IP \(bu 2.3 +.\} +High quality mismatches reduces the cap faster than low quality +mismatches. +.RE +.sp +.RS 4 +.ie n \{\ +\h'-04'\(bu\h'+03'\c +.\} +.el \{\ +. sp -1 +. IP \(bu 2.3 +.\} +The starting INT value also acts as a hard cap on mapping quality, +even when zero mismatches are observed. +.RE +.sp +.RS 4 +.ie n \{\ +\h'-04'\(bu\h'+03'\c +.\} +.el \{\ +. sp -1 +. IP \(bu 2.3 +.\} +Indels have no impact on the mapping quality. +.sp +The intent of this option is to work around aligners that compute a +mapping quality using a local alignment without having any regard to +the degree of clipping required or consideration of potential +contamination or large scale insertions with respect to the reference. +A record may align uniquely and have no close second match, but having +a high number of mismatches may still imply that the reference is not +the correct site. +.sp +However we do not recommend use of this parameter unless you fully +understand the impact of it and have determined that it is appropriate +for your sequencing technology. +.RE .RE .sp \fB\-D, \-\-full\-BAQ\fP @@ -3771,6 +3924,12 @@ If a record is present in multiple files, output only the first instance. Alias for \fB\-d none\fP, deprecated. .RE .sp +\fB\-e, \-\-exclude\fP \fIEXPRESSION\fP +.RS 4 +do not normalize input records for which \fIEXPRESSION\fP is true. For valid expressions see +\fBEXPRESSIONS\fP. Note that duplicate removal ignores this option. 
+.RE +.sp \fB\-f, \-\-fasta\-ref\fP \fIFILE\fP .RS 4 reference sequence. Supplying this option will turn on left\-alignment @@ -3791,6 +3950,12 @@ strand. In case of overlapping transcripts, the default mode is to left\-align description of the supported GFF3 file format see \fBbcftools csq\fP. .RE .sp +\fB\-i, \-\-include\fP \fIEXPRESSION\fP +.RS 4 +normalize only input records for which \fIEXPRESSION\fP is true. For valid expressions see +\fBEXPRESSIONS\fP. Note that duplicate removal ignores this option. +.RE +.sp \fB\-\-keep\-sum\fP \fITAG\fP[,...] .RS 4 keep vector sum constant when splitting multiallelic sites. Only AD tag @@ -3869,6 +4034,12 @@ see \fBCommon Options\fP when merging (\fI\-m+\fP), merged site is PASS only if all sites being merged PASS .RE .sp +\fB\-S, \-\-sort\fP \fIpos\fP|\fIlex\fP +.RS 4 +when splitting sites or processing duplicates, sort records on output by +POS only (\fIpos\fP, the default) or by POS and lexicographically by REF+ALT (\fIlex\fP) +.RE +.sp \fB\-t, \-\-targets\fP \fILIST\fP .RS 4 see \fBCommon Options\fP @@ -3889,6 +4060,11 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-v, \-\-verbose\fP \fIINT\fP +.RS 4 +verbosity level of GFF parsing (0\-2) +.RE +.sp \fB\-w, \-\-site\-win\fP \fIINT\fP .RS 4 maximum distance between two records to consider when locally @@ -4651,7 +4827,8 @@ fields will be still printed but instead of their actual value, \fISTR\fP will b .sp \fB\-H, \-\-print\-header\fP .RS 4 -print header +print header. By default, the header is printed with column indices, e.g. "#[1]CHROM". +These can be suppressed by giving the option twice, "\f(CR\-HH\fP". .RE .sp \fB\-i, \-\-include\fP \fIEXPRESSION\fP @@ -4753,6 +4930,7 @@ process multiple VCFs listed in the file %TBCSQ Translated FORMAT/BCSQ. See the csq command above for explanation and examples. %TGT Translated genotype (e.g. 
C/A) %TYPE Variant type (REF, SNP, MNP, INDEL, BND, OTHER) +%VKX VariantKey, biallelic hexadecimal encoding of CHROM,POS,REF,ALT (https://github.com/tecnickcom/variantkey) [] Format fields must be enclosed in brackets to loop over all samples \(rsn new line \(rst tab character @@ -6111,7 +6289,7 @@ number of samples; count of alternate alleles; minor allele count (similar to AC but is always smaller than 0.5); frequency of alternate alleles (AF=AC/AN); frequency of minor alleles (MAF=MAC/AN); number of alleles in called genotypes; number of samples with missing genotype; fraction of samples with missing genotype; -indel length (deletions negative, insertions positive) +indel length (deletions negative, insertions positive, balanced substitutions zero) .sp .if n .RS 4 .nf diff --git a/doc/bcftools.html b/doc/bcftools.html index 82080c049..390ba30d5 100644 --- a/doc/bcftools.html +++ b/doc/bcftools.html @@ -50,7 +50,7 @@

DESCRIPTION

VERSION

-

This manual page was last updated 2024-04-15 and refers to bcftools git version 1.20.

+

This manual page was last updated 2024-09-12 and refers to bcftools git version 1.21.

@@ -490,7 +490,7 @@

bcftools annotate [OPTIONS] FILE

Comma-separated list of columns or tags to carry over from the annotation file (see also -a, --annotations). If the annotation file is not a VCF/BCF, list describes the columns of the annotation file and must include CHROM, -POS (or, alternatively, FROM and TO), and optionally REF and ALT. Unused +POS (or, alternatively, FROM,TO or BEG,END), and optionally REF and ALT. Unused columns which should be ignored can be indicated by "-".  
 
@@ -514,16 +514,50 @@

bcftools annotate [OPTIONS] FILE

To append to existing values (rather than replacing or leaving untouched), use "=TAG" (instead of "TAG" or "+TAG"). To replace only existing values without modifying missing annotations, use "-TAG". +As a special case of this, if position needs to be replaced, mark the column with the new coordinate as "-POS". +(Note that in previous releases this used to be "~POS", now deprecated.) + 

To match the record also by ID or INFO/END, in addition to REF and ALT, use "~ID" or "~INFO/END". -If position needs to be replaced, mark the column with the new position as "~POS". +Note that this works only for ID and POS, for other fields see the description of -i below.  
 
If the annotation file is not a VCF/BCF, all new annotations must be defined via -h, --header-lines.  
 
-See also the -l, --merge-logic option.

+See also the -l, --merge-logic option. + 

+Summary of -c, --columns:

+ +
+
+
+
    CHROM,POS,TAG       .. match by chromosome and position, transfer annotation from TAG
+    CHROM,POS,-,TAG     .. same as above, but ignore the third column of the annotation file
+    CHROM,BEG,END,TAG   .. match by region (BEG,END are synonymous with FROM,TO)
+    CHROM,POS,REF,ALT   .. match by CHROM, POS, REF and ALT
+
+    DST_TAG:=SRC_TAG    .. transfer the SRC_TAG using the new name DST_TAG
+    INFO                .. transfer all INFO annotations
+    ^INFO/TAG           .. transfer all INFO annotations except "TAG"
+
+    TAG       .. add or overwrite existing target value if source is not "." and skip otherwise
+    +TAG      .. add or overwrite existing target value only if it is "."
+    .TAG      .. add or overwrite existing target value even if source is "."
+    .+TAG     .. add new but never overwrite existing tag, regardless of its value; can transfer "." if target does not exist
+    -TAG      .. overwrite existing value, never add new if target does not exist
+    =TAG      .. do not overwrite but append value to existing tags
+
+    ~FIELD    .. use this column to match lines with -i/-e expression (see the description of -i below)
+    ~ID       .. in addition to CHROM,POS,REF,ALT match also by ID
+    ~INFO/END .. in addition to CHROM,POS,REF,ALT match also by INFO/END
+
+
+
+
-C, --columns-file file

Read the list of columns from a file (normally given via the -c, --columns option). @@ -535,7 +569,7 @@

bcftools annotate [OPTIONS] FILE

-e, --exclude EXPRESSION

exclude sites for which EXPRESSION is true. For valid expressions see -EXPRESSIONS.

+EXPRESSIONS and the extension described in -i, --include below.

--force
@@ -576,8 +610,27 @@

bcftools annotate [OPTIONS] FILE

-i, --include EXPRESSION

include only sites for which EXPRESSION is true. For valid expressions see -EXPRESSIONS.

+EXPRESSIONS. + 

+Additionally, the command bcftools annotate supports expressions updated from the annotation +file dynamically for each record:

+
+
+
+
+
    # The field 'STR' from the -a file is required to match INFO/TAG in VCF. In the first example
+    # the alleles REF,ALT must match, in the second example they are ignored. The option -k is required
+    # to output also records that are not annotated. The third example shows the same concept with
+    # a numerical expression.
+    bcftools annotate -a annots.tsv.gz -c CHROM,POS,REF,ALT,SCORE,~STR -i'TAG={STR}' -k input.vcf
+    bcftools annotate -a annots.tsv.gz -c CHROM,POS,-,-,SCORE,~STR     -i'TAG={STR}' -k input.vcf
+    bcftools annotate -a annots.tsv.gz -c CHROM,POS,-,-,SCORE,~INT     -i'TAG>{INT}' -k input.vcf
+
+
+
+
-k, --keep-sites

keep sites which do not pass -i and -e expressions instead of discarding them

@@ -882,11 +935,15 @@

Input/output options:

in low coverage data this inflates the rate of false positives.) The -G option requires the presence of per-sample FORMAT/QS or FORMAT/AD tag generated with bcftools mpileup -a QS (or -a AD).

-
-g, --gvcf INT
+
-g, --gvcf INT[,…​]
-

output also gVCF blocks of homozygous REF calls. The parameter INT is the -minimum per-sample depth required to include a site in the non-variant -block.

+

output gVCF blocks of homozygous REF calls, with depth (DP) ranges +specified by the list of integers. For example, passing 5,15 will +group sites into two types of gVCF blocks, the first with minimum +per-sample DP from the interval [5,15) and the latter with minimum +depth 15 or more. In this example, sites with minimum per-sample +depth less than 5 will be printed as separate records, outside of +gVCF blocks.

-i, --insert-missed INT
@@ -1814,7 +1871,7 @@

bcftools csq [OPTIONS] FILE

-g, --gff-annot FILE
-

GFF3 annotation file (required), such as ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens. +

GFF3 annotation file (required), such as http://ftp.ensembl.org/pub/current_gff3/homo_sapiens/. The script gff2gff can help with conversion from non-standard GFF formats. An example of a minimal working GFF file:

@@ -1966,6 +2023,10 @@

bcftools csq [OPTIONS] FILE

and VCF, such as "chrX" vs "X". The chromosome names in the output VCF will match that of the input VCF. The default is to attempt the automatic translation.

+
-v, --verbose INT
+
+

verbosity level (0-2)

+
-W[FMT], -W[=FMT], --write-index[=FMT]

Automatically index the output file. FMT is optional and can be @@ -2203,7 +2264,7 @@

bcftools gtcheck [OPTIONS] [-g ge

Note that the interpretation of the discordance score depends on the options provided (specifically -e and -u) and on the available annotations (FORMAT/PL vs FORMAT/GT). -The discordance score can be interpreted as the number of mismatching genotypes if only GT-vs-GT matching is performed.

+The discordance score can be interpreted as the number of mismatching genotypes only if GT-vs-GT matching is performed.

@@ -2806,7 +2867,8 @@

Input options

-A, --count-orphans
-

Do not skip anomalous read pairs in variant calling.

+

Include anomalous read pairs in variant calling, i.e. reads with +flag PAIRED but not PROPER_PAIR set. By default such reads are discarded.

-b, --bam-list FILE
@@ -2821,10 +2883,69 @@

Input options

-C, --adjust-MQ INT
-

Coefficient for downgrading mapping quality for reads containing -excessive mismatches. Given a read with a phred-scaled probability q of -being generated from the mapped position, the new mapping quality is -about sqrt((INT-q)/INT)*INT. A zero value (the default) disables this functionality.

+

Coefficient for downgrading mapping quality for reads containing +excessive mismatches. Mismatches are counted as a proportion of the +number of aligned bases ("M", "X" or "=" CIGAR operations), along with +their quality, to derive an upper-bound of the mapping quality. +Original mapping qualities lower than this are left intact, while +higher ones are capped at the new adjusted score.

+
+

The exact formula is complex and likely tuned to specific instruments +and specific alignment tools, so this option is disabled by default +(indicated as having a zero value). Variables in the formulae and +their meaning are defined below.

+
+
+
+
Variable    Meaning / formula
+M           The number of matching CIGAR bases (operation "M", "X" or "=").
+X           The number of substitutions with quality >= 13.
+SubQ        The summed quality of substitution bases included in X, capped
+            at a maximum of quality 33 per mismatching base.
+ClipQ       The summed quality of soft-clipped or hard-clipped bases. This
+            has no minimum or maximum quality threshold per base.  For
+            hard-clipped bases the per-base quality is taken as 13.
+
+T           SubQ - 10 * log10(M^X / X!) + ClipQ/5
+Cap         MAX(0, INT * sqrt((INT - T) / INT))
+
+
+
+

Some notes on the impact of this.

+
+
+
    +
  • +

    As the number of mismatches increases, the mapping quality cap +reduces, eventually resulting in discarded alignments.

    +
  • +
  • +

    High quality mismatches reduce the cap faster than low quality +mismatches.

    +
  • +
  • +

    The starting INT value also acts as a hard cap on mapping quality, +even when zero mismatches are observed.

    +
  • +
  • +

    Indels have no impact on the mapping quality.

    +
    +

    The intent of this option is to work around aligners that compute a +mapping quality using a local alignment without having any regard to +the degree of clipping required or consideration of potential +contamination or large scale insertions with respect to the reference. +A record may align uniquely and have no close second match, but having +a high number of mismatches may still imply that the reference is not +the correct site.

    +
    +
    +

    However we do not recommend use of this parameter unless you fully +understand the impact of it and have determined that it is appropriate +for your sequencing technology.

    +
    +
  • +
+
-D, --full-BAQ
@@ -3370,6 +3491,11 @@

bcftools norm [OPTIONS] file.vcf.gz

If a record is present in multiple files, output only the first instance. Alias for -d none, deprecated.

+
-e, --exclude EXPRESSION
+
+

do not normalize input records for which EXPRESSION is true. For valid expressions see +EXPRESSIONS. Note that duplicate removal ignores this option.

+
-f, --fasta-ref FILE

reference sequence. Supplying this option will turn on left-alignment @@ -3387,6 +3513,11 @@

bcftools norm [OPTIONS] file.vcf.gz

strand. In case of overlapping transcripts, the default mode is to left-align the variant. For a description of the supported GFF3 file format see bcftools csq.

+
-i, --include EXPRESSION
+
+

normalize only input records for which EXPRESSION is true. For valid expressions see +EXPRESSIONS. Note that duplicate removal ignores this option.

+
--keep-sum TAG[,…​]

keep vector sum constant when splitting multiallelic sites. Only AD tag @@ -3450,6 +3581,11 @@

bcftools norm [OPTIONS] file.vcf.gz

when merging (-m+), merged site is PASS only if all sites being merged PASS

+
-S, --sort pos|lex
+
+

when splitting sites or processing duplicates, sort records on output by +POS only (pos, the default) or by POS and lexicographically by REF+ALT (lex)

+
-t, --targets LIST

see Common Options

@@ -3466,6 +3602,10 @@

bcftools norm [OPTIONS] file.vcf.gz

see Common Options

+
-v, --verbose INT
+
+

verbosity level of GFF parsing (0-2)

+
-w, --site-win INT

maximum distance between two records to consider when locally @@ -4027,7 +4167,8 @@

bcftools query [OPTIONS] file.vcf.gz [file.

-H, --print-header
-

print header

+

print header. By default, the header is printed with column indices, e.g. "#[1]CHROM". +These can be suppressed by giving the option twice, "-HH".

-i, --include EXPRESSION
@@ -4116,6 +4257,7 @@

Format:

%TBCSQ Translated FORMAT/BCSQ. See the csq command above for explanation and examples. %TGT Translated genotype (e.g. C/A) %TYPE Variant type (REF, SNP, MNP, INDEL, BND, OTHER) +%VKX VariantKey, biallelic hexadecimal encoding of CHROM,POS,REF,ALT (https://github.com/tecnickcom/variantkey) [] Format fields must be enclosed in brackets to loop over all samples \n new line \t tab character @@ -5211,7 +5353,7 @@

FILTERING EXPRESSIONS

AC but is always smaller than 0.5); frequency of alternate alleles (AF=AC/AN); frequency of minor alleles (MAF=MAC/AN); number of alleles in called genotypes; number of samples with missing genotype; fraction of samples with missing genotype; -indel length (deletions negative, insertions positive)

+indel length (deletions negative, insertions positive, balanced substitutions zero)

N_ALT, N_SAMPLES, AC, MAC, AF, MAF, AN, N_MISSING, F_MISSING, ILEN
@@ -5508,7 +5650,7 @@

COPYING

diff --git a/doc/bcftools.txt b/doc/bcftools.txt index ad7187af5..a5f4c3fd2 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -354,7 +354,7 @@ Add or remove annotations. Comma-separated list of columns or tags to carry over from the annotation file (see also *-a, --annotations*). If the annotation file is not a VCF/BCF, 'list' describes the columns of the annotation file and must include CHROM, - POS (or, alternatively, FROM and TO), and optionally REF and ALT. Unused + POS (or, alternatively, FROM,TO or BEG,END), and optionally REF and ALT. Unused columns which should be ignored can be indicated by "-". {nbsp} + {nbsp} + @@ -378,8 +378,12 @@ Add or remove annotations. To append to existing values (rather than replacing or leaving untouched), use "=TAG" (instead of "TAG" or "+TAG"). To replace only existing values without modifying missing annotations, use "-TAG". + As a special case of this, if position needs to be replaced, mark the column with the new coordinate as "-POS". + (Note that in previous releases this used to be "~POS", now deprecated.) + {nbsp} + + {nbsp} + To match the record also by ID or INFO/END, in addition to REF and ALT, use "~ID" or "~INFO/END". - If position needs to be replaced, mark the column with the new position as "~POS". + Note that this works only for ID and POS, for other fields see the description of *-i* below. {nbsp} + {nbsp} + If the annotation file is not a VCF/BCF, all new annotations must be @@ -387,6 +391,30 @@ Add or remove annotations. {nbsp} + {nbsp} + See also the *-l, --merge-logic* option. + {nbsp} + + {nbsp} + + *Summary of `-c, --columns`:* +---- + CHROM,POS,TAG .. match by chromosome and position, transfer annotation from TAG + CHROM,POS,-,TAG .. same as above, but ignore the third column of the annotation file + CHROM,BEG,END,TAG .. match by region (BEG,END are synonymous to FROM,TO) + CHROM,POS,REF,ALT .. match by CHROM, POS, REF and ALT + + DST_TAG:=SRC_TAG .. 
transfer the SRC_TAG using the new name DST_TAG + INFO .. transfer all INFO annotations + ^INFO/TAG .. transfer all INFO annotations except "TAG" + + TAG .. add or overwrite existing target value if source is not "." and skip otherwise + +TAG .. add or overwrite existing target value only if it is "." + .TAG .. add or overwrite existing target value even if source is "." + .+TAG .. add new but never overwrite existing tag, regardless of its value; can transfer "." if target does not exist + -TAG .. overwrite existing value, never add new if target does not exist + =TAG .. do not overwrite but append value to existing tags + + ~FIELD .. use this column to match lines with -i/-e expression (see the description of -i below) + ~ID .. in addition to CHROM,POS,REF,ALT match also by ID + ~INFO/END .. in addition to CHROM,POS,REF,ALT match also by INFO/END +---- *-C, --columns-file* 'file':: Read the list of columns from a file (normally given via the *-c, --columns* option). @@ -397,7 +425,7 @@ Add or remove annotations. *-e, --exclude* 'EXPRESSION':: exclude sites for which 'EXPRESSION' is true. For valid expressions see - *<>*. + *<>* and the extension described in *-i, --include* below. *--force*:: continue even when parsing errors, such as undefined tags, are encountered. Note @@ -423,6 +451,19 @@ Add or remove annotations. *-i, --include* 'EXPRESSION':: include only sites for which 'EXPRESSION' is true. For valid expressions see *<>*. + {nbsp} + + {nbsp} + + Additionally, the command *bcftools annotate* supports expressions updated from the annotation + file dynamically for each record: +---- + # The field 'STR' from the -a file is required to match INFO/TAG in VCF. In the first example + # the alleles REF,ALT must match, in the second example they are ignored. The option -k is required + # to output also records that are not annotated. The third example shows the same concept with + # a numerical expression. 
+ bcftools annotate -a annots.tsv.gz -c CHROM,POS,REF,ALT,SCORE,~STR -i'TAG={STR}' -k input.vcf + bcftools annotate -a annots.tsv.gz -c CHROM,POS,-,-,SCORE,~STR -i'TAG={STR}' -k input.vcf + bcftools annotate -a annots.tsv.gz -c CHROM,POS,-,-,SCORE,~INT -i'TAG>{INT}' -k input.vcf +---- *-k, --keep-sites*:: keep sites which do not pass *-i* and *-e* expressions instead of discarding them @@ -666,10 +707,15 @@ demand. The original calling model can be invoked with the *-c* option. in low coverage data this inflates the rate of false positives.) The *-G* option requires the presence of per-sample FORMAT/QS or FORMAT/AD tag generated with *bcftools mpileup -a QS* (or *-a AD*). -*-g, --gvcf* 'INT':: - output also gVCF blocks of homozygous REF calls. The parameter 'INT' is the - minimum per-sample depth required to include a site in the non-variant - block. +*-g, --gvcf* 'INT'[,...]:: + output gVCF blocks of homozygous REF calls, with depth (DP) ranges + specified by the list of integers. For example, passing '5,15' will + group sites into two types of gVCF blocks, the first with minimum + per-sample DP from the interval [5,15) and the latter with minimum + depth 15 or more. In this example, sites with minimum per-sample + depth less than 5 will be printed as separate records, outside of + gVCF blocks. + *-i, --insert-missed* 'INT':: output also sites missed by mpileup but present in *-T, --targets-file*. @@ -1339,7 +1385,7 @@ output VCF and are ignored for the prediction analysis. transcripts in malformatted GFFs with incorrect phase *-g, --gff-annot* 'FILE':: - GFF3 annotation file (required), such as ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens. + GFF3 annotation file (required), such as http://ftp.ensembl.org/pub/current_gff3/homo_sapiens/. The script *<>* can help with conversion from non-standard GFF formats. An example of a minimal working GFF file: ---- @@ -1459,6 +1505,9 @@ output VCF and are ignored for the prediction analysis. 
and VCF, such as "chrX" vs "X". The chromosome names in the output VCF will match that of the input VCF. The default is to attempt the automatic translation. +*-v, --verbose* 'INT':: + verbosity level (0-2) + *-W*['FMT']*, -W*[='FMT']*, --write-index*[='FMT']:: Automatically index the output file. 'FMT' is optional and can be one of "tbi" or "csi" depending on output file format. @@ -1645,7 +1694,7 @@ Without the *-g* option, multi-sample cross-check of samples in 'query.vcf.gz' i Note that the interpretation of the discordance score depends on the options provided (specifically *-e* and *-u*) and on the available annotations (FORMAT/PL vs FORMAT/GT). -The discordance score can be interpreted as the number of mismatching genotypes if only GT-vs-GT matching is performed. +The discordance score can be interpreted as the number of mismatching genotypes only if GT-vs-GT matching is performed. *--distinctive-sites* 'NUM[,MEM[,DIR]]':: Find sites that can distinguish between at least NUM sample pairs. If the number is smaller or equal to 1, @@ -2128,7 +2177,8 @@ multiple regions and many alignment files are processed. Assume the quality is in the Illumina 1.3+ encoding. *-A, --count-orphans*:: - Do not skip anomalous read pairs in variant calling. + Include anomalous read pairs in variant calling, i.e. reads with + flag PAIRED but not PROPER_PAIR set. By default such reads are discarded. *-b, --bam-list* 'FILE':: List of input alignment files, one file per line [null] @@ -2140,10 +2190,53 @@ multiple regions and many alignment files are processed. by misalignments. *-C, --adjust-MQ* 'INT':: - Coefficient for downgrading mapping quality for reads containing - excessive mismatches. Given a read with a phred-scaled probability q of - being generated from the mapped position, the new mapping quality is - about sqrt((INT-q)/INT)*INT. A zero value (the default) disables this functionality. 
+ Coefficient for downgrading mapping quality for reads containing + excessive mismatches. Mismatches are counted as a proportion of the + number of aligned bases ("M", "X" or "=" CIGAR operations), along with + their quality, to derive an upper-bound of the mapping quality. + Original mapping qualities lower than this are left intact, while + higher ones are capped at the new adjusted score. ++ +The exact formula is complex and likely tuned to specific instruments +and specific alignment tools, so this option is disabled by default +(indicated as having a zero value). Variables in the formulae and +their meaning are defined below. ++ +---- +Variable Meaning / formula +M The number of matching CIGAR bases (operation "M", "X" or "="). +X The number of substitutions with quality >= 13. +SubQ The summed quality of substitution bases included in X, capped + at a maximum of quality 33 per mismatching base. +ClipQ The summed quality of soft-clipped or hard-clipped bases. This + has no minimum or maximum quality threshold per base. For + hard-clipped bases the per-base quality is taken as 13. + +T SubQ - 10 * log10(M^X / X!) + ClipQ/5 +Cap MAX(0, INT * sqrt((INT - T) / INT)) +---- ++ +Some notes on the impact of this. ++ +- As the number of mismatches increases, the mapping quality cap + reduces, eventually resulting in discarded alignments. +- High quality mismatches reduce the cap faster than low quality + mismatches. +- The starting INT value also acts as a hard cap on mapping quality, + even when zero mismatches are observed. +- Indels have no impact on the mapping quality. ++ +The intent of this option is to work around aligners that compute a +mapping quality using a local alignment without having any regard to +the degree of clipping required or consideration of potential +contamination or large scale insertions with respect to the reference. 
+A record may align uniquely and have no close second match, but having +a high number of mismatches may still imply that the reference is not +the correct site. ++ +However we do not recommend use of this parameter unless you fully +understand the impact of it and have determined that it is appropriate +for your sequencing technology. *-D, --full-BAQ*:: Run the BAQ algorithm on all reads, not just those in problematic @@ -2564,6 +2657,10 @@ the *<>* option is supplied. If a record is present in multiple files, output only the first instance. Alias for *-d none*, deprecated. +*-e, --exclude* 'EXPRESSION':: + do not normalize input records for which 'EXPRESSION' is true. For valid expressions see + *<>*. Note that duplicate removal ignores this option. + *-f, --fasta-ref* 'FILE'[[fasta_ref]]:: reference sequence. Supplying this option will turn on left-alignment and normalization, however, see also the *<>* @@ -2578,6 +2675,10 @@ the *<>* option is supplied. strand. In case of overlapping transcripts, the default mode is to left-align the variant. For a description of the supported GFF3 file format see *<>*. +*-i, --include* 'EXPRESSION':: + normalize only input records for which 'EXPRESSION' is true. For valid expressions see + *<>*. Note that duplicate removal ignores this option. + *--keep-sum* 'TAG'[,...]:: keep vector sum constant when splitting multiallelic sites. Only AD tag is currently supported. See also https://github.com/samtools/bcftools/issues/360 @@ -2629,6 +2730,10 @@ the *<>* option is supplied. *-s, --strict-filter*:: when merging ('-m+'), merged site is PASS only if all sites being merged PASS +*-S, --sort* 'pos'|'lex':: + when splitting sites or processing duplicates, sort records on output by + POS only ('pos', the default) or by POS and lexicographically by REF+ALT ('lex') + *-t, --targets* 'LIST':: see *<>* @@ -2641,6 +2746,9 @@ the *<>* option is supplied. 
*--threads* 'INT':: see *<>* +*-v, --verbose* 'INT':: + verbosity level of GFF parsing (0-2) + *-w, --site-win* 'INT':: maximum distance between two records to consider when locally sorting variants which changed position during the realignment @@ -3029,7 +3137,8 @@ Extracts fields from VCF or BCF files and outputs them in user-defined format. fields will be still printed but instead of their actual value, 'STR' will be used. *-H, --print-header*:: - print header + print header. By default, the header is printed with column indices, e.g. "#[1]CHROM". + These can be suppressed by giving the option twice, "`-HH`". *-i, --include* 'EXPRESSION':: include only sites for which 'EXPRESSION' is true. For valid expressions see @@ -3100,6 +3209,7 @@ Extracts fields from VCF or BCF files and outputs them in user-defined format. %TBCSQ Translated FORMAT/BCSQ. See the csq command above for explanation and examples. %TGT Translated genotype (e.g. C/A) %TYPE Variant type (REF, SNP, MNP, INDEL, BND, OTHER) + %VKX VariantKey, biallelic hexadecimal encoding of CHROM,POS,REF,ALT (https://github.com/tecnickcom/variantkey) [] Format fields must be enclosed in brackets to loop over all samples \n new line \t tab character @@ -3876,7 +3986,7 @@ number of samples; count of alternate alleles; minor allele count (similar to AC but is always smaller than 0.5); frequency of alternate alleles (AF=AC/AN); frequency of minor alleles (MAF=MAC/AN); number of alleles in called genotypes; number of samples with missing genotype; fraction of samples with missing genotype; -indel length (deletions negative, insertions positive) +indel length (deletions negative, insertions positive, balanced substitutions zero) N_ALT, N_SAMPLES, AC, MAC, AF, MAF, AN, N_MISSING, F_MISSING, ILEN diff --git a/edlib.h b/edlib.h index acce6b20e..1f5eca192 100644 --- a/edlib.h +++ b/edlib.h @@ -200,7 +200,7 @@ extern "C" { * 1 stands for insertion to target. * 2 stands for insertion to query. * 3 stands for mismatch. 
- * Alignment aligns query to target from begining of query till end of query. + * Alignment aligns query to target from beginning of query till end of query. * If gaps are not penalized, they are not in alignment. * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free(). */ diff --git a/filter.c b/filter.c index 1c6dc0f04..c9dcd023b 100644 --- a/filter.c +++ b/filter.c @@ -45,6 +45,11 @@ THE SOFTWARE. */ #include "bcftools.h" #if ENABLE_PERL_FILTERS +// Work around clang warning problems +# if defined(__clang__) +# define PERL_GCC_BRACE_GROUPS_FORBIDDEN +# endif + # define filter_t perl_filter_t # include # include @@ -68,7 +73,7 @@ typedef struct _token_t char *tag; // for debugging and printout only, VCF tag name double threshold; // filtering threshold int is_constant; // the threshold is set - int hdr_id, tag_type; // BCF header lookup ID and one of BCF_HL_* types + int hdr_id, hl_type, ht_type; // BCF header lookup ID and one of BCF_HL_* types and BCF_HT_* types int idx; // 0-based index to VCF vectors, // -2: list (e.g. [0,1,2] or [1..3] or [1..] 
or any field[*], which is equivalent to [0..]) // -3: select indices on the fly based on values in GT @@ -81,11 +86,12 @@ typedef struct _token_t void (*comparator)(struct _token_t *, struct _token_t *, struct _token_t *rtok, bcf1_t *); void *hash; // test presence of str value in the hash via comparator regex_t *regex; // precompiled regex for string comparison + int iext; // for the use with filter_test_ext(), 1-based index to external values, 0=don't use // modified on filter evaluation at each VCF line double *values; kstring_t str_value; - int is_str, is_missing; // is_missing is set only for constants, variables are controled via nvalues + int is_str, is_missing; // is_missing is set only for constants, variables are controlled via nvalues int pass_site; // -1 not applicable, 0 fails, >0 pass uint8_t *pass_samples; // status of individual samples int nvalues, mvalues; // number of used values: n=0 for missing values, n=1 for scalars, for strings n=str_value.l @@ -114,6 +120,8 @@ struct _filter_t char **undef_tag, **used_tag; int nundef_tag, nused_tag; int status, exit_on_error; + int n_ext; // number of external values to fill via filter_test_ext() + int *ext; // types of external values to fill via filter_test_ext() }; @@ -158,10 +166,11 @@ struct _filter_t #define TOK_IN 38 // contains, e.g. FILTER~"A" #define TOK_NOT_IN 39 // does not contain, e.g. FILTER!~"A" #define TOK_MODULO 40 // % +#define TOK_EXT 41 // external values set before each filter_test_ext() call, can be one of {},{str},{int},{float} -// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 +// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 // ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . 
l f c p b P i s % -static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7 }; +static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 0}; #define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcpis" // this is only for debugging, not maintained diligently static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok); @@ -225,6 +234,10 @@ static int filters_next_token(char **str, int *len) if ( !strncasecmp(tmp,"N_PASS(",7) ) { *len = 6; (*str) += 6; return -TOK_FUNC; } if ( !strncasecmp(tmp,"F_PASS(",7) ) { *len = 6; (*str) += 6; return -TOK_FUNC; } if ( !strncasecmp(tmp,"%ILEN",5) ) { *len = 5; return TOK_VAL; } // to be able to distinguish between INFO/ILEN and on-the-fly ILEN + if ( !strncasecmp(tmp,"{}",2) ) { *len = 2; return TOK_EXT; } + if ( !strncasecmp(tmp,"{STR}",5) ) { *len = 5; return TOK_EXT; } + if ( !strncasecmp(tmp,"{INT}",5) ) { *len = 5; return TOK_EXT; } + if ( !strncasecmp(tmp,"{FLOAT}",7) ) { *len = 7; return TOK_EXT; } if ( tmp[0]=='@' ) // file name { @@ -698,34 +711,48 @@ static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t * { token_t *tmp = atok; atok = btok; btok = tmp; } - if ( atok->hash ) + + char *id = line->d.id; + int pass = 0; + + while ( id ) { - if ( rtok->tok_type!=TOK_EQ && rtok->tok_type!=TOK_NE ) - error("Only == and != operators are supported for strings read from a file\n"); + char *ep = strchr(id,';'); + if ( ep ) *ep = 0; - int ret = khash_str2int_has_key(atok->hash, line->d.id); - if ( rtok->tok_type==TOK_NE ) ret = ret ? 
0 : 1; - rtok->pass_site = ret; - return; - } + if ( atok->hash ) + { + if ( rtok->tok_type!=TOK_EQ && rtok->tok_type!=TOK_NE ) + error("Only == and != operators are supported for strings read from a file\n"); - if ( !btok->str_value.l ) error("Error occurred while evaluating the expression\n"); + pass = khash_str2int_has_key(atok->hash, id); + } + else + { + if ( !btok->str_value.l ) error("Error occurred while evaluating the expression\n"); - if ( rtok->tok_type==TOK_EQ ) - rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 0 : 1; - else if ( rtok->tok_type==TOK_NE ) - rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 1 : 0; - else - { - if ( rtok->tok_type!=TOK_LIKE && rtok->tok_type!=TOK_NLIKE ) - error("Only the following operators are supported for querying ID: ==, !=, ~, !~; the operator type %d is not supported (%p %p)\n", - rtok->tok_type,atok->regex,btok->regex); + if ( rtok->tok_type==TOK_EQ || rtok->tok_type==TOK_NE ) + pass = strcmp(btok->str_value.s,id) ? 0 : 1; + else + { + if ( rtok->tok_type!=TOK_LIKE && rtok->tok_type!=TOK_NLIKE ) + error("Only the following operators are supported for querying ID: ==, !=, ~, !~; the operator type %d is not supported (%p %p)\n", + rtok->tok_type,atok->regex,btok->regex); - regex_t *regex = atok->regex ? atok->regex : (btok->regex ? btok->regex : NULL); - if ( !regex ) error("fixme: regex initialization failed\n"); - rtok->pass_site = regexec(regex,line->d.id, 0,NULL,0) ? 0 : 1; - if ( rtok->tok_type==TOK_NLIKE ) rtok->pass_site = rtok->pass_site ? 0 : 1; + regex_t *regex = atok->regex ? atok->regex : (btok->regex ? btok->regex : NULL); + if ( !regex ) error("fixme: regex initialization failed\n"); + pass = regexec(regex,id, 0,NULL,0) ? 0 : 1; + } + } + if ( ep ) + { + *ep = ';'; + id = ep + 1; + } + if ( pass || !ep ) break; } + if ( rtok->tok_type==TOK_NE || rtok->tok_type==TOK_NE) pass = pass ? 
0 : 1; + rtok->pass_site = pass; } /** @@ -733,7 +760,7 @@ static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t * * @line: BCF line * @info_id: tag ID, as returned by bcf_hdr_id2int * @ivec: 0-based index to retrieve, -1 when single value is expected - * @vptr: pointer to memory location of sufficient size to accomodate + * @vptr: pointer to memory location of sufficient size to accommodate * info_id's type * * The returned value is -1 if tag is not present, 0 if present but @@ -1291,9 +1318,13 @@ static void filters_set_ilen(filter_t *flt, bcf1_t *line, token_t *tok) int i, rlen = strlen(line->d.allele[0]); for (i=1; in_allele; i++) { + if ( line->d.allele[i][0]=='<' ) + { + bcf_double_set_missing(tok->values[i-1]); + continue; + } int alen = strlen(line->d.allele[i]); - if ( rlen==alen ) bcf_double_set_missing(tok->values[i-1]); - else tok->values[i-1] = alen - rlen; + tok->values[i-1] = alen - rlen; } } static void filters_set_ref_string(filter_t *flt, bcf1_t *line, token_t *tok) @@ -2005,19 +2036,6 @@ static int func_strlen(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **sta } return 1; } -static inline double calc_binom(int na, int nb) -{ - if ( na==0 && nb==0 ) return -1; - if ( na==nb ) return 1; - - // kfunc.h implements kf_betai, which is the regularized beta function P(X<=k/N;p) = I_{1-p}(N-k,k+1) - - double pval = na < nb ? kf_betai(nb, na + 1, 0.5) : kf_betai(na, nb + 1, 0.5); - pval *= 2; - if ( pval>1 ) pval = 1; // this can happen, machine precision error, eg. 
kf_betai(1,0,0.5) - - return pval; -} static int func_binom(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { int i, istack = nstack - rtok->nargs; @@ -2085,7 +2103,7 @@ static int func_binom(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stac bcf_double_set_missing(rtok->values[i]); continue; } - rtok->values[i] = calc_binom(vals[idx1],vals[idx2]); + rtok->values[i] = calc_binom_two_sided(vals[idx1],vals[idx2],0.5); if ( rtok->values[i] < 0 ) { bcf_double_set_missing(rtok->values[i]); @@ -2109,7 +2127,7 @@ static int func_binom(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stac bcf_double_set_missing(rtok->values[i]); continue; } - rtok->values[i] = calc_binom(ptr1[0],ptr2[0]); + rtok->values[i] = calc_binom_two_sided(ptr1[0],ptr2[0],0.5); if ( rtok->values[i] < 0 ) { bcf_double_set_missing(rtok->values[i]); @@ -2148,7 +2166,7 @@ static int func_binom(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stac bcf_double_set_missing(rtok->values[0]); else { - rtok->values[0] = calc_binom(ptr1[0],ptr2[0]); + rtok->values[0] = calc_binom_two_sided(ptr1[0],ptr2[0],0.5); if ( rtok->values[0] < 0 ) bcf_double_set_missing(rtok->values[0]); } @@ -2429,7 +2447,7 @@ static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token // The problem is that the implementation truncates the number of fields, filling // usually fewer than the original number of per-sample values. 
This is fixed by // adding an exception that makes the code aware of this: the GT indexing can be -// recognised by haveing tok->idx==-3 +// recognised by having tok->idx==-3 #define CMP_VECTORS(atok,btok,_rtok,CMP_OP,missing_logic) \ { \ token_t *rtok = _rtok; \ @@ -2935,9 +2953,26 @@ static int max_ac_an_unpack(bcf_hdr_t *hdr) return BCF_UN_INFO; } +static int filters_init1_ext(filter_t *filter, char *str, int len, token_t *tok) +{ + tok->hl_type = -1; + tok->ht_type = -1; + tok->tok_type = TOK_VAL; + tok->hdr_id = -1; + tok->pass_site = -1; + tok->idx = 0; + tok->iext = ++filter->n_ext; + filter->ext = realloc(filter->ext,sizeof(*filter->ext)*filter->n_ext); + if ( !strncasecmp(str,"{str}",len) ) { tok->ht_type = BCF_HT_STR; tok->is_str = 1; } + else if ( !strncasecmp(str,"{int}",len) ) tok->ht_type = BCF_HT_INT; + else if ( !strncasecmp(str,"{float}",len) ) tok->ht_type = BCF_HT_REAL; + filter->ext[filter->n_ext-1] = tok->ht_type; + return 0; +} static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) { - tok->tag_type = -1; + tok->ht_type = -1; + tok->hl_type = -1; tok->tok_type = TOK_VAL; tok->hdr_id = -1; tok->pass_site = -1; @@ -2954,6 +2989,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) memcpy(tok->key,str+1,len-2); tok->key[len-2] = 0; tok->is_str = 1; + tok->ht_type = BCF_HT_STR; tok->nvalues = len-2; if ( !strcmp(".",tok->key) ) tok->is_missing = 1; return 0; @@ -2995,6 +3031,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) { tok->setter = filters_set_qual; tok->tag = strdup("QUAL"); + tok->ht_type = BCF_HT_REAL; filter_add_used_tag(filter,NULL,tok->tag); return 0; } @@ -3002,6 +3039,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) { tok->setter = filters_set_type; tok->tag = strdup("TYPE"); + tok->ht_type = BCF_HT_STR; return 0; } else if ( !strncasecmp(str,"FILTER",len) || !strncmp(str,"%FILTER",len) /* for backward compatibility */ 
) @@ -3009,7 +3047,8 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) tok->comparator = filters_cmp_filter; tok->tag = strdup("FILTER"); filter->max_unpack |= BCF_UN_FLT; - tok->tag_type = BCF_HL_FLT; + tok->hl_type = BCF_HL_FLT; + tok->ht_type = BCF_HT_STR; filter_add_used_tag(filter,NULL,tok->tag); return 0; } @@ -3017,6 +3056,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) { tok->comparator = filters_cmp_id; tok->tag = strdup("ID"); + tok->ht_type = BCF_HT_STR; filter_add_used_tag(filter,NULL,tok->tag); return 0; } @@ -3024,6 +3064,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) { tok->setter = &filters_set_chrom; tok->tag = strdup("CHROM"); + tok->ht_type = BCF_HT_STR; filter_add_used_tag(filter,NULL,tok->tag); return 0; } @@ -3031,6 +3072,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) { tok->setter = &filters_set_pos; tok->tag = strdup("POS"); + tok->ht_type = BCF_HT_INT; filter_add_used_tag(filter,NULL,tok->tag); return 0; } @@ -3039,6 +3081,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) tok->setter = &filters_set_ref_string; tok->is_str = 1; tok->tag = strdup("REF"); + tok->ht_type = BCF_HT_STR; filter_add_used_tag(filter,NULL,tok->tag); return 0; } @@ -3047,6 +3090,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) tok->setter = &filters_set_alt_string; tok->is_str = 1; tok->tag = strdup("ALT"); + tok->ht_type = BCF_HT_STR; tok->idxs = (int*) malloc(sizeof(int)); tok->idxs[0] = -1; tok->nidxs = 1; @@ -3058,6 +3102,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) { tok->setter = &filters_set_nalt; tok->tag = strdup("N_ALT"); + tok->ht_type = BCF_HT_INT; return 0; } else if ( !strncasecmp(str,"N_SAMPLES",len) ) @@ -3065,6 +3110,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) tok->tok_type = 
TOK_VAL; tok->threshold = bcf_hdr_nsamples(filter->hdr); tok->is_constant = 1; + tok->ht_type = BCF_HT_INT; return 0; } else if ( !strncasecmp(str,"N_MISSING",len) ) @@ -3072,6 +3118,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) filter->max_unpack |= BCF_UN_FMT; tok->setter = &filters_set_nmissing; tok->tag = strdup("N_MISSING"); + tok->ht_type = BCF_HT_INT; return 0; } else if ( !strncasecmp(str,"F_MISSING",len) ) @@ -3079,6 +3126,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) filter->max_unpack |= BCF_UN_FMT; tok->setter = &filters_set_nmissing; tok->tag = strdup("F_MISSING"); + tok->ht_type = BCF_HT_REAL; return 0; } } @@ -3119,13 +3167,14 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) for (i=0; insamples; i++) tok->usmpl[i] = 1; } - tok->tag_type = is_fmt ? BCF_HL_FMT : BCF_HL_INFO; + tok->hl_type = is_fmt ? BCF_HL_FMT : BCF_HL_INFO; if ( is_fmt ) filter->max_unpack |= BCF_UN_FMT; if ( tok->hdr_id>=0 ) { if ( is_fmt && !strcmp("GT",tmp.s) ) { tok->setter = &filters_set_genotype_string; tok->is_str = 1; + tok->ht_type = BCF_HT_STR; } else if ( is_fmt ) { @@ -3140,9 +3189,9 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) } switch ( bcf_hdr_id2type(filter->hdr,BCF_HL_FMT,tok->hdr_id) ) { - case BCF_HT_INT: tok->setter = &filters_set_format_int; break; - case BCF_HT_REAL: tok->setter = &filters_set_format_float; break; - case BCF_HT_STR: tok->setter = &filters_set_format_string; tok->is_str = 1; break; + case BCF_HT_INT: tok->setter = &filters_set_format_int; tok->ht_type = BCF_HT_INT; break; + case BCF_HT_REAL: tok->setter = &filters_set_format_float; tok->ht_type = BCF_HT_REAL; break; + case BCF_HT_STR: tok->setter = &filters_set_format_string; tok->ht_type = BCF_HT_STR; tok->is_str = 1; break; default: error("[%s:%d %s] FIXME\n", __FILE__,__LINE__,__FUNCTION__); } } @@ -3151,10 +3200,14 @@ static int filters_init1(filter_t 
*filter, char *str, int len, token_t *tok) else { if ( bcf_hdr_id2type(filter->hdr,BCF_HL_INFO,tok->hdr_id) == BCF_HT_FLAG ) + { tok->setter = filters_set_info_flag; + tok->ht_type = BCF_HT_INT; + } else { - if ( bcf_hdr_id2type(filter->hdr,BCF_HL_INFO,tok->hdr_id) == BCF_HT_STR ) tok->is_str = 1; + tok->ht_type = bcf_hdr_id2type(filter->hdr,BCF_HL_INFO,tok->hdr_id); + if ( tok->ht_type == BCF_HT_STR ) tok->is_str = 1; if ( bcf_hdr_id2number(filter->hdr,BCF_HL_INFO,tok->hdr_id)==1 ) tok->setter = filters_set_info; else @@ -3186,6 +3239,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) { tok->setter = &filters_set_alt_string; tok->is_str = 1; + tok->ht_type = BCF_HT_STR; tok->tag = strdup(tmp.s); free(tmp.s); filter_add_used_tag(filter,NULL,tok->tag); @@ -3196,6 +3250,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) filter->max_unpack |= BCF_UN_FMT; tok->setter = &filters_set_an; tok->tag = strdup("AN"); + tok->ht_type = BCF_HT_INT; free(tmp.s); return 0; } @@ -3204,6 +3259,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) filter->max_unpack |= BCF_UN_FMT; tok->setter = &filters_set_ac; tok->tag = strdup("AC"); + tok->ht_type = BCF_HT_INT; free(tmp.s); return 0; } @@ -3212,6 +3268,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) filter->max_unpack |= max_ac_an_unpack(filter->hdr); tok->setter = &filters_set_mac; tok->tag = strdup("MAC"); + tok->ht_type = BCF_HT_INT; free(tmp.s); return 0; } @@ -3220,6 +3277,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) filter->max_unpack |= max_ac_an_unpack(filter->hdr); tok->setter = &filters_set_af; tok->tag = strdup("AF"); + tok->ht_type = BCF_HT_REAL; free(tmp.s); return 0; } @@ -3228,6 +3286,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) filter->max_unpack |= max_ac_an_unpack(filter->hdr); tok->setter = &filters_set_maf; tok->tag 
= strdup("MAF"); + tok->ht_type = BCF_HT_REAL; free(tmp.s); return 0; } @@ -3236,6 +3295,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) filter->max_unpack |= BCF_UN_STR; tok->setter = &filters_set_ilen; tok->tag = strdup("ILEN"); + tok->ht_type = BCF_HT_INT; free(tmp.s); return 0; } @@ -3258,7 +3318,10 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) filter->status |= FILTER_ERR_UNKN_TAGS; filter_add_undef_tag(filter,tmp.s); } + tok->ht_type = BCF_HT_REAL; } + else + tok->ht_type = BCF_HT_INT; tok->is_constant = 1; if ( tmp.s ) free(tmp.s); @@ -3412,6 +3475,33 @@ static void perl_destroy(filter_t *filter) #endif } +// A very rudimentary heuristics to determine type, e.g. STR_TAG={} implies {str}. +// Throws an error on anything more complex and asks for an explicit type. +static void determine_ext_types(filter_t *filter, int ntok, token_t *tok) +{ + int i; + for (i=0; istr); + for (i=0; iext[j]!=-1 && filter->ext[j]!=tok[i].ht_type ) + error("[%s:%d %s] FIXME: this should not happen %d vs %d, iext=%d\n",__FILE__,__LINE__,__FUNCTION__,filter->ext[j],tok[i].ht_type,j); + filter->ext[j] = tok[i].ht_type; + if ( tok[i].ht_type==BCF_HT_STR ) tok[i].is_str = 1; + } +} + // Parse filter expression and convert to reverse polish notation. 
Dijkstra's shunting-yard algorithm static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error) @@ -3459,6 +3549,13 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error memset(&ops[nops-1],0,sizeof(token_t)); nops--; } + else if ( ret==TOK_EXT ) // external value + { + nout++; + hts_expand0(token_t, nout, mout, out); + filters_init1_ext(filter, tmp, len, &out[nout-1]); + tmp += len; + } else if ( ret!=TOK_VAL ) // one of the operators { // detect unary minus: replace -value with -1*(value) @@ -3569,12 +3666,12 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error hts_expand0(token_t, nops, mops, ops); ops[nops-1].tok_type = ret; } - else if ( !len ) + else if ( !len ) // all tokens read or an error { if ( *tmp && !isspace(*tmp) ) error("Could not parse the expression: [%s]\n", str); break; // all tokens read } - else // annotation name or filtering value + else // TOK_VAL: annotation name or value { nout++; hts_expand0(token_t, nout, mout, out); @@ -3604,10 +3701,13 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error return filter; } + // Determine types of external variables from the context + determine_ext_types(filter,nout,out); + // In the special cases of TYPE and FILTER the BCF header IDs are yet unknown. Walk through the // list of operators and convert the strings (e.g. "PASS") to BCF ids. The string value token must be // just before or after the FILTER token and they must be followed with a comparison operator. - // At this point we also initialize regex expressions which, in RPN, must preceed the LIKE/NLIKE operator. + // At this point we also initialize regex expressions which, in RPN, must precede the LIKE/NLIKE operator. // Additionally, treat "." as missing value rather than a string in numeric equalities; that // @file is only used with ID; etc. // This code is fragile: improve me. 
@@ -3641,7 +3741,7 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error int set_missing = 0; if ( out[k].hdr_id>0 ) { - int type = bcf_hdr_id2type(filter->hdr,out[k].tag_type,out[k].hdr_id); + int type = bcf_hdr_id2type(filter->hdr,out[k].hl_type,out[k].hdr_id); if ( type==BCF_HT_INT ) set_missing = 1; else if ( type==BCF_HT_REAL ) set_missing = 1; } @@ -3731,7 +3831,7 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error else if ( !strcasecmp(out[ival].key,"r") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]=0; } // r continue; } - if ( out[i].tag_type==BCF_HL_FLT ) + if ( out[i].hl_type==BCF_HL_FLT ) { if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str); int itok = i, ival; @@ -3845,6 +3945,7 @@ void filter_destroy(filter_t *filter) } for (i=0; inundef_tag; i++) free(filter->undef_tag[i]); for (i=0; inused_tag; i++) free(filter->used_tag[i]); + free(filter->ext); free(filter->undef_tag); free(filter->used_tag); free(filter->cached_GT.buf); @@ -3858,6 +3959,37 @@ void filter_destroy(filter_t *filter) free(filter); } +int filter_test_ext(filter_t *filter, bcf1_t *rec, const uint8_t **samples, const void **ext) +{ + if ( !filter->n_ext ) + return filter_test(filter,rec,samples); + + int i; + for (i=0; infilters; i++) + { + token_t *tok = &filter->filters[i]; + if ( !tok->iext ) continue; + if ( !ext[tok->iext-1] ) + { + tok->is_missing = 1; + tok->nvalues = 0; + if ( filter->ext[tok->iext-1]==BCF_HT_STR ) tok->str_value.l = 0; + continue; + } + tok->is_missing = 0; + tok->nvalues = 1; + if ( filter->ext[tok->iext-1]==BCF_HT_STR ) + { + tok->str_value.l = 0; + kputs((const char*)ext[tok->iext-1],&tok->str_value); + tok->nvalues = tok->str_value.l; + } + else if ( filter->ext[tok->iext-1]==BCF_HT_INT ) tok->values[0] = *((const int*)ext[tok->iext-1]); + else if ( filter->ext[tok->iext-1]==BCF_HT_REAL ) tok->values[0] = *((const 
float*)ext[tok->iext-1]); + } + return filter_test(filter,rec,samples); +} + int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples) { if ( filter->status != FILTER_OK ) error("Error: the caller did not check the filter status\n"); @@ -3994,7 +4126,11 @@ int filter_max_unpack(filter_t *flt) { return flt->max_unpack; } - +const int *filter_ext_types(filter_t *filter, int *n_ext) +{ + *n_ext = filter->n_ext; + return filter->ext; +} const double *filter_get_doubles(filter_t *filter, int *nval, int *nval1) { token_t *tok = filter->flt_stack[0]; diff --git a/filter.h b/filter.h index cc60d6b96..d6a8e0893 100644 --- a/filter.h +++ b/filter.h @@ -1,6 +1,6 @@ /* filter.h -- filter expressions. - Copyright (C) 2013-2023 Genome Research Ltd. + Copyright (C) 2013-2024 Genome Research Ltd. Author: Petr Danecek @@ -31,8 +31,9 @@ typedef struct _filter_t filter_t; /** * @hdr: BCF header file - * @str: see the bcftools filter command help for description - * + * @str: see the bcftools filter command help for description. + * See also the extended usage described in filter_test_ext(), + * intended for programmatic access * Same as filter_parse() but exits on errors */ filter_t *filter_init(bcf_hdr_t *hdr, const char *str); @@ -48,6 +49,22 @@ void filter_destroy(filter_t *filter); */ int filter_test(filter_t *filter, bcf1_t *rec, const uint8_t **samples); +/** + * filter_test_ext() - same as filter_test(), but sets some of the terms + * on the fly. An expression initialized with, say, + * "STR_TAG={} | INT_TAG={} | FLT_TAG={}" takes three + * additional pointer arguments which are expected to point to memory + * area occupied by the appropriate type, see also filter_ext_types(). + * The type determination is not fool-proof, in such case the type can + * be given explicitly as eg "TAG={str}". + * @ext: array of size 'n_ext' occupied with pointers to the data types + * inferred from the expression given at the time of initialization. 
+ * The pointers set to NULL will be treated as if missing value "." + * was given. + * @n_ext: the size of 'ext' array + */ +int filter_test_ext(filter_t *filter, bcf1_t *rec, const uint8_t **samples, const void **ext); + /** * filter_set_samples() - restrict filtering expression to samples. * Call after filter_init(). @@ -60,9 +77,14 @@ void filter_set_samples(filter_t *filter, const uint8_t *samples); */ const double *filter_get_doubles(filter_t *filter, int *nval, int *nval1); -void filter_expression_info(FILE *fp); int filter_max_unpack(filter_t *filter); +/** + * filter_ext_types() - returns the number and BCF_HT_* types of external values + * found in the filtering expression + */ +const int *filter_ext_types(filter_t *filter, int *n_ext); + /** * Same as filter_init() but may not exit on some type of errors. The caller * must check if the returned value is not NULL and if the consequent call diff --git a/gff.c b/gff.c index e56ceeda1..283ced331 100644 --- a/gff.c +++ b/gff.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -521,8 +522,8 @@ static void gff_parse_gene(gff_t *gff, const char *line, ftr_t *ftr) gene->name = strdup(aux->gene_ids.str[gene_id]); // Name= field is not present, use the gene ID instead } -// Returns 0 for exons,CDS,UTRs to indiciate these need to be pruned later and regidx built on them, -// or -1 to indiciate the structure needs not be saved (either because of an error or because saved +// Returns 0 for exons,CDS,UTRs to indicate these need to be pruned later and regidx built on them, +// or -1 to indicate the structure needs not be saved (either because of an error or because saved // as transcript or gene.) 
static int gff_parse_line(gff_t *gff, char *line, ftr_t *ftr) { @@ -1063,7 +1064,7 @@ int gff_parse(gff_t *gff) INC_NWARN(wrong_phase); INC_NWARN(overlapping_cds); if ( nwarn > 0 ) - fprintf(stderr,"Warning: %d warnings were suppressed, run with `--verbose 2` to see them all\n",nwarn); + fprintf(stderr,"Warning: %d warnings were suppressed, increase verbosity to see them all\n",nwarn); } if ( gff->dump_fname ) gff_dump(gff, gff->dump_fname); diff --git a/gff.h b/gff.h index 46cbb267d..afa945e81 100644 --- a/gff.h +++ b/gff.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2023 Genome Research Ltd. + Copyright (c) 2023-2024 Genome Research Ltd. Author: Petr Danecek @@ -36,7 +36,7 @@ Read about transcript types here http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html - http://www.ensembl.org/info/genome/variation/predicted_data.html + https://www.ensembl.org/info/genome/variation/prediction/predicted_data.html https://www.gencodegenes.org/pages/biotypes.html List of supported biotypes diff --git a/mcall.c b/mcall.c index deb2f33e5..13383787e 100644 --- a/mcall.c +++ b/mcall.c @@ -444,7 +444,7 @@ void mcall_destroy(call_t *call) // qual calculation is not affected. // Missing values are replaced by generic likelihoods when X (unseen allele) is // present. -// NB: While the -m callig model uses the pdgs in canonical order, +// NB: While the -m calling model uses the pdgs in canonical order, // the original samtools -c calling code uses pdgs in reverse order (AA comes // first, RR last). // NB: Ploidy is not taken into account here, which is incorrect. 
@@ -1495,7 +1495,7 @@ int mcall(call_t *call, bcf1_t *rec) // If available, take into account reference panel AFs if ( call->prior_AN && bcf_get_info_int32(call->hdr, rec, call->prior_AN ,&call->ac, &call->nac)==1 ) { - int an = call->ac[0]; // number of alleles total, procede only if not zero; reuse call->ac + int an = call->ac[0]; // number of alleles total, proceed only if not zero; reuse call->ac if ( an > 0 && bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals_ori-1 ) // number of ALT alleles { int ac0 = an; // this will become the number of REFs diff --git a/misc/gff2gff.py b/misc/gff2gff.py index 9a6f85ea0..2c26a6ca2 100755 --- a/misc/gff2gff.py +++ b/misc/gff2gff.py @@ -152,7 +152,7 @@ def main(): ''' Ensembl gff should have one gene and one transcript per "feature group" Then, can have multiple CDS/exons - read in from the input gff, one FeatueGroup instance has one gene, one transcript and (potentially) + read in from the input gff, one FeatureGroup instance has one gene, one transcript and (potentially) multiple CDS/exon Exons aren't printed, as not needed bt bcftools csq ''' diff --git a/misc/plot-vcfstats b/misc/plot-vcfstats index 990a56ffd..ae87ea241 100755 --- a/misc/plot-vcfstats +++ b/misc/plot-vcfstats @@ -672,7 +672,7 @@ sub parse_vcfstats1 for my $b (keys %{$dat{$a}}) { # Merging multiple vcfstats files. Honestly, this is quite hacky. 
- if ( !exists($$opts{dat}{$a}{$b}) ) { $$opts{dat}{$a}{$b} = $dat{$a}{$b}; next; } # copy all, first occurance + if ( !exists($$opts{dat}{$a}{$b}) ) { $$opts{dat}{$a}{$b} = $dat{$a}{$b}; next; } # copy all, first occurrence if ( $a eq 'ID' ) { merge_id($opts,$$opts{dat}{$a},$dat{$a},$b); } elsif ( ref($dat{$a}{$b}) ne 'ARRAY' ) { $$opts{dat}{$a}{$b} += $dat{$a}{$b} unless $b eq 'number of samples:'; } # SN, Summary numbers, do not sum sample counts @@ -749,7 +749,7 @@ sub init_plots img_fmt = '$$opts{img_fmt}' - # Use logarithimic X axis for allele frequency plots + # Use logarithmic X axis for allele frequency plots af_xlog = 0 # Plots to generate, set to 0 to disable diff --git a/mpileup.c b/mpileup.c index 2953c8806..943e0f6f6 100644 --- a/mpileup.c +++ b/mpileup.c @@ -1,6 +1,6 @@ /* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools - Copyright (C) 2008-2023 Genome Research Ltd. + Copyright (C) 2008-2024 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. 
Author: Heng Li @@ -612,7 +612,7 @@ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end) } } } - return 0; + return ret; } static int mpileup(mplp_conf_t *conf) @@ -652,8 +652,12 @@ static int mpileup(mplp_conf_t *conf) } } nregs = regidx_nregs(conf->reg); - conf->reg_itr = regitr_init(conf->reg); - regitr_loop(conf->reg_itr); // region iterator now positioned at the first region + if ( nregs ) + { + // the regions list can be empty, see #2250 + conf->reg_itr = regitr_init(conf->reg); + regitr_loop(conf->reg_itr); // region iterator now positioned at the first region + } } // read the header of each file in the list and initialize data @@ -699,7 +703,7 @@ static int mpileup(mplp_conf_t *conf) i--; continue; } - if (conf->reg) { + if (conf->reg && nregs) { hts_idx_t *idx = sam_index_load(conf->mplp_data[i]->fp, conf->files[i]); if (idx == NULL) { fprintf(stderr, "[%s] fail to load index for %s\n", __func__, conf->files[i]); @@ -938,6 +942,7 @@ static int mpileup(mplp_conf_t *conf) // Run mpileup for multiple regions + int ret = 0; if ( nregs ) { int ireg = 0; @@ -966,12 +971,18 @@ static int mpileup(mplp_conf_t *conf) bam_mplp_reset(conf->iter); } } - mpileup_reg(conf,conf->reg_itr->beg,conf->reg_itr->end); + ret = mpileup_reg(conf,conf->reg_itr->beg,conf->reg_itr->end); + if ( ret<0 ) break; } while ( regitr_loop(conf->reg_itr) ); } - else - mpileup_reg(conf,0,UINT32_MAX); + else if ( !conf->reg ) + ret = mpileup_reg(conf,0,UINT32_MAX); + if ( ret<0 ) + { + fprintf(stderr, "[%s] failed to read from input file\n", __func__); + exit(EXIT_FAILURE); + } flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, NULL); @@ -1163,7 +1174,7 @@ static void list_annotations(FILE *fp) "\n" "FORMAT annotation tags available (\"FORMAT/\" prefix is optional):\n" "\n" - " FORMAT/AD .. Allelic depth (Number=R,Type=Integer)\n" + "* FORMAT/AD .. Allelic depth (Number=R,Type=Integer)\n" " FORMAT/ADF .. 
Allelic depths on the forward strand (Number=R,Type=Integer)\n" " FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n" " FORMAT/DP .. Number of high-quality bases (Number=1,Type=Integer)\n" @@ -1213,7 +1224,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) "\n" "Input options:\n" " -6, --illumina1.3+ Quality is in the Illumina-1.3+ encoding\n" - " -A, --count-orphans Do not discard anomalous read pairs\n" + " -A, --count-orphans Include anomalous read pairs, with flag PAIRED but not PROPER_PAIR set\n" " -b, --bam-list FILE List of input BAM filenames, one per line\n" " -B, --no-BAQ Disable BAQ (per-Base Alignment Quality)\n" " -C, --adjust-MQ INT Adjust mapping quality [0]\n" @@ -1367,7 +1378,7 @@ int main_mpileup(int argc, char *argv[]) mplp.n_threads = 0; mplp.bsmpl = bam_smpl_init(); // the default to be changed in future, see also parse_format_flag() - mplp.fmt_flag = B2B_INFO_BQBZ|B2B_INFO_IDV|B2B_INFO_IMF|B2B_INFO_MQ0F|B2B_INFO_MQBZ|B2B_INFO_MQSBZ|B2B_INFO_RPBZ|B2B_INFO_SCBZ|B2B_INFO_SGB|B2B_INFO_VDB; + mplp.fmt_flag = B2B_INFO_BQBZ|B2B_INFO_IDV|B2B_INFO_IMF|B2B_INFO_MQ0F|B2B_INFO_MQBZ|B2B_INFO_MQSBZ|B2B_INFO_RPBZ|B2B_INFO_SCBZ|B2B_INFO_SGB|B2B_INFO_VDB|B2B_FMT_AD; mplp.max_read_len = 500; mplp.ambig_reads = B2B_DROP; mplp.indel_win_size = 110; diff --git a/mpileup_bench/README b/mpileup_bench/README index 41f2d4aec..3de459789 100644 --- a/mpileup_bench/README +++ b/mpileup_bench/README @@ -101,3 +101,25 @@ false-negatives. This provides a way to data mine things much more carefully. We can further drill down on these cured/caused variants by subdividing into deletions vs insertions, or filtering to high quality only. + +----------------------------------------------------------------------------- + +There is also run_multi.sh which can call 3 samples together (eg a +trio), splits them, and then checks the calling works per sample when +in the presence of other samples. This also produces PNG files. 
+ +Note the above script requires a working bcftools plugins system. +This won't be in your path by default if building from a source tree, +so you may need to manually set BCFTOOLS_PLUGINS=$srcdir/plugins first. + +For example: + +BENCHDIR=$bcftools_src/mpileup_bench \ +BCFTOOLS=$bcftools_src/bcftools \ +$bcftools_src/mpileup_bench/run_multi.sh \ +HG00[234].illumima.bam illumina.out -X illumina + +See the gnuplot example script at the end for ideas on how to merge +multiple runs together to visualise the impact of a change. +E.g. "devel" vs a local modification. + diff --git a/mpileup_bench/compare_vcf_simple.sh b/mpileup_bench/compare_vcf_simple.sh index f7994eec1..0d43b3f46 100755 --- a/mpileup_bench/compare_vcf_simple.sh +++ b/mpileup_bench/compare_vcf_simple.sh @@ -266,7 +266,7 @@ then v12_indel_hq=`$bcftools view -H -i "TYPE='indel' && QUAL >= $qual && QUAL < $qual_max" $v2.isec/0003.vcf|wc -l` v12_indel_hq2=`$bcftools view -H -i "TYPE='indel' && QUAL >= $qual" $v2.isec/0003.vcf|wc -l` - # Total number of SNPs minue true HQ snps we call + # Total number of SNPs minus true HQ snps we call v12_snp_hq2=`expr $tot_snp - $v12_snp_hq2` v12_indel_hq2=`expr $tot_indel - $v12_indel_hq2` printf "$v2\t$qual\t$v12_snp_hq $v2_snp_hq\t$v12_indel_hq $v2_indel_hq\t$tot_snp $v12_snp_hq2 $v2_snp_hq2\t$tot_indel $v12_indel_hq2 $v2_indel_hq2\n" diff --git a/mpileup_bench/run_multi.sh b/mpileup_bench/run_multi.sh new file mode 100755 index 000000000..bc72c05e4 --- /dev/null +++ b/mpileup_bench/run_multi.sh @@ -0,0 +1,190 @@ +#!/bin/sh + +# Execute bcftools on HG002, HG003 and HG004 samples. This could be done as +# a trio with pedigree information, but for now we just do them without +# additional data so we can evaluate calling in the presence of other samples. +# +# We have the GIAB benchmark files, but the HG002-4 BAMs need to be passed in +# as inputs as this may vary by instrument. + +# Standard files, downloaded by the get_data.sh script. 
+# They can be overridden on the command line before running the script +# eg "BED=hard_regions.bed ./run_mpileup.sh [opts]" + +# Note if in the shell you do: === () { /bin/true; } +# then you can cut and paste the === lines as the first part becomes a +# semi-colon capable comment. + +HREF38=${HREF38:-/nfs/srpipe_references/references/Human/GRCh38_full_analysis_set_plus_decoy_hla/all/fasta/Homo_sapiens.GRCh38_full_analysis_set_plus_decoy_hla.fa} +TRUTH002=${TRUTH002:-HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz} +TRUTH003=${TRUTH003:-HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz} +TRUTH004=${TRUTH004:-HG004_GRCh38_1_22_v4.2.1_benchmark.vcf.gz} +BED002=${BED002:-HG002_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed} +BED003=${BED003:-HG003_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed} +BED004=${BED004:-HG004_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed} +bcftools=${BCFTOOLS:-bcftools} +BENCHDIR=${BENCHDIR:-.} + +if [ $# -lt 5 ] +then + echo Usage: run_multi.sh HG002.bam HG003.bam HG004.bam region outdir [bcftools-arguments] + echo + echo Internal variables may be overridden on the command line, eg + echo BCFTOOLS=~/bcftools/bcftools.myPR ./run_multi.sh [args] + echo + echo Current settings: + echo BCFTOOLS=${bcftools} + echo HREF38=${HREF38} + echo TRUTH002=${TRUTH002} + echo TRUTH003=${TRUTH003} + echo TRUTH004=${TRUTH004} + echo BED002=${BED002} + echo BED003=${BED003} + echo BED004=${BED004} + echo BENCHDIR=${BENCHDIR} + exit 1 +fi + + +file2=$1 +file3=$2 +file4=$3 +region=$4 +dir=$5 +shift 5 +args=${@+"$@"} + +mkdir -p $dir + +# Change to "if false" if we wish to adjust the plots only without recalling. 
+if true +then + +# Do the actual variant calling +echo "===; Running $bcftools mpileup $args -a AD --fasta-ref $HREF38 -r $region $file2 $file3 $file4 | bcftools call -vm -" +eval $bcftools mpileup $args -a AD --fasta-ref $HREF38 -r $region $file2 $file3 $file4 2>$dir/bcftools.mpileup.out | $bcftools call -vm - > $dir/bcftools.vcf 2>$dir/bcftools.call.out + +# Split into the input samples +echo +echo "===; $bcftools +split $dir/bcftools.vcf -o $dir" +eval $bcftools +split $dir/bcftools.vcf -o $dir + +set -- `$bcftools view -h $dir/bcftools.vcf|tail -1|cut -f 10-` +vcf2=$1.vcf +vcf3=$2.vcf +vcf4=$3.vcf + +# Remove non-sample data +echo +echo "===; $bcftools norm -m -both -f $HREF38 $dir/$vcf2 | grep -v '[.0]/[.0]:' > $dir/_$vcf2" +eval $bcftools norm -m -both -f $HREF38 $dir/$vcf2 | grep -v '[.0]/[.0]:' > $dir/_$vcf2 +echo "===; $bcftools norm -m -both -f $HREF38 $dir/$vcf3 | grep -v '[.0]/[.0]:' > $dir/_$vcf3" +eval $bcftools norm -m -both -f $HREF38 $dir/$vcf3 | grep -v '[.0]/[.0]:' > $dir/_$vcf3 +echo "===; $bcftools norm -m -both -f $HREF38 $dir/$vcf4 | grep -v '[.0]/[.0]:' > $dir/_$vcf4" +eval $bcftools norm -m -both -f $HREF38 $dir/$vcf4 | grep -v '[.0]/[.0]:' > $dir/_$vcf4 + +# A primary evaluation. +# The key thing here is it leaves behind the .isec directory with the +# intersection of the truth and call sets. 
+echo +echo "===; QUAL=30 NORM=1 ${BENCHDIR}/compare_vcf_simple.sh $TRUTH002 $dir/_$vcf2 '' $BED002 $region" +QUAL=30 NORM=1 ${BENCHDIR}/compare_vcf_simple.sh $TRUTH002 $dir/_$vcf2 "" $BED002 $region +echo "===; QUAL=30 NORM=1 ${BENCHDIR}/compare_vcf_simple.sh $TRUTH003 $dir/_$vcf3 '' $BED003 $region" +QUAL=30 NORM=1 ${BENCHDIR}/compare_vcf_simple.sh $TRUTH003 $dir/_$vcf3 "" $BED003 $region +echo "===; QUAL=30 NORM=1 ${BENCHDIR}/compare_vcf_simple.sh $TRUTH004 $dir/_$vcf4 '' $BED004 $region" +QUAL=30 NORM=1 ${BENCHDIR}/compare_vcf_simple.sh $TRUTH004 $dir/_$vcf4 "" $BED004 $region + + +# Produce a .plot file for use in gnuplot, along with a basic summary too. +echo "===; ${BENCHDIR}/plot_isec.pl $dir/_$vcf2.isec indel > $dir/2.all" +${BENCHDIR}/plot_isec.pl $dir/_$vcf2.isec indel > $dir/2.out +grep ALL $dir/2.out > $dir/2.all +grep INS $dir/2.out > $dir/2.ins +grep DEL $dir/2.out > $dir/2.del +awk 'BEGIN {n=0} $6 >= n {print;n=50*(1+int($6/50))}' $dir/2.all | cut -c 1-28|head -20 + +echo "===; ${BENCHDIR}/plot_isec.pl $dir/_$vcf3.isec indel > $dir/3.all" +${BENCHDIR}/plot_isec.pl $dir/_$vcf3.isec indel > $dir/3.out +grep ALL $dir/3.out > $dir/3.all +grep INS $dir/3.out > $dir/3.ins +grep DEL $dir/3.out > $dir/3.del +awk 'BEGIN {n=0} $6 >= n {print;n=50*(1+int($6/50))}' $dir/3.all | cut -c 1-28|head -20 + +echo "===; ${BENCHDIR}/plot_isec.pl $dir/_$vcf4.isec indel > $dir/4.all" +${BENCHDIR}/plot_isec.pl $dir/_$vcf4.isec indel > $dir/4.out +grep ALL $dir/4.out > $dir/4.all +grep INS $dir/4.out > $dir/4.ins +grep DEL $dir/4.out > $dir/4.del +awk 'BEGIN {n=0} $6 >= n {print;n=50*(1+int($6/50))}' $dir/4.all | cut -c 1-28|head -20 + +else + +# Short cut to replot results without rerunning the calling +set -- `$bcftools view -h $dir/bcftools.vcf|tail -1|cut -f 10-` +vcf2=$1.vcf +vcf3=$2.vcf +vcf4=$3.vcf + +fi + +# Generate GNU plots +echo === Running gnuplot to create "HG00[234]*.png" +gnuplot <str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); return 0; } -void 
hdr_append(args_t *args, char *fmt) -{ - int i; - for (i=0; inpop; i++) - bcf_hdr_printf(args->out_hdr, fmt, args->pop[i].suffix,*args->pop[i].name ? " in " : "",args->pop[i].name); -} + +// This is implemented as a macro so the compiler can properly validate the +// printf format string. +#define hdr_append(args, fmt) \ +do { \ + int i; \ + for (i=0; inpop; i++) \ + bcf_hdr_printf(args->out_hdr, fmt, args->pop[i].suffix,*args->pop[i].name ? " in " : "",args->pop[i].name); \ +} while (0) + + int parse_func_pop(args_t *args, pop_t *pop, char *tag_expr, char *expr) { pop->nftf++; diff --git a/plugins/mendelian2.c b/plugins/mendelian2.c index cb407a3bd..3c3290ade 100644 --- a/plugins/mendelian2.c +++ b/plugins/mendelian2.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2015-2023 Genome Research Ltd. + Copyright (c) 2015-2024 Genome Research Ltd. Author: Petr Danecek @@ -70,7 +70,9 @@ typedef struct nfail, // number of -i/-e filters failed nmiss, // number of genotypes with a missing allele in the trio ngood, // number of good genotypes (after any -i/-e filters applied) - nmerr; // number of mendelian errors + nmerr, // number of mendelian errors + ngood_alt, // number of error-free non-ref genotypes + nrule; // number of genotypes with no rule to apply } stats_t; @@ -425,7 +427,7 @@ static void init_data(args_t *args) args->rules = regidx_init(args->rules_fname, parse_rules, NULL, sizeof(rule_t), args); else args->rules = init_rules(args, args->rules_str); - if ( !args->rules ) error("Coud not parse the Mendelian rules\n"); + if ( !args->rules ) error("Could not parse the Mendelian rules\n"); args->itr = regitr_init(args->rules); args->rule = (rule_t*) malloc(sizeof(*args->rule)*args->nsex_id); @@ -437,9 +439,9 @@ static void init_data(args_t *args) args->trio = (trio_t*) calloc(1,sizeof(trio_t)); list = hts_readlist(args->pfm, 0, &n); if ( n!=3 ) error("Expected three sample names with -t\n"); - args->trio[0].idx[iKID] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, 
list[0]); - args->trio[0].idx[iDAD] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[1]); - args->trio[0].idx[iMOM] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[2]); + const int ped_idx[3] = {2,1,0}; // sample order is different on the command line (P,F,M) and in the code (M,F,P) + for (i=0; i<3; i++) + args->trio[0].idx[i] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[ped_idx[i]]); if ( args->trio[0].idx[iKID] < 0 ) { if ( strlen(list[0])>3 && !strncasecmp(list[0],"1X:",3) ) @@ -453,10 +455,10 @@ static void init_data(args_t *args) args->trio[0].sex_id = iMOM; } } - for (i=0; itrio[0].idx[i] < 0 ) error("The sample is not present: %s\n", list[i]); - free(list[i]); + if ( args->trio[0].idx[i] < 0 ) error("The sample is not present: %s\n", list[ped_idx[i]]); + free(list[ped_idx[i]]); } free(list); } @@ -627,7 +629,7 @@ static int collect_stats(args_t *args, bcf1_t *rec) continue; } rule_t *rule = &args->rule[trio->sex_id]; - if ( !rule->inherits ) continue; // should have some stats for this? 
+ if ( !rule->inherits ) { trio->stats.nrule++; continue; } uint64_t kid1, kid2, parent, mom, dad; int nal = parse_gt(&args->gt_arr[ngt*trio->idx[iKID]],ngt,&kid1,&kid2); if ( nal < rule->ploidy ) { ret |= HAS_MISS; trio->stats.nmiss++; continue; } @@ -639,7 +641,13 @@ static int collect_stats(args_t *args, bcf1_t *rec) } nal = parse_gt(&args->gt_arr[ngt*trio->idx[j]],ngt,&parent,&parent); if ( !nal ) { ret |= HAS_MISS; trio->stats.nmiss++; continue; } - if ( parent&kid1 ) { ret |= HAS_GOOD; trio->stats.ngood++; continue; } + if ( parent&kid1 ) + { + ret |= HAS_GOOD; + trio->stats.ngood++; + if ( parent!=1 || parent!=kid1 ) trio->stats.ngood_alt++; + continue; + } ret |= HAS_MERR; trio->stats.nmerr++; trio->has_merr = 1; @@ -648,7 +656,14 @@ static int collect_stats(args_t *args, bcf1_t *rec) } int nal_mom = parse_gt(&args->gt_arr[ngt*trio->idx[iMOM]],ngt,&mom,&mom); int nal_dad = parse_gt(&args->gt_arr[ngt*trio->idx[iDAD]],ngt,&dad,&dad); - if ( (kid1&dad && kid2&mom) || (kid1&mom && kid2&dad) ) { ret |= HAS_GOOD; trio->stats.ngood++; continue; } // both children's alleles phased + if ( (kid1&dad && kid2&mom) || (kid1&mom && kid2&dad) ) + { + // both children's alleles phased + ret |= HAS_GOOD; + trio->stats.ngood++; + if ( dad!=1 || mom!=1 || (kid1|kid2)!=1 ) trio->stats.ngood_alt++; + continue; + } if ( !nal_mom || !nal_dad ) { ret |= HAS_MISS; trio->stats.nmiss++; } // one or both parents missing if ( !nal_mom && !nal_dad ) continue; // both parents missing if ( !nal_mom && ((kid1|kid2)&dad) ) continue; // one parent missing but the kid is consistent with the other @@ -738,10 +753,17 @@ static void print_stats(args_t *args) int i; fprintf(log_fh,"# Per-trio stats, each column corresponds to one trio. 
List of trios is below.\n"); + fprintf(log_fh,"# The meaning of per-trio stats is the same as described above, ngood_alt is\n"); + fprintf(log_fh,"# the number of good genotypes with at least one non-reference allele, and is\n"); + fprintf(log_fh,"# included in the ngood counter\n"); fprintf(log_fh,"ngood"); for (i=0; intrio; i++) fprintf(log_fh,"\t%d",args->trio[i].stats.ngood); fprintf(log_fh,"\n"); + fprintf(log_fh,"ngood_alt"); + for (i=0; intrio; i++) fprintf(log_fh,"\t%d",args->trio[i].stats.ngood_alt); + fprintf(log_fh,"\n"); + fprintf(log_fh,"nmerr"); for (i=0; intrio; i++) fprintf(log_fh,"\t%d",args->trio[i].stats.nmerr); fprintf(log_fh,"\n"); diff --git a/plugins/parental-origin.c b/plugins/parental-origin.c index e1271d4ec..e412beee5 100644 --- a/plugins/parental-origin.c +++ b/plugins/parental-origin.c @@ -1,19 +1,19 @@ /* The MIT License - Copyright (c) 2019-2021 Genome Research Ltd. + Copyright (c) 2019-2024 Genome Research Ltd. Author: Petr Danecek - + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE @@ -74,7 +74,7 @@ typedef struct double ppat,pmat; // method 1: probability of paternal/maternal origin int ntest; // number of informative sites int nmat, npat; // method 2: number of pat/mat sites based on simple ad[0] < ad[1] comparison - double min_pbinom; // minimum binomial probability of paternal hets + double min_pbinom; // minimum binomial probability of paternal hets } args_t; @@ -87,7 +87,7 @@ const char *about(void) static const char *usage_text(void) { - return + return "\n" "About: Determine parental origin of a CNV region\n" "Usage: bcftools +parental-origin [Plugin Options]\n" @@ -125,7 +125,7 @@ static void init_data(args_t *args) if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "GT"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) error("Error: the tag FORMAT/GT is not present in %s\n", args->fname); - if ( args->filter_str ) + if ( args->filter_str ) args->filter = filter_init(args->hdr, args->filter_str); int i, n = 0; @@ -151,16 +151,6 @@ static void destroy_data(args_t *args) bcf_sr_destroy(args->sr); free(args); } -static inline double calc_binom_two_sided(int na, int nb, double aprob) -{ - double prob = na > nb ? 2 * kf_betai(na, nb+1, aprob) : 2 * kf_betai(nb, na+1, aprob); - if ( prob > 1 ) prob = 1; - return prob; -} -static inline double calc_binom_one_sided(int na, int nb, double aprob, int ge) -{ - return ge ? 
kf_betai(na, nb + 1, aprob) : kf_betai(nb, na + 1, 1 - aprob); -} static void process_record(args_t *args, bcf1_t *rec) { if ( rec->n_allele!=2 || bcf_get_variant_types(rec)!=VCF_SNP ) return; @@ -258,7 +248,7 @@ static void process_record(args_t *args, bcf1_t *rec) if ( args->cnv_type==CNV_DEL ) { if ( *dsgP!=0 && *dsgP!=2 ) return; // proband not a hom - if ( *dsgF == *dsgM ) return; // cannot distinguish between parents + if ( *dsgF == *dsgM ) return; // cannot distinguish between parents if ( !args->greedy ) { if ( *dsgF==1 && *dsgP==*dsgM ) return; // both parents have the proband's allele @@ -350,7 +340,7 @@ int run(int argc, char **argv) char *tmp; while ((c = getopt_long(argc, argv, "h?e:i:p:r:t:dgb:",loptions,NULL)) >= 0) { - switch (c) + switch (c) { case 'e': if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); @@ -366,7 +356,7 @@ int run(int argc, char **argv) case 'p': args->pfm = optarg; break; case 'd': args->debug = 1; break; case 'g': args->greedy = 1; break; - case 'b': + case 'b': args->min_pbinom = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -b %s\n", optarg); if ( args->min_pbinom<0 || args->min_pbinom>1 ) error("Expected value from the interval [0,1] with --min-binom-prob\n"); diff --git a/plugins/prune.c b/plugins/prune.c index 0bf644a9f..335638ab1 100644 --- a/plugins/prune.c +++ b/plugins/prune.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2017-2023 Genome Research Ltd. + Copyright (C) 2017-2024 Genome Research Ltd. 
Author: Petr Danecek @@ -194,21 +194,21 @@ static void init_data(args_t *args) args->ld_filter_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, args->ld_filter); args->vcfbuf = vcfbuf_init(args->hdr, args->ld_win); - if ( args->ld_max_set[VCFBUF_LD_IDX_R2] ) vcfbuf_set_opt(args->vcfbuf,double,LD_MAX_R2,args->ld_max[VCFBUF_LD_IDX_R2]); - if ( args->ld_max_set[VCFBUF_LD_IDX_LD] ) vcfbuf_set_opt(args->vcfbuf,double,LD_MAX_LD,args->ld_max[VCFBUF_LD_IDX_LD]); - if ( args->ld_max_set[VCFBUF_LD_IDX_HD] ) vcfbuf_set_opt(args->vcfbuf,double,LD_MAX_HD,args->ld_max[VCFBUF_LD_IDX_HD]); + if ( args->ld_max_set[VCFBUF_LD_IDX_R2] ) vcfbuf_set(args->vcfbuf,LD_MAX_R2,args->ld_max[VCFBUF_LD_IDX_R2]); + if ( args->ld_max_set[VCFBUF_LD_IDX_LD] ) vcfbuf_set(args->vcfbuf,LD_MAX_LD,args->ld_max[VCFBUF_LD_IDX_LD]); + if ( args->ld_max_set[VCFBUF_LD_IDX_HD] ) vcfbuf_set(args->vcfbuf,LD_MAX_HD,args->ld_max[VCFBUF_LD_IDX_HD]); if ( args->rand_missing || (args->nsites_mode && !strcasecmp(args->nsites_mode,"rand")) ) { fprintf(stderr,"Using random seed: %d\n",args->rseed); hts_srand48(args->rseed); } - if ( args->rand_missing ) vcfbuf_set_opt(args->vcfbuf,int,LD_RAND_MISSING,1); + if ( args->rand_missing ) vcfbuf_set(args->vcfbuf,LD_RAND_MISSING,1); if ( args->nsites ) { - vcfbuf_set_opt(args->vcfbuf,int,VCFBUF_NSITES,args->nsites); - vcfbuf_set_opt(args->vcfbuf,char*,VCFBUF_NSITES_MODE,args->nsites_mode); + vcfbuf_set(args->vcfbuf,PRUNE_NSITES,args->nsites); + vcfbuf_set(args->vcfbuf,PRUNE_NSITES_MODE,args->nsites_mode); } - if ( args->af_tag ) vcfbuf_set_opt(args->vcfbuf,char*,VCFBUF_AF_TAG,args->af_tag); + if ( args->af_tag ) vcfbuf_set(args->vcfbuf,PRUNE_AF_TAG,args->af_tag); if ( args->filter_str ) args->filter = filter_init(args->hdr, args->filter_str); @@ -285,7 +285,7 @@ static void process(args_t *args) } } } - if ( filter ) vcfbuf_set_opt(args->vcfbuf,int,LD_FILTER1,1); + if ( filter ) vcfbuf_set(args->vcfbuf,LD_FILTER1,1); sr->buffer[0] = vcfbuf_push(args->vcfbuf, rec); flush(args,0); } 
diff --git a/plugins/remove-overlaps.c b/plugins/remove-overlaps.c index 0a081f5a0..2a62fa3eb 100644 --- a/plugins/remove-overlaps.c +++ b/plugins/remove-overlaps.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2017-2023 Genome Research Ltd. + Copyright (C) 2017-2024 Genome Research Ltd. Author: Petr Danecek @@ -34,6 +34,7 @@ #include #include #include +#include #include "bcftools.h" #include "vcfbuf.h" #include "filter.h" @@ -47,45 +48,62 @@ typedef struct char *filter_str; int filter_logic; // one of FLT_INCLUDE/FLT_EXCLUDE (-i or -e) vcfbuf_t *vcfbuf; - int argc, region_is_file, target_is_file, output_type, verbose, nrm, ntot, print_overlaps, rmdup, clevel; - char **argv, *region, *target, *fname, *output_fname; + int argc, region_is_file, target_is_file, output_type, verbose, nrm, ntot, clevel; + int reverse; + char **argv, *region, *target, *fname, *output_fname, *mark_expr, *mark_tag, *missing_expr; htsFile *out_fh; + BGZF *fh_bgzf; bcf_hdr_t *hdr; bcf_srs_t *sr; char *index_fn; - int write_index; + int write_index, record_cmd_line; + kstring_t kstr; } args_t; const char *about(void) { - return "Remove overlapping variants\n"; + return "Remove, list or mark overlapping variants\n"; } static const char *usage_text(void) { return "\n" - "About: Remove overlapping variants.\n" + "About: Remove, list or mark overlapping variants.\n" "\n" - "Usage: bcftools +remove-overlaps [Options]\n" + "Usage: bcftools +remove-overlaps [OPTIONS]\n" "Plugin options:\n" - " -d, --rm-dup remove only duplicate sites and remove them completely\n" - " -p, --print-overlaps do the opposite and print only overlapping sites\n" - " -v, --verbose print a list of removed sites\n" + " -M, --mark-tag TAG Mark -m sites with INFO/TAG\n" + " -m, --mark EXPR Mark (if also -M is present) or remove sites [overlap]\n" + " dup .. all duplicate sites\n" + " overlap .. overlapping sites\n" + " min(QUAL) .. 
sites with lowest QUAL until overlaps are resolved\n" + " --missing EXPR Value to use for missing tags with -m 'min(QUAL)'\n" + " 0 .. the default\n" + " DP .. heuristics, scale maximum QUAL value proportionally to INFO/DP\n" + " --reverse Apply the reverse logic, for example preserve duplicates instead of removing\n" "Standard options:\n" - " -e, --exclude EXPR exclude sites for which the expression is true\n" - " -i, --include EXPR include only sites for which the expression is true\n" - " -o, --output FILE write output to the FILE [standard output]\n" - " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n" - " -r, --regions REGION restrict to comma-separated list of regions\n" - " -R, --regions-file FILE restrict to regions listed in a file\n" - " -t, --targets REGION similar to -r but streams rather than index-jumps\n" - " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" - " -W, --write-index[=FMT] Automatically index the output files [off]\n" + " -e, --exclude EXPR Exclude sites for which the expression is true\n" + " -i, --include EXPR Include only sites for which the expression is true\n" + " -o, --output FILE Write output to the FILE [standard output]\n" + " -O, --output-type u|b|v|z|t[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level\n" + " t: plain list of sites (chr,pos), tz: compressed list [v]\n" + " -r, --regions REGION Restrict to comma-separated list of regions\n" + " -R, --regions-file FILE Restrict to regions listed in a file\n" + " -t, --targets REGION Similar to -r but streams rather than index-jumps\n" + " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n" + " --no-version Do not append version and command line to the header\n" + " -W, --write-index[=FMT] Automatically index the output files [off]\n" "\n"; } +// duh: FT_TAB_TEXT is 0 :-/ +static int is_text(int flag) +{ + if ( flag==FT_TAB_TEXT || flag==FT_GZ 
) return 1; + return 0; +} static void init_data(args_t *args) { args->sr = bcf_sr_init(); @@ -100,18 +118,30 @@ static void init_data(args_t *args) char wmode[8]; set_wmode(wmode,args->output_type,args->output_fname,args->clevel); - args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode); - if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); - if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - if ( init_index2(args->out_fh,args->hdr,args->output_fname,&args->index_fn, - args->write_index)<0 ) - error("Error: failed to initialise index for %s\n",args->output_fname); + if ( is_text(args->output_type) ) + { + args->fh_bgzf = bgzf_open(args->output_fname, args->output_type&FT_GZ ? "wg" : "wu"); + } + else + { + args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode); + if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); + + // todo: allow both INFO vs FILTER? 
+ if ( args->mark_tag ) + { + int ret = bcf_hdr_printf(args->hdr, "##INFO=",args->mark_tag); + if ( ret!=0 ) error("Error adding the header tag INFO/%s\n",args->mark_tag); + } + if ( args->record_cmd_line ) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_remove-overlaps"); + if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( init_index2(args->out_fh,args->hdr,args->output_fname,&args->index_fn, args->write_index)<0 ) + error("Error: failed to initialise index for %s\n",args->output_fname); + } args->vcfbuf = vcfbuf_init(args->hdr, 0); - if ( args->rmdup ) - vcfbuf_set_opt(args->vcfbuf,int,VCFBUF_RMDUP,1) - else - vcfbuf_set_opt(args->vcfbuf,int,VCFBUF_OVERLAP_WIN,1) + vcfbuf_set(args->vcfbuf,MARK,args->mark_expr); + if ( args->missing_expr ) vcfbuf_set(args->vcfbuf,MARK_MISSING_EXPR,args->missing_expr); if ( args->filter_str ) args->filter = filter_init(args->hdr, args->filter_str); @@ -129,25 +159,40 @@ static void destroy_data(args_t *args) } free(args->index_fn); } - if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); + if ( args->out_fh && hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); + if ( args->fh_bgzf && bgzf_close(args->fh_bgzf)!=0 ) error("[%s] Error: close failed .. %s\n",__func__,args->output_fname); vcfbuf_destroy(args->vcfbuf); bcf_sr_destroy(args->sr); + free(args->kstr.s); free(args); } static void flush(args_t *args, int flush_all) { - int nbuf = vcfbuf_nsites(args->vcfbuf); bcf1_t *rec; - while ( (rec = vcfbuf_flush(args->vcfbuf, flush_all)) ) + while ( (rec=vcfbuf_flush(args->vcfbuf,flush_all)) ) { - if ( nbuf>2 || (nbuf>1 && flush_all) ) + int keep = vcfbuf_get_val(args->vcfbuf,int,MARK) ? 0 : 1; + if ( args->reverse ) keep = keep ? 
0 : 1; + if ( !keep ) { args->nrm++; - if ( args->verbose ) printf("%s\t%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - if ( args->print_overlaps && bcf_write1(args->out_fh, args->hdr, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - continue; // skip overlapping variants + if ( !args->mark_tag ) continue; + bcf_update_info_flag(args->hdr,rec,args->mark_tag,NULL,1); + } + + int ret; + if ( is_text(args->output_type) ) + { + args->kstr.l = 0; + ksprintf(&args->kstr,"%s\t%"PRIhts_pos"\n",bcf_seqname(args->hdr,rec),rec->pos+1); + if ( args->kstr.l && bgzf_write(args->fh_bgzf, args->kstr.s, args->kstr.l)!=args->kstr.l ) + error("Failed to write to %s\n", args->output_fname); + } + else + { + ret = bcf_write1(args->out_fh, args->hdr, rec); + if ( ret!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); } - if ( !args->print_overlaps && bcf_write1(args->out_fh, args->hdr, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); } } static void process(args_t *args) @@ -171,30 +216,36 @@ int run(int argc, char **argv) args->argc = argc; args->argv = argv; args->output_type = FT_VCF; args->output_fname = "-"; + args->mark_expr = "overlap"; args->clevel = -1; + args->record_cmd_line = 1; static struct option loptions[] = { - {"rm-dup",no_argument,NULL,'d'}, - {"print-overlaps",no_argument,NULL,'p'}, + {"mark-tag",required_argument,NULL,'M'}, + {"mark",required_argument,NULL,'m'}, + {"reverse",no_argument,NULL,1}, + {"no-version",no_argument,NULL,2}, + {"missing",required_argument,NULL,3}, {"exclude",required_argument,NULL,'e'}, {"include",required_argument,NULL,'i'}, {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, - {"verbose",no_argument,NULL,'v'}, {"write-index",optional_argument,NULL,'W'}, {NULL,0,NULL,0} }; int c; char *tmp; - while ((c = 
getopt_long(argc, argv, "r:R:t:T:o:O:i:e:vpdW::",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "m:M:r:R:t:T:o:O:i:e:dW::",loptions,NULL)) >= 0) { switch (c) { - case 'd': args->rmdup = 1; break; - case 'p': args->print_overlaps = 1; break; - case 'v': args->verbose = 1; break; + case 'm': args->mark_expr = optarg; break; + case 'M': args->mark_tag = optarg; break; + case 1 : args->reverse = 1; break; + case 2 : args->record_cmd_line = 0; break; + case 3 : args->missing_expr = optarg; break; case 'e': if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; @@ -212,12 +263,18 @@ int run(int argc, char **argv) case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; + case 't': args->output_type = FT_TAB_TEXT; break; default: { args->clevel = strtol(optarg,&tmp,10); if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); } } + if ( optarg[1]=='z' ) + { + optarg++; + args->output_type |= FT_GZ; + } if ( optarg[1] ) { args->clevel = strtol(optarg+1,&tmp,10); @@ -233,6 +290,7 @@ int run(int argc, char **argv) default: error("%s", usage_text()); break; } } + if ( args->write_index && is_text(args->output_type) ) error("Cannot index text output\n"); if ( args->filter_logic == (FLT_EXCLUDE|FLT_INCLUDE) ) error("Only one of -i or -e can be given.\n"); if ( optind==argc ) { diff --git a/plugins/scatter.c b/plugins/scatter.c index 55785f88a..351358068 100644 --- a/plugins/scatter.c +++ b/plugins/scatter.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "bcftools.h" #include "htslib/khash_str2int.h" #include "regidx.h" @@ -108,7 +109,7 @@ static const char *usage_text(void) "\n"; } -void mkdir_p(const char *fmt, ...); +void mkdir_p(const char *fmt, ...) 
HTS_FORMAT(HTS_PRINTF_FMT, 1, 2); // most of this code was inspired by Petr Danecek's code in regidx.c #define MAX_COOR_0 REGIDX_MAX // CSI and hts_itr_query limit, 0-based diff --git a/plugins/setGT.c b/plugins/setGT.c index 33164e5cc..ea1dfd9e9 100644 --- a/plugins/setGT.c +++ b/plugins/setGT.c @@ -1,6 +1,6 @@ /* plugins/setGT.c -- set gentoypes to given values - Copyright (C) 2015-2023 Genome Research Ltd. + Copyright (C) 2015-2024 Genome Research Ltd. Author: Petr Danecek @@ -401,19 +401,6 @@ static inline int set_gt_custom(args_t *args, int32_t *ptr, int ngts, int nals) return changed; } -static inline double calc_binom(int na, int nb) -{ - if ( na + nb == 0 ) return 1; - - /* - kfunc.h implements kf_betai, which is the regularized beta function I_x(a,b) = P(X<=a/(a+b)) - */ - double prob = na > nb ? 2*kf_betai(na, nb + 1, 0.5) : 2*kf_betai(nb, na + 1, 0.5); - if ( prob > 1 ) prob = 1; - - return prob; -} - static inline int random_draw(args_t *args) { return hts_drand48() > args->rand_frac ? 
1 : 0; // reversed random draw @@ -538,7 +525,7 @@ bcf1_t *process(bcf1_t *rec) error("The sample %s has incorrect number of %s fields at %s:%"PRId64"\n", args->in_hdr->samples[i],args->binom_tag,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); - double prob = calc_binom(args->iarr[i*nbinom+ia],args->iarr[i*nbinom+ib]); + double prob = calc_binom_two_sided(args->iarr[i*nbinom+ia],args->iarr[i*nbinom+ib],0.5); if ( !args->binom_cmp(prob,args->binom_val) ) continue; if ( args->tgt_mask>_RAND && random_draw(args) ) continue; diff --git a/plugins/split-vep.c b/plugins/split-vep.c index 6628a31f7..a59f6bb1b 100644 --- a/plugins/split-vep.c +++ b/plugins/split-vep.c @@ -162,7 +162,7 @@ static const char *default_severity(void) return "# Default consequence substrings ordered in ascending order by severity.\n" "# Consequences with the same severity can be put on the same line in arbitrary order.\n" - "# See also https://m.ensembl.org/info/genome/variation/prediction/predicted_data.htm\n" + "# See also https://www.ensembl.org/info/genome/variation/prediction/predicted_data.html\n" "intergenic\n" "feature_truncation feature_elongation\n" "regulatory\n" @@ -233,7 +233,7 @@ static const char *usage_text(void) " -f, --format STR Create non-VCF output; similar to `bcftools query -f` but drops lines w/o consequence\n" " -g, --gene-list [+]FILE Consider only features listed in FILE, or prioritize if FILE is prefixed with \"+\"\n" " --gene-list-fields LIST Fields to match against by the -g list, by default gene names [SYMBOL,Gene,gene]\n" - " -H, --print-header Print header\n" + " -H, --print-header Print header, -HH to omit column indices\n" " -l, --list Parse the VCF header and list the annotation fields\n" " -p, --annot-prefix STR Before doing anything else, prepend STR to all CSQ fields to avoid tag name conflicts\n" " -s, --select TR:CSQ Select transcripts to extract by type and/or consequence severity. 
(See also -S and -x.)\n" @@ -623,7 +623,7 @@ static void parse_column_str(args_t *args) // either the original or sanitized version of the tag exists idx_end = idx_beg; } - else if ( (tp=strrchr(bp,':')) ) // notice this requests the last occurence of ':' + else if ( (tp=strrchr(bp,':')) ) // notice this requests the last occurrence of ':' { // there is a colon in the original string, expecting type specification *tp = 0; @@ -711,15 +711,13 @@ static void parse_column_str(args_t *args) ann->idx = j = column[i]; ann->field = strdup(args->field[j]); ann->tag = strdup(args->field[j]); - args->kstr.l = 0; const char *type = "String"; if ( ann->type==BCF_HT_REAL ) type = "Float"; else if ( ann->type==BCF_HT_INT ) type = "Integer"; else if ( ann->type==BCF_HT_FLAG ) type = "Flag"; else if ( ann->type==BCF_HT_STR ) type = "String"; else if ( ann->type==-1 ) type = get_column_type(args, args->field[j], &ann->type); - ksprintf(&args->kstr,"##INFO=",type); - bcf_hdr_printf(args->hdr_out, args->kstr.s, ann->tag,ann->field,args->vep_tag); + bcf_hdr_printf(args->hdr_out, "##INFO=", ann->tag,type,ann->field,args->vep_tag); if ( str.l ) kputc(',',&str); kputs(ann->tag,&str); } @@ -1001,6 +999,7 @@ static void init_data(args_t *args) args->convert = convert_init(args->hdr_out, NULL, 0, args->format_str); if ( !args->convert ) error("Could not parse the expression: %s\n", args->format_str); if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1); + if ( args->print_header>1 ) convert_set_option(args->convert, no_hdr_indices, 1); convert_set_option(args->convert, force_newline, 1); } if ( args->genes_fname ) init_gene_list(args); @@ -1350,7 +1349,7 @@ static void restrict_csqs_to_genes(args_t *args) // Split the VEP annotation by transcript and by field, then check if the number of subfields looks alright. 
// Unfortunately, we cannot enforce the number of subfields to match the header definition because that can // be variable: `bcftools csq` outputs different number of fields for different consequence types. -// So we need to distinguish between this reasonable case and incorrectly formated consequences such +// So we need to distinguish between this reasonable case and incorrectly formatted consequences such // as those reported for LoF_info subfield here https://github.com/Ensembl/ensembl-vep/issues/1351. static void split_csq_fields(args_t *args, bcf1_t *rec, int csq_str_len) { @@ -1541,7 +1540,7 @@ int run(int argc, char **argv) else if ( !strcasecmp(optarg,"space") ) args->all_fields_delim = " "; else args->all_fields_delim = optarg; break; - case 'H': args->print_header = 1; break; + case 'H': args->print_header++; break; case 'x': drop_sites = 1; break; case 'X': drop_sites = 0; break; case 'd': args->duplicate = 1; break; diff --git a/plugins/split.c b/plugins/split.c index 5e25497ab..bac2ad7c9 100644 --- a/plugins/split.c +++ b/plugins/split.c @@ -143,7 +143,7 @@ static const char *usage_text(void) "\n"; } -void mkdir_p(const char *fmt, ...); +void mkdir_p(const char *fmt, ...) HTS_FORMAT(HTS_PRINTF_FMT, 1, 2); static char *create_unique_file_name(args_t *args, const char *template) { diff --git a/plugins/tag2tag.c b/plugins/tag2tag.c index 5ac1175bf..e27a13b3b 100644 --- a/plugins/tag2tag.c +++ b/plugins/tag2tag.c @@ -24,6 +24,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include +#include #include #include #include @@ -133,6 +134,20 @@ static int parse_ori2new_option(args_t *args, char *optarg) args->loc_src = (1<loc_dst = (1<src = LXX; + args->dst = XX; + args->loc_src = (1<loc_dst = (1<src = LXX; + args->dst = XX; + args->loc_src = (1<loc_dst = (1<src = XX; @@ -140,6 +155,20 @@ static int parse_ori2new_option(args_t *args, char *optarg) args->loc_src = (1<loc_dst = (1<src = XX; + args->dst = LXX; + args->loc_src = (1<loc_dst = (1<src = XX; + args->dst = LXX; + args->loc_src = (1<loc_dst = (1<src = QRQA; @@ -247,7 +276,8 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) int j = tags_LXX[i]; if ( !(args->loc_src & (1<in_hdr,BCF_DT_ID,tags[j].str))<0 || !bcf_hdr_idinfo_exists(args->in_hdr,BCF_HL_FMT,tag_id) ) error("The source tag does not exist: %s\n",tags[j].str); - if ( bcf_hdr_id2type(args->in_hdr,BCF_HL_FMT,tag_id)!=tags[j].type ) error("The source tag is of different type than required by the VCF specification\n"); + if ( bcf_hdr_id2type(args->in_hdr,BCF_HL_FMT,tag_id)!=tags[j].type ) + error("The source tag %s is of different type than required by the VCF specification (%d vs %d)\n",tags[j].str,bcf_hdr_id2type(args->in_hdr,BCF_HL_FMT,tag_id),tags[j].type); } } else if ( args->src==XX ) @@ -257,13 +287,15 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) int j = tags_XX[i]; if ( !(args->loc_src & (1<in_hdr,BCF_DT_ID,tags[j].str))<0 || !bcf_hdr_idinfo_exists(args->in_hdr,BCF_HL_FMT,tag_id) ) error("The source tag does not exist: %s\n",tags[j].str); - if ( bcf_hdr_id2type(args->in_hdr,BCF_HL_FMT,tag_id)!=tags[j].type ) error("The source tag is of different type than required by the VCF specification\n"); + if ( bcf_hdr_id2type(args->in_hdr,BCF_HL_FMT,tag_id)!=tags[j].type ) + error("The source tag %s is of different type than required by the VCF specification (%d vs %d)\n",tags[j].str,bcf_hdr_id2type(args->in_hdr,BCF_HL_FMT,tag_id),tags[j].type); } } else { if ( 
(tag_id=bcf_hdr_id2int(args->in_hdr,BCF_DT_ID,tags[args->src].str))<0 || !bcf_hdr_idinfo_exists(args->in_hdr,BCF_HL_FMT,tag_id) ) error("The source tag does not exist: %s\n",tags[args->src].str); - if ( bcf_hdr_id2type(args->in_hdr,BCF_HL_FMT,tag_id)!=tags[args->src].type ) error("The source tag is of different type than required by the VCF specification\n"); + if ( bcf_hdr_id2type(args->in_hdr,BCF_HL_FMT,tag_id)!=tags[args->src].type ) + error("The source tag %s is of different type than required by the VCF specification (%d vs %d)\n",tags[args->src].str,bcf_hdr_id2type(args->in_hdr,BCF_HL_FMT,tag_id),tags[args->src].type); } // Remove tags from the header if -r, --replace was given. However, do not remove if -s, --skip-nalt was given, diff --git a/plugins/trio-dnm2.c b/plugins/trio-dnm2.c index 947570660..89f76f5f6 100644 --- a/plugins/trio-dnm2.c +++ b/plugins/trio-dnm2.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2018-2023 Genome Research Ltd. + Copyright (c) 2018-2024 Genome Research Ltd. Author: Petr Danecek @@ -77,7 +77,7 @@ typedef struct { // combines priors, mutation rates, genotype transmission probability; see init_priors() double pprob[10][10][10]; // prior probability; the order is father,mother,child - uint8_t denovo[10][10][10]; // is the GT combination not compatible with normal inheritence (0) or is de novo (1) + uint8_t denovo[10][10][10]; // is the GT combination not compatible with normal inheritance (0) or is de novo (1) uint8_t denovo_allele[10][10][10]; // which of the alleles is de novo for this configuration } priors_t; @@ -366,7 +366,7 @@ static double init_mf_priors(args_t *args, int fi, int mi) int nref_mf = (fa==0 ? 1 : 0) + (fb==0 ? 1 : 0) + (ma==0 ? 1 : 0) + (mb==0 ? 
1 : 0); const double p_homref = 0.998; // this assumes bi-allelic sites - const double p_poly = (1 - p_homref) * (1 - p_homref); // p of this occuring twice for a different allele + const double p_poly = (1 - p_homref) * (1 - p_homref); // p of this occurring twice for a different allele const double p_nonref = 1 - p_homref - p_poly; if ( nalt_mf>=3 ) // penalize heavily sites with 3 unique ALTs @@ -399,7 +399,7 @@ static double init_mf_priors_chrX(args_t *args, int mi) int nref_m = (ma==0 ? 1 : 0) + (mb==0 ? 1 : 0); const double p_homref = 0.999; // this assumes bi-allelic sites - const double p_poly = (1 - p_homref) * (1 - p_homref); // p of this occuring twice for a different allele + const double p_poly = (1 - p_homref) * (1 - p_homref); // p of this occurring twice for a different allele const double p_nonref = 1 - p_homref - p_poly; if ( nalt_m>=2 ) // 2 unique ALTs, 3 cases @@ -429,7 +429,7 @@ static double init_mf_priors_chrXX(args_t *args, int fi, int mi) else nalt_mf--; const double p_homref = 0.998; // this assumes bi-allelic sites - const double p_poly = (1 - p_homref) * (1 - p_homref); // p of this occuring twice for a different allele + const double p_poly = (1 - p_homref) * (1 - p_homref); // p of this occurring twice for a different allele const double p_nonref = 1 - p_homref - p_poly; if ( nalt_mf>=3 ) // 3 unique ALTs @@ -515,11 +515,13 @@ static void init_tprob_mprob(args_t *args, int fi, int mi, int ci, double *tprob int is_novel; if ( args->strictly_novel ) { - // account for LoH sites, see trio-dnm.11.vcf - // chr1:10000057 child=1/1 father=1/1 mother=0/0 .. LoH region - // chr1:10697377 child=0/1 father=1/1 mother=1/1 .. usually these are indel ambiguities + // Consider as DNMs only variants with a novel allele. For example, if heterozygosity is lost and + // Mendelian inheritance is violated (11+00=11), don't consider this to be a novel allele + // (see trio-dnm.11.vcf) + // chr1:10000057 f,m,c 11+00=11 .. 
LoH region + // chr1:10697377 f,m,c 11+11=01 .. usually these are indel ambiguities is_novel = ( (ca!=fa && ca!=fb && ca!=ma && ca!=mb) || (cb!=fa && cb!=fb && cb!=ma && cb!=mb) ) ? 1 : 0; - if ( is_novel && *denovo_allele==0 ) is_novel = 0; + if ( is_novel && *denovo_allele==0 ) is_novel = 0; // never count reference allele as novel } else { @@ -618,7 +620,7 @@ static void init_priors(args_t *args, priors_t *priors, init_priors_t type) if ( args->use_dng_priors ) init_DNG_tprob_mprob(args,fi,mi,ci,&tprob,&mprob,&allele); - else if ( type==autosomal || args->strictly_novel ) + else if ( type==autosomal ) init_tprob_mprob(args,fi,mi,ci,&tprob,&mprob,&allele); else if ( type==chrX ) init_tprob_mprob_chrX(args,mi,ci,&tprob,&mprob,&allele); @@ -843,7 +845,7 @@ static inline double subtract_num_log(double a_num, double b_log) return log(a_num - exp(b_log)); } #endif -static inline double subtract_log(double a_log, double b_log) +static inline double subtract_log(double a_log, double b_log) // log(exp(a_log)-exp(b_log)) { return a_log + log(1 - exp(b_log - a_log)); } @@ -863,18 +865,21 @@ static double process_trio_ACM(args_t *args, priors_t *priors, int nals, double double sum = -HUGE_VAL, max = -HUGE_VAL; int i, ca,cb, fa,fb, ma,mb, ci=0; - for (ca=0; cawith_ppl ) fpl = pl[iFATHER][fi]; else @@ -882,10 +887,18 @@ static double process_trio_ACM(args_t *args, priors_t *priors, int nals, double fpl = 0; for (i=0; i strong genotype fa|fb if ( fals&(1< weaker genotype fa|fb else if ( cals&(1<pprob[fi][mi][ci]; - sum = sum_log(sum,val); + double val = cpl + fpl + mpl + priors->pprob[fi][mi][ci]; // L_{p,f,m} + sum = sum_log(sum,val); // this is the denominator, adding to \sum L_{p,f,m} #define DEBUG 0 #if DEBUG if(val!=-HUGE_VAL) fprintf(stderr,"m,f,c: %d%d+%d%d=%d%d dn=%d (%d,%d,%d) mpl,fpl,cpl: %+e %+e %+e \t prior:%+e \t pval=%+e sum=%+e %c\n", mb,ma,fb,fa,cb,ca,priors->denovo[fi][mi][ci],fi,mi,ci,mpl,fpl,cpl,priors->pprob[fi][mi][ci], val,sum,(priors->denovo[fi][mi][ci] 
&& max < val)?'*':'-'); #endif + // Is this a valid de novo combination of p,f,m genotypes (ie not inherited), and is it most likely thus far? if ( priors->denovo[fi][mi][ci] && max < val ) { max = val; @@ -939,6 +953,18 @@ static double process_trio_ACM(args_t *args, priors_t *priors, int nals, double #if DEBUG fprintf(stderr,"max=%e sum=%e ret=%e\n",max,sum,max-sum); #endif + + if ( args->strictly_novel ) + { + // Downplay de novo calls with alleles present in the parents + int ial = *al1; + sum = sum_log(sum,qs[iMOTHER][ial] + qs[iFATHER][ial]); + max += qs[iMOTHER][ial] + qs[iFATHER][ial]; + } + + // This is the log( 1 - (\max L_pfm) / (\sum L_pfm) ). The default output (DNM:log) prints the inverse. Note log + // values smaller than ~40 will be output as -inf due to a loss of precision. That's OK, we are interested + // in values very close to 0, bigger than, say, -8 return log2phred(subtract_log(0,max-sum)); } static double process_trio_DNG(args_t *args, priors_t *priors, int nals, double *pl[3], int npl, int *al0, int *al1) @@ -1371,7 +1397,7 @@ static void process_record(args_t *args, bcf1_t *rec) return; } - // fake QS from AD assuming average BQ=30 + // fake QS from AD assuming average BQ=30, used by --with-pAD nret = n_ad * nsmpl; hts_expand(int32_t,nret,args->mqs,args->qs); for (i=0; i @@ -134,7 +134,7 @@ static void init_data(args_t *args) error("Error: failed to initialise index for %s\n",args->output_fname); args->buf = vcfbuf_init(args->hdr, 0); - vcfbuf_set_opt(args->buf,int,VCFBUF_DUMMY,1) + vcfbuf_set(args->buf,VCFBUF_DUMMY,1); } static void destroy_data(args_t *args) { diff --git a/polysomy.c b/polysomy.c index 1a99f98af..809ee1139 100644 --- a/polysomy.c +++ b/polysomy.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "bcftools.h" #include "peakfit.h" @@ -62,7 +63,7 @@ typedef struct } args_t; -FILE *open_file(char **fname, const char *mode, const char *fmt, ...); +FILE *open_file(char **fname, const char *mode, const char *fmt, 
...) HTS_FORMAT(HTS_PRINTF_FMT, 3, 4); static void init_dist(args_t *args, dist_t *dist, int verbose) { diff --git a/regidx.h b/regidx.h index f13b52a93..c9fa45ddf 100644 --- a/regidx.h +++ b/regidx.h @@ -114,7 +114,7 @@ int regidx_parse_vcf(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*); * * @param fname: input file name or NULL if regions will be added one-by-one via regidx_insert() * @param parsef: regidx_parse_bed, regidx_parse_tab or see description of regidx_parse_f. If NULL, - * the format will be autodected, currently either regidx_parse_tab (the default) or + * the format will be autodetected, currently either regidx_parse_tab (the default) or * regidx_parse_bed (file must be named 'bed' or 'bed.gz') will be used. Note that * the exact autodetection algorithm will change. * @param freef: NULL or see description of regidx_parse_f diff --git a/reheader.c b/reheader.c index e0ae12533..37e5d965e 100644 --- a/reheader.c +++ b/reheader.c @@ -415,7 +415,7 @@ static void reheader_vcf_gz(args_t *args) if ( bgzf_write(bgzf_out, hdr.s, hdr.l) < 0 ) error("Can't write BGZF header (code %d)\n", bgzf_out->errcode); free(hdr.s); - // Output all remainig data read with the header block + // Output all remaining data read with the header block if ( fp->block_length - skip_until > 0 ) { if ( bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until)<0 ) error("Error: %d\n",fp->errcode); diff --git a/str_finder.c b/str_finder.c index a850fb529..a9281d811 100644 --- a/str_finder.c +++ b/str_finder.c @@ -50,7 +50,7 @@ static void add_rep(rep_ele **list, char *cons, int clen, int pos, int rlen, return; } - // Find current and last occurence of repeated word. + // Find current and last occurrence of repeated word. 
cp2 = &cons[pos+1]; // If unpadded, this is quicker: cp1 = &cons[pos+1-rlen]; diff --git a/test/annotate.escape.1.1.out b/test/annotate.escape.1.1.out new file mode 100644 index 000000000..f908ce95c --- /dev/null +++ b/test/annotate.escape.1.1.out @@ -0,0 +1,8 @@ +##fileformat=VCFv4.3 +##FILTER= +##INFO= +##FORMAT= +##contig= +##reference=ref.fa +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT smpl +chr1 101 . C G . . ISTR=%3D_and_%_and_%3B_work FSTR %_and_=_and_%3A_work diff --git a/test/annotate.escape.1.tab b/test/annotate.escape.1.tab new file mode 100644 index 000000000..1bfe31616 --- /dev/null +++ b/test/annotate.escape.1.tab @@ -0,0 +1 @@ +chr1 101 =_and_%_and_;_work %_and_=_and_:_work diff --git a/test/annotate.escape.1.vcf b/test/annotate.escape.1.vcf new file mode 100755 index 000000000..5f774382f --- /dev/null +++ b/test/annotate.escape.1.vcf @@ -0,0 +1,7 @@ +##fileformat=VCFv4.3 +##INFO= +##FORMAT= +##contig= +##reference=ref.fa +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT smpl +chr1 101 . C G . . . . . diff --git a/test/annotate.match.1.1.out b/test/annotate.match.1.1.out new file mode 100644 index 000000000..f658c9abd --- /dev/null +++ b/test/annotate.match.1.1.out @@ -0,0 +1,9 @@ +##fileformat=VCFv4.2 +##FILTER= +##INFO= +##INFO= +##INFO= +##INFO= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +chr1 10 . A C . PASS STR=xx2;INT=22;FLT=2.2;SCORE=22 diff --git a/test/annotate.match.1.2.out b/test/annotate.match.1.2.out new file mode 100644 index 000000000..eca42dd35 --- /dev/null +++ b/test/annotate.match.1.2.out @@ -0,0 +1,9 @@ +##fileformat=VCFv4.2 +##FILTER= +##INFO= +##INFO= +##INFO= +##INFO= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +chr1 10 . A T . 
PASS STR=aa1;INT=11;FLT=1.1;SCORE=11 diff --git a/test/annotate.match.1.tab b/test/annotate.match.1.tab new file mode 100644 index 000000000..cbb734486 --- /dev/null +++ b/test/annotate.match.1.tab @@ -0,0 +1,3 @@ +#CHROM POS REF ALT SCORE STR INT FLT +chr1 10 A T 11 xx1 11 1.1 +chr1 10 A T 22 xx2 22 2.2 diff --git a/test/annotate.match.1.vcf b/test/annotate.match.1.vcf new file mode 100644 index 000000000..6c964135e --- /dev/null +++ b/test/annotate.match.1.vcf @@ -0,0 +1,9 @@ +##fileformat=VCFv4.2 +##INFO= +##INFO= +##INFO= +##INFO= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +chr1 10 . A C . PASS STR=xx2;INT=22;FLT=2.2 +chr1 10 . A T . PASS STR=aa1;INT=11;FLT=1.1 diff --git a/test/atomize.split.4.1.out b/test/atomize.split.4.1.out index 84b94e2b8..dac8c823e 100644 --- a/test/atomize.split.4.1.out +++ b/test/atomize.split.4.1.out @@ -6,7 +6,7 @@ 11 100 . C G . . OLD_REC=11|100|CC|GG|1 11 101 . C G . . OLD_REC=11|100|CC|GG|1 11 200 . C G . . OLD_REC=11|200|CC|GGGG|1 -11 201 . C CGG . . OLD_REC=11|200|CC|GGGG -11 201 . C G . . OLD_REC=11|200|CC|GGGG -11 300 . C CGGG . . OLD_REC=11|300|C|GGGG -11 300 . C G . . OLD_REC=11|300|C|GGGG +11 201 . C CGG . . OLD_REC=11|200|CC|GGGG|1 +11 201 . C G . . OLD_REC=11|200|CC|GGGG|1 +11 300 . C CGGG . . OLD_REC=11|300|C|GGGG|1 +11 300 . C G . . OLD_REC=11|300|C|GGGG|1 diff --git a/test/atomize.split.4.2.out b/test/atomize.split.4.2.out new file mode 100644 index 000000000..00ab4cf7c --- /dev/null +++ b/test/atomize.split.4.2.out @@ -0,0 +1,9 @@ +##fileformat=VCFv4.3 +##FILTER= +##contig= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +11 100 . C G . . OLD_REC=11|100|CC|GG|1 +11 101 . C G . . OLD_REC=11|100|CC|GG|1 +11 200 . CC GGGG . . . +11 300 . C GGGG . . . 
diff --git a/test/atomize.split.5.1.out b/test/atomize.split.5.1.out new file mode 100644 index 000000000..08b58a976 --- /dev/null +++ b/test/atomize.split.5.1.out @@ -0,0 +1,17 @@ +##fileformat=VCFv4.1 +##FILTER= +##contig= +##contig= +##contig= +##FORMAT= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample +1 105174100 . G C . . OLD_REC=1|105174100|GCCCGC|CGCCCCGC|1 GT 0/1 +1 105174100 . G GGC . . OLD_REC=1|105174100|GCCCGC|CGCCCCGC|1 GT 0/1 +2 36744887 . C G . . OLD_REC=2|36744886|GCCCC|GGCT|1 GT 0/1 +2 36744889 . C T . . OLD_REC=2|36744886|GCCCC|GGCT|1 GT 0/1 +2 36744889 . CC C . . OLD_REC=2|36744886|GCCCC|GGCT|1 GT 0/1 +11 101 . G C . . OLD_REC=11|101|GCGT|G,GCGA,GTGA,CCGT|4 GT 0/. +11 101 . GCGT G . . OLD_REC=11|101|GCGT|G,GCGA,GTGA,CCGT|1 GT 0/1 +11 102 . C T . . OLD_REC=11|101|GCGT|G,GCGA,GTGA,CCGT|3 GT 0/. +11 104 . T A . . OLD_REC=11|101|GCGT|G,GCGA,GTGA,CCGT|2,3 GT 0/. diff --git a/test/atomize.split.5.2.out b/test/atomize.split.5.2.out new file mode 100644 index 000000000..4d3fc5cd9 --- /dev/null +++ b/test/atomize.split.5.2.out @@ -0,0 +1,17 @@ +##fileformat=VCFv4.1 +##FILTER= +##contig= +##contig= +##contig= +##FORMAT= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample +1 105174100 . G C,* . . OLD_REC=1|105174100|GCCCGC|CGCCCCGC|1 GT 0/1 +1 105174100 . G GGC,* . . OLD_REC=1|105174100|GCCCGC|CGCCCCGC|1 GT 0/1 +2 36744887 . C G . . OLD_REC=2|36744886|GCCCC|GGCT|1 GT 0/1 +2 36744889 . C T,* . . OLD_REC=2|36744886|GCCCC|GGCT|1 GT 0/1 +2 36744889 . CC C,* . . OLD_REC=2|36744886|GCCCC|GGCT|1 GT 0/1 +11 101 . G C,* . . OLD_REC=11|101|GCGT|G,GCGA,GTGA,CCGT|4 GT 0/2 +11 101 . GCGT G,* . . OLD_REC=11|101|GCGT|G,GCGA,GTGA,CCGT|1 GT 0/1 +11 102 . C T,* . . OLD_REC=11|101|GCGT|G,GCGA,GTGA,CCGT|3 GT 0/2 +11 104 . T A,* . . 
OLD_REC=11|101|GCGT|G,GCGA,GTGA,CCGT|2,3 GT 0/2 diff --git a/test/atomize.split.5.vcf b/test/atomize.split.5.vcf new file mode 100644 index 000000000..be44c6d9a --- /dev/null +++ b/test/atomize.split.5.vcf @@ -0,0 +1,9 @@ +##fileformat=VCFv4.1 +##contig= +##contig= +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample +1 105174100 . GCCCGC CGCCCCGC . . . GT 0/1 +2 36744886 . GCCCC GGCT . . . GT 0/1 +11 101 . GCGT G,GCGA,GTGA,CCGT . . . GT 0/1 diff --git a/test/check.chk b/test/check.chk index 31e4eb35b..85791369c 100644 --- a/test/check.chk +++ b/test/check.chk @@ -26,10 +26,16 @@ SN 0 number of indels: 9 SN 0 number of others: 2 SN 0 number of multiallelic sites: 6 SN 0 number of multiallelic SNP sites: 1 -# TSTV, transitions/transversions: +# TSTV, transitions/transversions +# - transitions, see https://en.wikipedia.org/wiki/Transition_(genetics) +# - transversions, see https://en.wikipedia.org/wiki/Transversion # TSTV [2]id [3]ts [4]tv [5]ts/tv [6]ts (1st ALT) [7]tv (1st ALT) [8]ts/tv (1st ALT) TSTV 0 3 2 1.50 3 1 3.00 # SiS, Singleton stats: +# - allele count, i.e. 
the number of singleton genotypes (AC=1) +# - number of transitions, see above +# - number of transversions, see above +# - repeat-consistent, inconsistent and n/a: experimental and useless stats [DEPRECATED] # SiS [2]id [3]allele count [4]number of SNPs [5]number of transitions [6]number of transversions [7]number of indels [8]repeat-consistent [9]repeat-inconsistent [10]not applicable SiS 0 1 3 1 2 0 0 0 0 # AF, Stats by non-reference allele frequency: @@ -75,6 +81,13 @@ ST 0 G>T 1 ST 0 T>A 0 ST 0 T>C 0 ST 0 T>G 0 +# DP, depth: +# - set id, see above +# - the depth bin, corresponds to the depth (unless --depth was given) +# - number of genotypes with this depth (zero unless -s/-S was given) +# - fraction of genotypes with this depth (zero unless -s/-S was given) +# - number of sites with this depth +# - fraction of sites with this depth # DP, Depth distribution # DP [2]id [3]bin [4]number of genotypes [5]fraction of genotypes (%) [6]number of sites [7]fraction of sites (%) DP 0 10 2 5.555556 0 0.000000 diff --git a/test/consensus.beyond.1.out b/test/consensus.beyond.1.out new file mode 100644 index 000000000..56fec1829 --- /dev/null +++ b/test/consensus.beyond.1.out @@ -0,0 +1,4 @@ +>1:1-1 +A +>2:2-2 +G diff --git a/test/consensus.beyond.fa b/test/consensus.beyond.fa new file mode 100644 index 000000000..6dd1f3d15 --- /dev/null +++ b/test/consensus.beyond.fa @@ -0,0 +1,4 @@ +>1:1-1 +A +>2:2-2 +G \ No newline at end of file diff --git a/test/consensus.beyond.vcf b/test/consensus.beyond.vcf new file mode 100644 index 000000000..e6371754e --- /dev/null +++ b/test/consensus.beyond.vcf @@ -0,0 +1,6 @@ +##fileformat=VCFv4.2 +##contig= +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 +1 3 . A T 100 . . 
GT 0/1 diff --git a/test/consensus.overlaps.1.1.out b/test/consensus.overlaps.1.1.out new file mode 100644 index 000000000..84c966eec --- /dev/null +++ b/test/consensus.overlaps.1.1.out @@ -0,0 +1,4 @@ +>1 +CGAC +>2 +CGATC diff --git a/test/consensus.overlaps.1.2.out b/test/consensus.overlaps.1.2.out new file mode 100644 index 000000000..0031fa15f --- /dev/null +++ b/test/consensus.overlaps.1.2.out @@ -0,0 +1,4 @@ +>1 +CGC +>2 +CGTC diff --git a/test/consensus.overlaps.1.3.out b/test/consensus.overlaps.1.3.out new file mode 100644 index 000000000..ea3495f2d --- /dev/null +++ b/test/consensus.overlaps.1.3.out @@ -0,0 +1,4 @@ +>1 +NGAN +>2 +NGATN diff --git a/test/consensus.overlaps.1.4.out b/test/consensus.overlaps.1.4.out new file mode 100644 index 000000000..aa4b17c88 --- /dev/null +++ b/test/consensus.overlaps.1.4.out @@ -0,0 +1,4 @@ +>1 +NGN +>2 +NGTN diff --git a/test/consensus.overlaps.1.fa b/test/consensus.overlaps.1.fa new file mode 100644 index 000000000..8258d14e3 --- /dev/null +++ b/test/consensus.overlaps.1.fa @@ -0,0 +1,4 @@ +>1 +CGGGACAC +>2 +CGGGACAC diff --git a/test/consensus.overlaps.1.vcf b/test/consensus.overlaps.1.vcf new file mode 100644 index 000000000..d2edc8c37 --- /dev/null +++ b/test/consensus.overlaps.1.vcf @@ -0,0 +1,10 @@ +##fileformat=VCFv4.2 +##FORMAT= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B +1 2 . GGGAC G, 649.73 . . GT 1/1 1/1 +1 6 . CA C, 536.73 . . GT 0/0 1/1 +2 2 . GGGAC G, 649.73 . . GT 1/1 1/1 +2 6 . CA C, 536.73 . . GT 0/0 1/1 +2 7 . A AT, 649.73 . . 
GT 1/1 1/1 diff --git a/test/csq/ENST00000573035-bt2212/ENST00000573035-bt2212.fa b/test/csq/ENST00000573035-bt2212/ENST00000573035-bt2212.fa new file mode 100644 index 000000000..4b9f742ad --- /dev/null +++ b/test/csq/ENST00000573035-bt2212/ENST00000573035-bt2212.fa @@ -0,0 +1,715 @@ +>7 7:53036-102995 +TTAAGTGAAAAGGGAGTGTGGTAAAATTTTCTTTGATTCTAAAGATAAGAATTTGCTGGAGAAAAGAGCA +GCACATAATTGCAGAATATTAATAAATGGTTAAAACTTTGCATTGGCTGGGATTATAGTGATTTTTTACC +TTCTGCTACCTAGCAAACTTTTGTCTTTTCAAAGACGTAAAGTCTTTACTTAAAATAAATATGCATTGCT +GTATTTATAGGGAGGGCAGAGAGGATTTCCCTACACAATCTAGCTGAAAGTTCTCACATGCAATCATATC +ATTGCATTTGCTTTTCTAGGCATTTCCCTGGAAATGGCAGCTGTGACAGTAAAGGAAGAATCAGAAGATC +CTGATTATTATCAATATAACATTCAAGGTAATTTGAATTAATGCAATTTTTCTTTCTAAAAATTATTCGT +GGTTAAAATTAAAATTTGCTCATCAATTGCTTTAATTTCTTAAATAATATTTTATTGATCAGTTCTTGAT +TGACATATATATTGTAATTCAGTCCCGGGGATAAAACATTTAAAAATGGGGCTAAAAGATCAACTCAGAC +AATCCAGAGGGGATATGTAAAATAGCCATTTGTGTTCTTAAAAGGATGAGCAAAAGTTGTGGCACAATTT +AGAATTCAATCCCAGGTTCTAGTGTTGCAAAATAAAACAAACCTATTAGGCAAATGCAGATAATGTCAGC +TTAATTTTTTCTCACTGCATAATTATAGTATATTAAACACTTAAAGTAAAAAATCTGGTTAGCTTTGCCA +TCTACATATCTAATACACCATCTTCATTGCATCCAAGATAATGAAATATCTATAACCCCAAAAGTTTCCT +GTGTCCCCTTGTTATTCATTGCCCTGCCCAGTATTCAGGCAACACGGATCTGTTTTCTGTTTTAGGTTAG +TTTGCATTTTCTATAAAGTCTTATGAATGAAATAATAAAATGTGGACTATTTTCATGGGCCGGGGAGCAG +TGTGGCTTCTTTCATTTCAAATGATTGTTTTGAAATTCATCCACAGCGTTGCACGTATCAGTAGTAGATT +CCATTTGATTGTTGATTTGTATTCTATTGTATGCCTGAGTCAAAATTTATTCATTTCTTTGTCTGTTGAT +AGCCATTTGGGTTCTTCCAGTTTGGGGCCATTACAAATAGAGGTACTATGAACATTGTGTAGAGGCTTTT +GTGTGGACATAGGCCTTCATTTCTCTCTTTTTTTTTTTTTTTTTTGAGACAGAGTCTCACTCTGTTGCCC +AGGCTGGAGTGCAGTGGCGCGATCTTGGCTCACTGCAACCTCCACCTCCCAGGTTGAGGTGATTCTTCTG +CCTCAGCCTCCCGAATAGCTGGGATAGCAGGTGTGTGCCAACACACCTGGCTAATTTTTGTGTTTTTTAG +TAGACACAGGGTTTCACCATGTTGGCCAGGCTGGTCTCGAGCTCCTGACCTCAGGTGATCCACCCGCCTT +GGCCTCCCAAAGTACTGGGATTACAGGGGCTAACACGCTTTTTTAAAATGAAATATTTAAAACCTAAAAG +CATAGTATCAGTCTGCCCAGGTTTAGTGAACCATAGTATTTGTCTCTTCTCTCCTCCCTCCAAGAAACAA 
+CGTGTAACTATTTCATTGAAGCCTTTCTTTGGGCTCCCCCAAATCCCCACTTTTCTCTCTATTCTTAAGT +TAACCATGATCTTGAAAACTGAGAATTGTCCTTCCTGATGGTGATGTGCTTTACTACGTAAATATTCTCC +CGGGAGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTCATGTCATATTTATT +GTTTTATATATGCCACTTCCTGAGTGAAGCTTACACAGTGGTACACTGTATGTGTTGTTGAGTATAGTCT +GTCACTCAATTTAAGTTTTCAAGATTTATCTTTTTTTTTTTTTTTTTGGCGTGGGGGTGGGGATGGATTT +TTGCTCTGTCGCCCAAGCTGGAGTGCAGTGGTGAGATCTCGGCTCATTGCAACCTCCGCTTGCCGGGTTC +AAGCAATTCTCTGCCTCAGCTGGGATTTATCTGTTTTGTCACAAGTAGATTTAATCATGTCATCATGGTA +GTTACACTGTGTGATTATAATACAGTTCGTTTTTTCGTTCTTCTACATAAAGACATTGATGTTTTCTTCT +AAAACTTTTGTATTCCAAAATAGGCTTGGATATATTTCTCTGTGCACCTCAGTGCTCCTGTAAAATAGTG +ACATTCAAATGTGAGTGTTGGGAATGTGTTGATTTTCTCAATGAGTGGTAGGCACCTCAAGCATTTGAGA +GGTGGAGGTTAAGTATGTTTTAGGGTGAATAAAATCAAGTACCATTTTTAGGGTGAATAAAATTCAAATA +TGGGACATTAAAATAATTTGAAAGTATAGTGATACGTGAAGGTATTTTGAGTAAATACCAACTAAAAAAA +ATATAGTTTAGCAATATTGGAAGCCAAAAACAGTAGTAGCAGGATAGAATAAGACTATGTCGTATTTAAA +GGAATTGGTCACTAAGTTGATATAACAACCATTAGTTTGTATGTGCCTAACTATGTATCCTCACAGTAAA +AGTAAAAATTATTTACAAATTATAAGTCTCTATAGATATGTACAGTCATGTGGGAGATTTTAACCAATGA +TATACGTCTAGCTGATGAATACTCTTTTAACATTTATATGCTAAATAGAATATTTTCACTACAGTTAATA +TATGTAAGCTAACTATAGAAAACTGTGTACCCAAGAAATAAGTTTATTTACAAAGCATGAATCATTTATA +AAAATTGACTGTATACTTGTTCACAGTCTTAATAAATTCTCTGAACATACTGAAATTAAATTATTAATAG +AAAACATACACAGTGGGAATCCTAGGAAACATGCTTGTAATTGACTTAATGAGGGCCTCATTAAAAATAT +TTAGACCAGAATATTGAGAATGCCACATAGTTAAACTTGTGAAGAGCAACTAAAGCAGTACTTAGAGGTC +AATTAGTAGCCTTCACTAAGGTTGCATTTTTAAATCCTGTAAATAGCACATCTTCCTTTTCTTGAGCTCT +GTATATTTTACTCAGAGTCAGAAGACTAGGAAGCAGTGGATCTAGCATTTGAATTTAGGAGTCTACATTC +CAAGACTTGCTCTTCTTTGAGATTCCAAATAGAAACTATTTCTTTATTTCTCATGCAATTGATTTTTTTC +TCTGATGTGTCCTTTGCCTGTAATGTGTAGTGTGGTGCATCTTTGTGAAATTTCTTAGGCTTTAGAGAAT +GAAGCCTTTCAAACAACTTCGGATGTTTTCCCAATTCTCTTCAGAGATAGTTAACTGAACTGTGGTCATA +TTAATGCCAAAATGTGTTTTATATTAAATAGCCATTGAAAACACAATGTTTTGTTTTCTATTTTAAAACT +ATTATTATTTTAATCATTAAAAAGTTTGATGATAACATGTAAATGTGTTGTCAAAATATAGCAGTTTGTG 
+AAGTTGAATTTAACTCAGAGTACTTTCTGTGTTTTGGAAAGAACCTTATTAGAAAAAGTGCCCGCTGTGC +TGGAAGAAAGGGAGAATGAATTAATAGCTGTTAGTTTGTCACATTTCTATTTCAGTAGAGATTCAAGGAG +ATGGTTTACGTTATTTTAGAGATTCTAAGGAAGGTATATAACATTTGGTTCTGCATTTTTCTTCAGTATG +ACAAAAGAGTAAGCAAATATTTGCCAAAGAAGAAACAAATAAACCTGTGAGATAAACCTGCATTTGTAAT +CTGTATGAAGTAAAAGTTACAGTTTACTGTTATGAACATTGTACTGTTTTAAAAAACCCACATCTTCCAT +TTTAGACATACCATTCCCATTCAATCCATATGTCCAGTCATCTTCAAAATTTCTTTTGACTGTTTACATA +TGAATTCACCTGATTACATTTTTCCATTCACCAAGACCATAAATTGACATTGCTCATCTTGTCAGTCTTT +TTTTTTTTTAATTTTTTTCTTTTTATTTATAAAAAGGCCATTCCATGTATCTCACCCAAATTTGTTGACT +AAAGTCACCCCACATTTTTTTTTGGGGGGGATTACTTTTGAAGTATTATCTTTGTATCCCAAAGCAGGCC +CTTCTGAAACTGATGATGTTGATGAAAAACAGCCCCTATCGAAGCCTTTGCAAGGTATAATCTTTTCACT +TCCATTCTCCCACATACTGCTTGTGTTTAATGTTTCCTTATATTGCACAGACTGTGTTTTAATATTTATA +AGTACAGTATTACTTTTCTAAATGGAAAAGGAAAAAAATGTATTGGCCTTTTTTTAACATATAAATGAAC +TTCACAAGCAACAATTTTTCAAAAATCAATTTTCCCTTTTGTGAATCTAGTAATGAATCTCGAAATGGGT +ACATGACACACTTTCCTATTGAAGGTAAAGTACACATTTATACCTTATCTCAGAACAGTCAGAAGCCAGG +TTACTGTTTTATTGGTTAGTGATGCAAATATTGATAAGGTCAGGTTGATACTTAGTATTTAGAAGAATTC +TGAAATAGCACTTGTTAATCTGTAAGTTTCGAATTGATTGCCTTTTTGGCAATTTTTTACAGTGTTCCCA +AACTTGATACTCATAGGCTATTAGTAATGGCATTTACTTCTATCTATTCTGGCTTTGAGTGCATAAGATG +TATTTTCTTTAGGCTTAGCATATAATGTAAATGTAATTGTTTATTTCAGATTTTAAAAGACATACAGAAT +TATTGCTTACACGAACAATCATCTTAACTGTTATTCTTTGCTGAAATTTTACTTTTTACCTACTGGCCTC +ATCATTTCACTCTCATTCCTTTGACCTCACATCTCTTTTTTCTTTGACTTTTACAGTCAAATAACTTAAT +GCTCCATAAATTCAAATATTAAGATTAAAAAAAGAGAAAGGAAGTGTGTAGAGAAAGCATATTTTAAAAA +TGAATTAAAGAGTGACTCATAGGTATAGGTTTGTAGATTTACCTAGAGTTTGAGTTCAGGTGAGTTTTCA +CAATATAAAGTAAGTGGAAAAGAATCCCCAGATAATTGTGACATGCTAAAGCTAGATACTGAGAACCAAA +AAGATGTGACCTCAGTATGGGACTTTTAATTCAAGGATATTAGCTACCAAATGACAGTTTTTCAGACCGT +AAGCATTGCTTTTTAAAAATCAAGATAAAATACATTTATAACATATCTTACTTTTAGTATAGCTTCTGAA +TTTAGTTCTCAATAGATAAGACAAAAGAGATGAACATCTGCTTCCTTTCCCCTCAAGCGATGTATTCTTA +CTAATAGGTAATATGTTTTTTTTTGGTACTTAAAGGCAGTGTTTTTCAATCTTTTCCATGTTAGTAGCTC 
+AGATCTTTAAGGACAGAGGATACAAATTAGGGGGCAGCTTGCGTTCTTAATCCTAAGTGGCTCTCTGATG +TCTTAGATTCTTCCTTTCCTTTTTGTGAGTAAAACTTCTGCTCTAGTAGGATGAGCCTGTTCACCTTGTG +ACTGCCTAACTGATCATTGCTTTATGTTCCAGTTTTTAGAAGCAGGCAGATTATATTACCTGACTTAAAA +TGCTTTTGAGACTAAAGGTAGTATCTTAAATTTTAACTTACGAGTTTCCTTTAAGTACAACTTTGATGAA +ATTATGGGGATTGAGTTTCCATGGCTTTCACCATAGTGAAAATGGAAAGCATGAAACATTCCACAGCAAA +CCCTGAACTTATTTATAGGGAGTTAATTAAGTGTCTGATCACAGCAAGCATTTACTCTGATCTGTTTGTG +TATGTGTGTCTTTGACAGATGAGCATGGTACTTGAAGCATAATTCTGCTTCCTCAGATTCCAATATCTAG +TAATTTTATCATGTAGTTATAGTAACTAATATTAAATCCCCAAACCTACTTAGGTCTACTCTGTATTTGT +GTTTTCAAATAATTTTTTTAGCTTCCAAATAGAAGCGTTGTATTCTTTTCTGAATTAGAAAACATTGTTC +GACTTAACTGCTAAAGAGTTAGACAATCATTTTGCATATGTTTATTATGTCTTAGATTGTTTGCATTGTT +ACCTAGATTCTCTTTCATCTTTCAATGTCAGTTTTTATTGGTTTATTATATGTTGTTTATTTTCTTTTTA +GGAAGCCACCATTCTTCAGAGGGCAATGAAGGCACAGAAATGGAAGTACCAGCAGAAGGTTAGGAGAAAA +AGAGATTGCATATTTTCCACTATTTGTTGTATGTTTATTCTATTTTATAGATTCTATTATCCTTAAAACA +CCTTATTTGGAAATAAAAGGTGATTCCCTTGGTATGCCTTCCTAGCCAACAGGTTATTATTTTTTTAAAA +ATTCTAAGATGTAATATACTTCTTTGACCATACTGATTAATAAAGCCTATGGGAATGAAACTAAGTGGTA +AGTAAAGCTCTTACATGATGCAAAATGCTACTTTTTTTTTTGTCCTGGTTTTCTTGTGCCATATTAAATA +TCTTTTAAGGAAGAATAGCTTCAGTGTTTCAGAATTAAAATTACTAGCTCCATTCATATTTTATTTTCAA +ATGTATTTTTTAATACTTCTAGTCTCTTAGTAGGGTAGGAATTTAGTCTACATAAAATTTATACAGGAGA +GAGTTTCTCCCTTTATTTTGTACTCACAACTACTCCATAAATGACTTTGTCAGTTTACCAGCTCTACAGC +TTTGGTTTTAGAATTGAACGCCTGACTGAGGTACAGTTCTTTTTGTTTAGAATCCTGAGTTCAACTTATC +ATGTAGGGTGGCTTCTTATTTCCAAGGTGTTTAAACGGTTCCACACAAATTTTTGTAGATTTCTCCCAAG +GAAAATGACTGGTAAGGTAACTGAATCTTGTTTTAGAATTATTTCTCTCATAAATTGGTCATGTTATCCC +TTATTTTTCTAGAATGAAAAGAGGAGTCTGTTGTGTAACTGCTTATATTCCATTGTATTTACATTTTTAT +TTTAATTAAAAAAAAACATTTGTTTGATCTTTTACCTTCTTTGAGGGAAAGGCAATCTTAAAAGTTCCCC +AGTATAGGCTGGTTTCTCCAAATTCAATTATAGAATTAGGGTTTCTCCCTGGAATACATTTCTCTCAAAA +ATACTGATATGCTCATTGTAAACCGTTGGTTACATTTCCCTGTCAGCCCTTTTTATAGAAGATTGTGTGG +GTGAATGTTGTATGCTTTACTGTGTGGAGGTGGGTATCTTCTGAGTGACAGAGTTTTCAGCTTTGAATAG 
+GCGGGATTTAGAACAACAAGACCTAGGTAATAGATCATGTGCAGTACGAGGGCTCATTGGGCAGTACCGA +TGGCTCACCGCCTTGCAGCCCCGCCGGCAGGTACTGCACCTTGAAGACTGCGTTTTTAGGACATTTTTAC +TGTCATATTCCAGTGTTAACAAAGTTGAATCGAATCTGGGTTCTGGCCTTTATTCCTTTACTAACAAGTA +AACATAATCTTTGGCAAATCAATTAATCTCTCAACCTTTGCACAATGAAAGAGAACTAATGAAAATACTC +TATAAGCTTCTAGCACTAACACTCTGATGTACATTTGTCTTGTGTAGGAACAAATATGAATGAACTTTGT +AATTTTTGAAATATGTTGCATACAAAGCCAGAGGATATAGTCTTGTGTGCACTTGTGACACTCAGTCTCT +TGATAGGAAAGTCTCCATTGTAAAGCCTGGGTTGTGGTTTTATTGAACAGTAGAAACCACTCCTCACATT +TCTGAAAGGAGAGAAGAAATTCTAACCAAAGTAGAAAGCATTTGTTTCTATCCGAGCATCTTCTAGCTGT +GGCAGGAAATTCATTGTGACTTTGTCATATCCTCATTCCTCCAAGACTATGGGATTTTTTTGAGTCAATT +TCTGCAGGTAACATGTTATCATCTAAATTTTTCCAGAGTTTTTCTAATTTTTGTTTTCTTTTATCCCTAT +TTATAATGACAAGGGTCAAATTTTAATTTCATTAATATGAAATGAATGAAAATGCCAGTGGATTTTTATA +GTAGGCTGCTTTTGGAGTATCAAATCAGTAAAATGTTGGAACATATGGAAACATGCTTAATAATGAATGT +CATTTTAGAATTTTATTTTTTTCAAGATTCTACTCAACATGTCCCTTCAGAAACAAGTGAGGACCCTGAA +GTTGAGGTGACTATTGAAGGTTAGTTATCTAAAAGCCTTGATTCCAAAGTTTACTTCTGGTCATAAAAAT +ACTTGTCACATTCACATTGCTAAATATGCATTTCATTCATTTTTAAAGGAGACTTGTGGGACGAATACAT +GTGTCTGTGTGGCCAAAGCAGCCCACTCCCCAGTGCTTTGGGAGTTTTAGATGACAGAATGAGGCCGTGA +GAATTAGCATTGGCCCACGGGTGGCCCAAGCTATCCTTGCTATCAGTGGGAGTTGAAAAGAGAAAAAGAA +AATTATGACCAGACTTGATACACACAAAAACAGATTCTATAATCTATTGGGTTGTGATAGAAGAAGACTT +ATTTGGAGCTGGAATCCCTAATTTCTAATTAAGTAAAGTGGAGGAGAAAGTGTTGATAGTTTAGTGAGTA +ATTTTTAAAGCAAGTCCCTGCCCCCTTCAACGACCCGCAAATACTACTTAGAGGCTGACATACAATTTAT +CTTTCAAAACTGTTTTGAAGCAATAGCTACAAATATATTCTACAATACTCTCTATAGCTAAGATTGCTTC +AAACCTGTTTTTTTCTCCAAAGATAAGCTTGAACTCAGCTGAGCCAGTTTATCCTGAATTATTTGAAAAT +TCAGCTAATAGAATTATTAACTATCACTTCATATTAATAGTTATTAAATGAGCATTGCGTTTTTCCTTTT +AAATTAATGTCTCACAGCTGGGCACGGTGGCTCACACCTGTAATCCCAGCACTTTGGGAGGCCAAGGTAG +GCAGATCACCTGAGGTCGGGAGCTCAAGACCAGCCTGACCAGCATGGTGAAACGCCCATCTCTACTAAAA +ATACAAAACTAGCTGGGTGTGGTGGTGCATGCCTGTAATCCCAGCCACTTGGGAGGCTGAGGCAGGAGAA +TCGCTTGAACCCTGGAGGCAGAGGTTGTAGTGTGCTGAGTTCACGCCATTGCACTACAGCCTGGGCGACA 
+GAGTGAAACTCCATCTCAAAAAAAAAATTAAAATTAAAATTAATGTGTCACAATACTTGTGTAAGTTTCT +TCCCCTGAAAGTACTTCTTTCTTGTGGTAAAATGTGAAAATGGTACAGTAAATTGTACAGTGGAAAGTTT +TCTTCACCCTCACTCTGCTCCCAAGTCCCAATACTGAGGGATGTATGCCAGCTTATTGTGTCTACTTAGA +GAAATAATTGCAGTACATATCTTGATGCGAAAATACACAGATGCTTTCAGCATACATTACAATGCACATT +TTAAAATATTACATATTCCATAGGCCTCCCCCTCCCATAACTGCACATCTTGGTACTTTATTAGTTTTTT +TTTCTAGCTGCTGCGTAACCTTTCTTGGTTTGCTGTCTGTCTCTCTTTCTCTCTTAATAACCTGTCACCT +ACTGATGTCACTTGGTTATTTCTAGTGTTTGACTACTCCAAACTACTCAACGTGTCTGTCTTCATTGCTT +CTAAAATATAAGTCAATTAGTAGCTTCCACTTAATACAAGAAATCTTTTCATACTTCATGTGTTAACTTA +ATATACAGATTTCTTTTTTCATAAATATAATTTTTCTAATCTAAAACATTTTGCATTGTTTACAGTCCAG +CATTAAATGGTTGAAATTTAATTCGTTTTTAGTGAGAAACCTCATTAACAGGAAAAAAACTTAAGAACAT +ACAAATATTACAACTACTCAGCTGTGCCCTTTCTATGGAGATATTAATAGGAGGTTATTAAAATTATAAA +AATAAAGCTTTTAGGGTAGGACTAATAACCATTTTGATTTGTCAAATCTTAAAGCAGATAGAAGCTGTAT +TTTTTTTTTTTTTTTTTTGAGACAGAGTCTGGCTCTGTCGCCCAGGCTGGAGTGCAGTGGTGCGATCTCG +GCTCACTGCAAGCTCTGCCTTCCTGGTTCACGCCATTCTCCTGTAGAGGCTGTATTAAAATGGTTTAGAA +TGTAAATACTTGCTTTTATAGTATAAGACTTTTAAAGAAAGGTTACGAAAATCCAGAAATAAATGGCTCA +TGAACCTGAAGCTTATGTCAGCCTTTTGAACTTTAACTGTTTTGGATGGAGTAATAAATCTCTTTTCTTT +TCATTTATTTGTTTATTTATTTATTTATTTGTTTTTGAGATGGAGTTTCACTCTTGTTGCCTAGGCTGGA +GTGCATTGGCGCTATCTCGGCTCACCGCAACCTCCGCCTCGCAGGTTCAAGCGATTCTCCTGCCTCAGCC +TCCTGAGTAGCTGGGATTACAGGCATGCGCCGCCATGCTCGGCTAATTTTGTATTTTTAATAGAGACAGG +GTTTCTCCATCTTGGTCAGGCTGGTCTCGAACTCCTGACCTCAGGTGATCTGCCCACCTCGGCTTCCCAA +AGTGCTGGGATTACAGGCATGAGCCACCGCACCTGGCCATTCATTTCTTTTAAACTAAGACATTGTGCCA +TAAGCCTGATTTAATGTTCACATTTGTGATTAAGTAGTAGAATTTTGCTAGTAGACCTCTTTCTTTTCAG +AAGACCAGTGAAATACCAGGTCTGAGTAAAGTAATCCTTTTAAAAGGTAGCATTTTCCATGTCATATGTG +CATATACACACACACATTTTATGTATTATATATAAAAAATATGTATATTATATAGAAATTACATCAACAA +TTAAATTCCTCATTTTTTTAAGAAGATAAAAAACTGTCTCAATATTAACAAGGAAGGTCAAGTGAGAACT +TACATGCACACCTCTGGTTAATAATGTGTATCATATATTCTTTCTGGAAAGATTACTTCCAAATGGTAGG +AGCTATTTAGATACCCGTAGTTAGGAAATAAAATTTTGGTATATCTTCATAACAGAAAAATATATAGCCA 
+TTGAAATCCTGATAATGTTTTCCAATAATCTGAAAAAAGATCTCTAATATAAAATGTTCAGTGGAAAAAA +AAAAAATCCTGCCTTGAGAACAAGGCAACTTGAAAAGCAAAAGTGCATCTAGAGGATAATCTTAGTTTTG +TTTTGGAAAATACTCATGCTTGGTAGGAAGACGGACAAAAATAAACTGAGATATTCCTGCTTGCTTGGAG +ACGACTTAATTTTCTCCTTTTATATCTTTTCACATTTCCCAAATCATTGGTAGAAAAAAAAACAGAACTT +TGTTATCAAATAAGTCTGTCCGATTTTTCACAGTGTTTTCACCTCGATATCTTCATTTGCAATTATGTGG +AGTTCTGAAGTGGCTAATTTGACACCGACGCGGACGTGTTTGTGGATACAGGGTCATTACTTACCCACTG +CCGGTGCGCTCTGAGCTGTGCACTTCCATTTGTGGCGTTTTATTTGTACTTGGAGTCGGTGGTAAACATT +TAAAACATACGTGACTTAGGTTTGGGAAATCGGCCTTGGACTAATAGGACCGCATGCCTCAGTTTTCGTA +AACACCGTGACTACATTTCTGGGTTAAGGTGAATTGCCGTCAGGTATGTTATCTCTAATGTGTGCAGAGA +AAACCTTATCAGACTTGTTTCAGGTCTAGCTGTCATGGGAGTGCTAAGAACCGTGTAAAGGAGGTTTCTT +TCTTGCGCAGTTCTCTCTTAGGGCCTAGCTTCCCAAACCTGAGTCACTTACCTGTTACCTGTGTTCCTTC +TGCTGTGGCCTAACTCTCTGTTCATTACTTAACCTGATTTTTTTCCCTTAAATTGATTTTAAGTAAATTC +ATTTTGTGATGAATTTTACGTCTTCCTAAGTCAAACTCTGGCTTTTTTATTTCTCATATTTTGAAAGGTA +TATGTAAATAATTTCAAATATAATTGACTCACATGACCACACGTGGCACACTTACTGCAGCTGGTACTGT +AGGTTTTTTAGGATCTCTGTAATTGAGGAGGTTAGCGTTGGAAAACTTCTGTGTTTTTTATTTTTATAGC +TGCCAATCAGACACTTAATGAATACACAATTATATATGTAAAAAAAGGAGTCTCAGAGGAATTATGTACT +AAGATTTGAATAATACGTTTTTAGAGTTTCAGTATTTTAGTAGGGTTTATGATGGGCAGTTGGACTACCT +ACCATCCCTTTCTTGTGGAATTTATGTTTAAGATAGTGCCAGATACTTGACTGCAGCATATAATTAGGGA +CTAAACTATTCATATGTTCATTTAATCCCTCCACAAATACTGAGCACATTGTATGTACCAGGTGCTAAGA +GTTTTTTTTTTTTTTTTTTGAGACGGAGTTTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCAATCTC +GGCTCACTGCATCCCCTGCCTCCCAGGTTTAAGCAGTTCTCTGCCTCAGCCTTCCTCCGAGTAGCTGGGA +TTATAGGCGCCCGCCACCATGCCTGGCTAATTTTTTTTGTATTTTTAGTAGAGACGGGGTTTCACCATTT +TGGCCAGGGTGATCTTGAACTCCTGACCTCGTGATCCGCCCACCTCACCCTCCCAAAGAGCTGGGATTAC +AGGCATAAGCCACCGCTCCCGGCCTCTTTTTTTTTTTTTTTTTTTTAAGACGGAGTCTTGCTCTGTTGCC +CAGGCTGGAGTACAGTGACCTGATCTCGGTTCTCCGCAGCCTCCGCCTCCTGGGTTCAAGTGATTCTCCT +GCCTCAGCCTCTGGAGTAGCTGGAACTACAGGCACACGCCACCATGTCCAGCCAATATTTTACTTTTAGT +ACAGACAGGGGTTTCACCATGTTGGCCAGGCTGGTCTTGAACTCCTTACCTCAGGTGATCCACCCGCCTC 
+GGCCTCCCAAAGTGCTAGGATTACAGGCATGAACCACCGCGCCTGGCCACGGTGCTAAGAATTTAGTAGT +GAATAATTCAGTGGCAAATCCTCGGGAGTGTTTTGCAAATAAGAGTGACACTTTTTGTCCTTCACTTCTA +GGAAACGTGGTGACTATGTTGAGAAAAGGGGGAAAAAAAAAAAAAGAGAAGACCTAGAGCAGTCAATAGT +TTTTTGACAGAAAACTAGCTTGGGAGCCTCTTGGAACTACCTAGGGCACAGAGAAGAGTGCTTTAGAGCA +GGTTGGTGGCAGGGGAGATAAGAAGTTGTTGATTCTGGGGAGATTTTGAAGACAAAGCCACCAGGCTTTG +CTGTTTATTGTTTAATTCTTGTATTTTCTTAAATGTCTCTGCAGCACTTTTAAAATGGACATAATTGCTT +AGGTTAGCACTTGAACACTTTGTTTTAAATTTTTTTTATTGCTGCCAAATAGTCATTTAATAAATACCCA +ATTGTTTATGTAAAGAGTTTTAGAGGAGCTATAAGTGGTATCCATGGGTGAAAGAATGAATCCATTTATT +TGAAATGTATAGGGTAATTAATGGCTCCAGTTTTGAAATAGATATATTACCTGGTGTTGATTGTTTTTTT +AACTTTGAATAGATGTCAGATTGTATGATAACACATTCCCAGTCTGATTATGTTGAAGCTAAGAGTATCA +GAACATATTGTAACATAGTCTATTTATTGATTCACCGGCCAGAATGCAGTGATAAACAGCTTTAGCTCTG +AAGCCTGTATTATCTTTAGGTGTATTCGTTAATATGTTGCAAAAAGGAAGTTGGTGTTTTCTAAACACAA +ATTTTGAGTTAGGGACGGTGTTGCCAACTTGAAGGTATTGGCGGAGTGCTTTTGTTAGTGTGGGTAGCAT +AACTATTGTAAATTCATGAATAGCACCTTTTGCATATTAAAATATCTTAAAATGGGCCAGGTGTGGTGGT +GTAATCCCAGCACTTTGGAAGACCGAGGTGGGTGGATCACCTGAGGTCAGGAGTTTGAGACCAGCCTGAC +CAACATGGTGAAATTCCATCTCTACTAAAAATATAAAAATTAGCTGGGCGTGGTGGTGGGCGCCTGTAAT +CCCAGCTACTCTGGAGGCTGAGGCAGGAGAATTGCTTGAACCCAGGAAGTGGAGATTGCAGTGAGCCAAG +ATCGTGCCACTGCCCACTGCACTCCCAGCCTGGGTGACAGAGCAAGACTCCATCTCAAAAAATAAAATAA +AATATCTTAAAATGTATTTTAAAAGCTTTACATTTTGATTATGGAAACAGAGTTTGGGCTTGAATTTGAA +GCATGCTGAAACGTGTGTCTTGTTAAACAAAAATGGTTGTTTATACTTTGGAAACGGCAGATAATTGGAA +GAGCAATGCCCAGTATGTGAGGGCAGGCCCTAGAGCTTTAGTTGATGTTTTATGCTTCAGAGTAAAAAGT +AATTTTGATCCTTTGTGGAAAGGATTTTAGGTCGAGAAACCCATTTTTTTTCTTACTTTCTCAAAACCAA +TTAATTGCAACAAAGAGTTAATTGCAGAAAAAATTGAGGATGTGGTGAAGTTTATAGCTGTTCTGATGCC +TGAATGCCATGTTCCATATGACTTAAAACACCCAAAGATTAATTAATAAGTTTATGAAACAGGGTTTTTA +AAATAGAATATATGGTGTGATATTTTATCAATAACTACATCCAGCTGGGTGCGGTGGCTCACACCTATAA +TCCCAGCACTTTAGGAGGCCAAGGCAGGAGGATCACTTGAGGCCAGGAGTTCAAGACCAACCTGGGCAAC +ATAGGGGGACCCCTATCTCTACAAAAGTTAAAAAAAAAAAAATTAGCTGGGCGTGGTGGTGCAGGCCTGT 
+AGTCCCAGCTATTTGGGAGGCTGAGGTGGGAGGATCACCTGAGCCTGGGAAGTCGAGGCTGCAGTGAGTG +GTGCTCCAGCCTGGGTGAAGGAGGGAGACCCTGTCTTAAAAAATAAAGTAACTACATTCATCTCTATATA +TGTTCACATTTAACAGATTGAACTTTATTATTTGTCAGTGAAGAGGAAAATGTGATATCCCTGATATTTT +CAATATTTTACAGTCTTTGTTGTATTAAAAAAGCACTCTAGTAGATTTGCCTTTTTCTCTTTCTTTTTTT +TTTTTTGGTTTATGATCGATTGGGTTACCTTCATAAAATATATTTATGCTATTAAATCTATTGTGGTCTA +AAATGTTTGTGTGACTGAAAAAGTGAAGTCTTTTCAGTAAGGAAAACCTGTCTTCCCTTCCCCTGGGCCT +CAGCTGTGAAGTTTGGATTTGGACTGAGATCCATGGGATGGAGCTCCACAGGTGTCCCTTCCATTCCATC +TACAGCTGTGTTTCCTTTCTCATGTCATTCTACTCTAGCAGCTTTCAGTTTATGTCACTCAATGAATAGT +CAGCTTAAATGATGATCTATAAGGATACTTAGGAGACCTTAACCTATAGGGGAAATACTTTTATTTTAGA +AGTTACTGCTTAATGTTTGTAAAAAAATATATAGTAATATTAAGCATTTATAATGCTTTGACAGTATTCA +TAGGTGAAATGAGTGTATTTTGTTTTAACCTTTGGAAGCCAGCATAAAAATACTCTTAAGGTTTCTAAAA +TCTGTTTGGGAGTTGGAAAATCGGGTTTTTTAAAAAGTATATTTTCAGAATTGAGGTCCAAACTTACACA +CTTCTGTTTTCCAGATTGTTTCTACTTAGGTTGGAACCTTAATCTATTTATAGGGTGTCTTGACCATTTT +TAATCCTTACAGTCATACAAACCCAGGTGCCAGTCAAGTTTTATTTCACAGGGAGGTTGTCATTTTAAAA +TTGTTTTCTGTCCTGGTCCCGTGGCTCATGCCTATAATCTCCGCACTTTGGGAGGCTGAGGCAGAAGGAT +CACTTGAGGCCAGGAGTTTCAGACCAGCCTGGGCAACATAATGAGACCCTGTCCCTACAAAAAATTAAAA +GAAAAATAGCCAGGTGTGTTGGTCCCAGCTATTTGGGAGGCTGAAGCAGGAGAATCATTTGAGGCCAGGA +GCTCGAGGTTGCAGTGAGCTATAATCATGCCACTGCCCTCCAGCCTGGGCGATAGAATGAGACCCTCCCG +ACAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAATGAGACCCTGCCAATA +GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAAAGAATGAGACCCTGCCGATAGATAGAT +AGTTTTCTATCAAATTTTTTCCTAGATTTGAACATGTTTTTCTAAAGTAGCATTCAACACATCAGCATTT +TATAGCGTCATTAGTTGTTACTATGATTATGTTTTTCTGAAATAGTATCCTTTACAAAAGCACTTCTGTC +TTTCTAAGCACATACATAGGTCCTAAAATGAATTTGTCTGATGTTGGTGACCTCGGAACCATTTTCTCCA +GTTGATTAGCACGGCCAGCCAGTCAATAATTTCAGGTCACTGTTGGCCTTAGAGGAAGAGCCCAAAGGCA +GCAAGCAAGGGTGCTGGTGTCCAGTCGCCTTCTAGAAGCATTTTCACCTTCCCTTAAGATTTCCCTTGAT +TAACACAGAAGTGTCTATAGAAGTGACCCAGTGCTGCCCCGGGCAACATCGTATATTAGGCCAAATTTGC +ATTTCTTACCTTTATGAGAAGCACCCTCGTAGTCTAGTGGAGTTACACACACACAGTCTGATCTCAGCTG 
+TGCTCTCCAGAGATACAACATAGTCCAATCAAATAACATCCTCTGAGCCTGTTTCTTTAGCTGTAAAATA +AGAATAACAATTATACCCATCTAAAAAGATAGTGTCTTGTACTTGAGTGATCTTTTTTCTATGATACTGT +CTTTAACACAATAATTATTTTCTTCACCATACCACATTTGTTTTTAAATAAGAATTTCTTAAAATGATGT +TTTCAGTATTTTATAGATGATGTTTGAGCAGCAATATTAATAGCATGTTACCTGACACTTTGAGGGACTA +GAAGATGACAGTGGCAGCTGATATGACCCAGATGTCTCTAATCCTAACATGAGGTGTACTGAAGGGTCAC +AGCAGAGTGAGTTCCTGGTGTATCCACAAGGAAATGAGCACGGAACTCTCACAACAGCGTGTGTGCTGTG +CGCTGTGCTTGAATAGCTGACTGCTCCTTTATACACAGCCTCTGTGCTGACTGAATCAACAGTATCTGTT +TCATCATAATGCAGCCCTATTTCTTTAAGCCCATACATTTTGCACTTGTTAAAAGTATTTGAACAGGCCA +GGCGCGGTGGCTCACACCTGTAATCCCAGCACTCTGGGTGGCCGAGGTGGGTGGATCACTTGAGGCCAGG +AGTTTGAGACCAGCCTGGCCAACATGGCGAAACCCCATCTCTATCAAAAAAATACAAAAATTAGCCGGGC +GCGGTGGCGGGTGCCTGTAATCCCAGCTACTTCGGAGGCTGAGGCATGAGAATCGCTTGAGCCCAGGAGG +TGGAGGTTGCAGTGAGCCAAGATTGCATCACTGAACTCCAGCCTGGGCAACAGAACAATTCTCTGTCTCA +AAAACAAAAACAAAAACAAAAACAAAAAAAAACTATTTACCCAATTTCCCTTGCTTATGCATGCTTAAAT +TTTGTATAAGCTTAAATATCTTTTCCATCTAAGCTGTACTCTATCCCCTTTTATAACTTTAGGTGGCTGT +CTTTATTAAAAGTTTTTTCTTGAAAGTCTTAAAACAATATAGTTCTTGGCAATTTGAAAGTTATTTGAGA +AGGGGAAATTTATAATGACAATTCAAATGAAGCAAACTAAAAAATAATGAAGAAAGACAGAGGAAAAAGC +AGTATTCACTTGAACACATCCCAAACAAAGAAAATTTCAAATGTAACAAGGAAAAAGCCTGCTAAAGCTC +ACAATACAGAAATAACTATTGATACTCAAAATAGCTTTAAAGCCCTGCTCACCTTTTGAATGTTGGGAAT +TGACCAGGAGGTGACTGTAACTGTAAGATGGTTCTTCCAGTAATGACCATTTTCTTTTTCAAGATGATGA +TTATTCTCCACCGTCTAAGAGACCAAAGGCCAATGAGCTACCGCAGCCACCAGTCCCGGAACCCGCCAAT +GCTGGGAAGCGGAAAGTGAGGGAGTTCAACTTCGGTAAGTTCTCAGCGAACGACGTGACCTTTTCCTTCA +TCTTCTGGATTCTCAGTGTGACTGATAAATGTTGCAACATGCTCTGCAGGGGGAAAATGCTTTAGCGTCT +ATTACTGCATCATAATCTCATCTTTGGAAAGCCAGGAGCATTTTGAAAATTACATTACAGACATTGTTTA +AACATAGTTTGGATTTACCAAAGCATAGGACATTGTCTTGTCTGATGTTAATTAGTCAGCTCAAGATTAG +TGCTAAAGACTTAGTAATTTAGTTATTTCTCTTAGCTTTAAAATCTTTATTTCAGAACTATTTCACCTCT +TGGTTTTCTTTTTTTTTTTTTTTGCTCTGTGTTACTGCAGCTGCTAATCTGTGAGCTCTCAATGGATGAT +GTATCCTAGCAAGGGACTGAATGAGATATTAGCGGCAAATTATGTTGATGATTCATATTTTGAATAAATG 
+GAATATTAAGCTTGTATACATCTTGAAAATAGTACTTTAATATTCTACTGTGTCGTAGTCACAATGATTG +GATATGAATTGAATTTTTGTACTTTTTAAATATGTTTTTGTTCTTTATTTTTAATTTTTATTATTATTTT +TTTGAGACAGAGTCTTGCTCTGTCACCCAGGTTGGAGTGCAGAGGCACGATCTCAGCTCACAGCAACCTC +CGCCTCCCGGGTTCAAGTGATTCTTCTGCCTCAGCCTCCTGAGTAGCTGGGATTACAGGCGCCCACCACC +ATGCCCAGCTAATTGTTGTATTTTTAGTAGAGATGGGATTTCACCATGTTGGCCAGGCTGGAATCGAACT +CCTGACCTCAGGTGATCCGCATGCCTCAGCCTCCCAAAGTGCTGGGATTACAGGCATGAGCCACTGCACT +TGGCCTGTTCTTTAGCATTTTAAGTTGCAACTATATATCCGTAAAGTCTTATTTCCACACAACTGAGACA +TGTTTTAGGAAGTTTGCTAAAAGACCCCTGGAGACCTTTATCGTGGTCTCCCTTTTGTCGTGTTTCATTT +GCTTGATCTTTTCTGCCCTCCTGCTTTTCAGAAATTAAAAGGCTAAAAAGAGGTGCTAAATGTTAAAACT +TCTCGTGTAGTCTCCCATGAGACTATTAAAAGTAATGGCAAAAGCCACAGTTACTTTTGCACCAACCTAC +TAATTCTAAGAACCACCAAAAAGGGGAAAGTTCTTGGAAAGCAGTAAAATGATATGGACAGTTGGGATGT +AAAAATGTAGAAAATATGTCATTGTATGTTCAGTCATCCCAAGACCCTAGTGCTGCCCCGATGGTAGGGA +CTTCTTAATGAGAATTAACTTTTGCTCAATTTTCAGAGAAATGGAATGCTCGCATCACTGATCTACGTAA +ACAAGTTGAAGAATTGTTTGAAAGGAAATATGGTATGTCTAAATAGGAAAATTCCTGTAATACTTTGTTC +ATGAGCATTTACACAATGGCGTTACTGTTCATCATGGGGGTGATGTGGACAAGCCCAGCCCAGGGCTGCC +AGTGAACCGTGCCACACTTTCTTACACGTCTCTCATTGTAAGGTCCTTAGTAGTGTCTGTCTAAATATTA +GAAACAGTCTTTGTTTCTAGATTACAGTAAAGCTAAGGAAAAGTTGTATTTCTGTAGTTATCAGCTAACA +TTTCTTTTAAACTTCCAGCATGGATATTTGGGAATTTATTTACATATTTATTACAAAGCTCTGGATCTTG +GGGGTTTCATTAAAAATTATTTTTTTAACTGACAGTTTCTGTTAATCTACTTTGTTAAATCCAGTATTTG +CTGAGATCCCCCTATTGTCTCTACTTTTATCTTTTTTTTTTTTTTTTTTTTTTTTTTTGAGACGGAGTCT +CTGCATTGCAGTAGTACAATCTCGGCTCATTGCAACCTCCGCCTCCCGGGTTCAAGCAATTCTCCTGGCT +CAGTCTCCCAAGCAGCTGGGACTACAGGTGCCCGCCACCATGCCCTGCTAATTTTTGTATTTTTAGTAGA +GACAGGGTTTCACCATGTTGGCCAGGCTTGTCTCAAACTCCTGACCTCAAGTGATCCTCCCGCCTTGGCC +TCCCAAAGTGCTAGCATTACAGGCGTGAGCCACCCTTCCCGGCCACTGCTTTTATCTTGATGTATAACTG +AGTTGATGCTAGATTTCAATTCCTTCTTTGTCCTTTTACTATTCTGTCCTATAGCCACCTTTATATAATG +ATCAAAGAAATTCGCAATTTGTTATTATCATTTTTATTTTTTATAAAATATTTAAAATGATTTAAAATAA +AAATCATTTTATTTTTATCATATTTATTATCATCTGTTATCATTTTTATTTGGATGACTATTTACTTTGC 
+CATTAACTAGCAAGCGGTAAAATTGTATGATATGCAGTTTTAACTGAATTGCTATAAGTGAAAATTTAAA +TGCAATAAACCATATTGATGGTATTTGTGTTAACAAACTTAAAATGAGCATTTTTTCTTCATCATGAGTA +ATATAACCTACCCCTCAATGAAAATCTACAATTAGAGTAAATTTGCTAATGAATTCAGTAACTTTTCCAT +ATTTTTAGTGGTTTACTTAAGGTTCTCTTAGTGTTTCTCCCAGTTTTTAATAGCTTACACCTTTTTTGCC +CGTGGTTTTTTGTTTTTTGTTTTGTTGTTGTTTTTTTTTTTTTTGAGATAGAGTTTTGCTCTTGTTGCCT +AGTCTGGAGTGGCACGATCTTGGCTCACTGTAACCTCTGCCTCCCAGGTTCAAGCAATTCTCCTGTCTGA +GCCTCCTGAGTAGTTAAGATTACAGGTGCCCGCCACCATGCCCAGCTAATTTTTGTATTTTTAGTAGAGA +CAAGGTTTCACCATGTTGGCCAGGCTGGTCTTGAACTCCTAACCTCAGGTGATCCACCCACCTCGGCCTC +CCAGAGTGCTGGGATTACAGGCGTGAGCCGCTGCGCCCGGCCTCATGTTTTCTATTGGTTCATTATGAAG +CAAAAACTTCATAGCATGTGCTACCTGGAAGCACTGTAACCTAGTGGTAAGATCATAGGCTCTGGGGACA +CAGTGCCTTGCCACGTCTCTTCTCCTGTCTGAGTCTTAGTATCCTCTTTTGTGGTCATGAGAACTGAAGA +TCTATCCTGGAGATTGATAAGATAGTAAAGTGCTTCACGTAATACCTGGCATACATGTAATAAATGCTTC +CTGTGTGTATATATATACACACATATACATATATATGTATATATACATATACACATATACATATATATAT +ATATATACATAAACACATATACATATATATATAACACAGTGAAACCCCCGTCTCTACTAAAAATACAAAA +AATTAGCTGGGCGTGGTGGCGGGCACCTGTAGTCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATGGCGT +GAGCCCGGGAGGCAGAGCTTGCAGTGAACCGAGATCACACCACTGCACTGCAGCCTGGGAGACACAGCAA +GACTCCATCTCAAAAAAATAAAAAAAAAGTTAAAGAGCACTACAGATTCAATGATTTATTATTCTTTTCT +ACAAATTGTGTTTAAATGATATCTCTTTCTCTTTTTGTCCTTATAGCTCAAGCCATAAAAGCCAAAGGTC +CGGTGACGATCCCGTACCCTCTTTTCCAGTCTCATGTTGAAGATCTTTATGTAGAAGGACTTCCTGAAGG +AATTCCTTTTAGAAGGCCATCTACTTACGGAATTCCTCGCCTGGAGAGGATATTACTTGCAAAGGAAAGG +ATTCGTTTTGTGATTAAGAAGTAAGACTCTTGGATTCCTGTTGAACTCTTGTCTCTTTTCTGAGTAATAC +GTCTTTTTTATTGTTGACCAATATTCATTCACCACTAGGTTCTATGTGATGAAGTTTGAGTTATTTTATG +TATTTTTATCTTGCACTTTTTAATTTATCTGGGTCCAGCATTGCATCAGTCATGCAGTGTTGGCATTCGA +AGCATGAACAGTGCCCGCACTGGATTGGCATGCAACTCACATTTTCTTTCACAATTTTCTGCTACTTTTG +CTAAAGAACATAGAATCCACGCCTTGTTTTTAGGCCTGATATATATATATATTTTTTTTTTCGAGATGGA +GTCTCACACTGGAACCCAGGCTTGAGTGCAGTGTCGCAATCTCAGCTCACTGCAACCTCTGCCTTCCAGG +TTCAAGCGATTCTCATGCCTCAGACTCCCGAGTAGCTGAGATTACAGGCGTGCGCCACCATGCCTGGCTA 
+ATTTTTGTATTTTTAGTAGAGACGGTGATTCACCATGTTGGCCAGGCTGGTCTTGAACTCCTGACCTCAG +ATGATCTGCCAACCTCGCCCTCATAGGCCTGAGATTTTTAAAGCATGCGTGGGAATATATGATTGTTTTT +ATAGATGTGCAGAGGAAGATAGTCTTGAATGCAATATGACATTAAAGGATCCCATTTAAGATTTTTGTAA +TATGCTTCAAAGACCTGTGGGTTGCAAAGTTACCTCTTTACTTGTGAGGATACATGCTCCATGAAGCACC +TTATGAGACAACTTGCAATTATTAGTTTGCTTTTTACTCTGTAGAAACCTCAAATTAAGATTTAGTTGTG +GGCTGGGTGTAGTGGCTCACACCTATAATCCCAGCACTTTGGGAGGCCAAGGCGGCTGGATCACCCGAGG +TCAGGAGTTCGAGACAAGCCAGGCCAACAATGGTGAAACACTGTCTCTACTAAAAATACAAAAATTGACC +AGGTGTGGTGGTGGGCGCCTGGAATCCCAGCTACTTGGGAGGCTGAGGCAGGAGGATTGCTTGAACCCGG +GAGGTGGAGGTTGCAGTGAGTCGAGACTGCGCCATTGCACTCCAGCCTGGGAACAAGAGCAAAACTCCGT +CTCAGGAAAAAAAAAAAAAAGATTTAGTTGTGCTCAAGCATCCAGATTATCTTTTCTTTTCAAAACCAGC +CTTACTGACTAAATGTTAAATATGTACTAGTCGTTATTAGTTTGCTGAATATTACCTAGTGATTATTGAG +TATTTATTCTCACCTTTCAGACATGAGCTTCTGAATTCAACACGTGAAGATTTACAGCTTGATAAGCCAG +CTTCAGGAGGTAGGTCTTCAATCTCGAGGCAGATCAGAAGATTATGTGCAATAATTATTTCACGCTTAAC +ATTGATTTCTTCTTTATGTTACCTTCCACATGAAATAATATGTCTCTAACTATTAATTATGTGCCATTAC +AGGAGAATTCATGTTGTCAAAATTCTAATAATTTCTAGAAGAATAAACGCATCTTCTTTTTATTAACCCA +TTGTAAATACTTATAAATATTGCATTTATGGGTAGACAGAAGTAAAAGAACAATATTTGTTCTACTTTTG +ATGCAAGATTTATCTGGCATAATGCATTGAACAGTTTATTATTGAAGTCTACACGAGTCAATGGAACAAG +CATTCATTGAATGTCCATGATATGCAGGACATAAGAAGGTTTCCTTTTAGAGCATGGAGCCATTTATATC +ATCTCTTAATTGTTAGATGTATTTTGTTTTGTTTTGTTTTGTTTTTTGAGAGGGAATCTTGCTCTGTTGC +CCAGGCTGGAGTGCAGTGGTGCGATCTCCGCTCACTGCAACCTCCCGCCTCCTGGGTTCAAGCAGTTCTC +CTGCCTCAGCCTCCCTGTAGCTGGGATTACAGGTGCCTGCCACCATGCCCAGCTAATTTTTGTATTTTTC +GTAGAGACAGGGTTTCACTATGTTGGCCAGGCTGATCTCAAACTGCTGACCTCAGGTGATCTGCCCACCT +CGGCCTCCCAGAGTGCTAGGATTACAGGCATGAGCCACCGCGCCCAGCCTGTTAGATGTGTTTTAAAATA +AATAGAAATATAATTGATTTTCTTGCTTGCTTTTGCTCTGGAGTGGAGTGGAGTGAGGATAAAACAGAAT +GGAATCACCCTGTTGATATTTACTAAGATAGAAGGACTGCAGCAAGATCATAGCCCTAATCTTCCTGTAG +CAAGTGTTACCTGCTAGCTGTTCCTGAATACACTGAGTTTTGCTTTTTCCTAGCTCTAATGAATGTTTCG +TTCTTCCTCTTTTTGTACAGTGTCAGCATTGTTTTAGAAATAAATACATTCTAGAATCTTAGGAATAATT 
+TTATTATTGTCTTTTTTTTTTTTTGAGACGGAATCTCATTCTGTTGCCCAGGCTGGAGTGCAATGGCGCA +ATCTCAGCTCACTGCAACCGCTGCCTCCTGGGTTCAAGCGATTCTCCTGCCTCAGCCTCATGAGTAGCTG +GGACTACAGGCGTGCGCCATCACGCCTGGCTAATTTTTGTATTTTTAGTAGAGATGGGGTTTCACCATGT +TGGCCAGGCTGGTCTCAAACTCCTGACCTCAAGTGACCCGCCTGCCTCGGCTTCCCAAAGTGTTGGGATT +ACAGGTGTGAGCCATTGCACCTGGCCACGAATCTTAGAAATAATTTTGGCCATAGGAAAAGGAAAAGTTA +TACCTCTTATTTTATAGTAATAATGTTTAATAATCAAGTTATTAAACCCATTAAATTGAGAACACTTGTA +TTACTGTTATTTTGACAGTAAAGGAAGAATGGTATGCCAGAATCACTAAATTAAGAAAGATGGTGGATCA +GCTTTTCTGCAAAAAATTTGGTAAGTCTGTTTTTTTTAATTACCCCTTCAACTAAAATGTATTACTGAGT +AACATTTTTTTAAATGTTGTTTTATTTTAGGAAAGTAAATACAGTGAATAGGACTCAGCTTTAGTTTTCC +CTGTTTTTTTTTTTTGGGTTTTTTTTTTTTTCTTGAGAAGGAGTCTTGCTCTGTTGCCCAGGCTGGAGTG +CAGTGGTACGATCTCAGCTCACTGCAACCTCCTCCTCCCGGGTTCAAGCAATTCTCCTGCCTCAGCGTCC +CAAGTAGCTGGGATTACAGGTGCCCGCCACCACGCCTGGCTAACTTTTGTATTTTTAGTAGAGATGGGGT +TTCGCCATGTTGGCCAGGCTGGTCTCAAACTCCTGAATTCAGGTGATCCACCCGCCTCAGCCTCCTAAAG +TGCTGGGATTATAGGCGTGAGCCACCGCGCCCGGCCCCCATTCTATGAATTATCTGTGGAAAGTTATTTC +CTTTAAAATGGCATGCTCGTGAGGTTGAAGGGGTAAAGAACATTGATCTGGTTGCCACGCAGATGAAAGT +GTAGTCGGAAATGTGTTACCAGTTTGCCATAGCCGATGGAGGAGGGTAATTACGGTCCTGAACAGTTAGT +AGCAAAAGGCTGCTTCCAGATAGTATTTAGGTTGTGTCCTCTATTGTCCATGATGATTTTCTTTCTCTCT +TTTCTTCCTAAGGATATTGAAAAACAAACTTTAATCTTTTAAAGATCTCAGAGCTATTAAGATTGTCTTA +GGACCAGGACACTTAAATCTGCCCAAATCTGGCTGTTGATGATCGGTCATCTCTACTTGTCTTTGCTGTA +TTAAGTCTATTCAATAGTTAATAAATATGTTTCTAGAAAGAGTTTTTTTCAAGTAAAACAAAATTGACCT +GTGCCTTTCTTTGGATTGTACTAAAATCTGATTTCAATACAAATGTAGTCTCCCGAATTGCTTTGATTTT +TGTCCAGCGGAAGCCTTGGGGAGCACTGAAGCCAAGGCTGTACCGTACCAAAAATTTGAGGCACACCCGA +ATGATCTGTACGTGGAAGGACTGCCAGAAAACATTCCTTTCCGAAGTCCCTCATGGTATGGAATCCCAAG +GCTGGAAAAAATCATTCAAGTGGGCAATCGAATTAAATTTGTTATTAAAAGGTAAGATGATAATCTGTAG +AAATAGTTTCAGTGTCTTCCCTGAGAAGAGGTTAATTTGATGAAGAAGGGCCTTTTGTTTACCTTATGAC +TTATTTCTATTGACAATGAAGGCATTAATATTTAGATTCACTTAGTGAACAAATATTAGTATAAGCATCA +GATGTGCAAAATTGGGTCTAACAAGAACACTGTCCTTGGGGCCTTCATACAAAGAAAAATGCACTGAAGG 
+CCGGGCGCGGCAGCTCACGCCTGTAATCGCAGCACTTTGGGAGGCCAAGGCAGGTGGATCACTTGAGGTC +AGGAGTTCAAGACTAGCCTGGCCAACATGATGAAGCCCCATCTCTAGTAAAAATACAAAAATTAGCTGGA +AGCGGTGGTGCAAGCCTGTAGTCCCAGCTACTCGGGAGGCTGAGGTTGGAGAATCACTTGAACCCTAGAG +GCGGAGGTTGCAGTGAGCCGAGATCGTGCCACTGCACTCCAGCCTGGGCAACAGAGCGAGACTCCATCTC +AAAATAATTAAAAAAAAAAAATAGAAAAATGCAATGAAGTGTTATTGAGCGTTTTTAAGGGAGAAGGCAA +GGATGGCACACCCAGCTCGGTCACTTGTGCATCCAGAAGAGATGGAAGGTGTTTCAAGTGAAGGAAATCA +TATGAGTAGGGGGAGGAGGTGGCAAATATGCCTGCGTATCCACAGAACTCACCCACCGTGTGTGGAGTGA +GGACTGCCACGTGGGCGTGGTGGGGTTGCATGGATCGACTTGGGTGGGCAAGTGGAGGAAGGCCTGAGAT +CCTACGAACACAGAGGCAGTCACGAAGTGGTCTCGAGGCAGATGCCTCTGAAAATAATGTGGATCCGCCC +TTTAGAAAGGTAATTCTGGCTTGATTTTGAAGGATAACACAATGGTTAGTTTGGGTGCGGGGTTAGGAAC +AGAAGGCCTCTCTCCACTCATTGACGGGATGTGGAAGGGTAAACCTTCCTTACTGATTGGGGTCATGCCT +CTGTGTGTTTGTTGGGACTGAGTTATAAGGGATAGGAAACGTTTAAGATGCTACAGCGAGCTGCTTCTGG +CTGTGCTGTGGGACAGTTCATGTAAGATTCAGAAACAGAATTGAGCTGGTTTGGGGGAAAAGTGACTTTC +GCCTGTTTATCTTAAATATAGGATGATTTTGAAGGTCTCACCCGAATATCTGAAAATTGCCATTTTCAAA +ATAAACTCGTCACCAAAATGATTTTTTTTTCACTATAAAATGAAGGCAGGATGAACCATATTTATAACTA +ATTGGCAATGAACAGCTGTGTGAGAAAGGCCTGTGAGTTGCTTCTAAATGCTTTATTACTAATATCAACT +CTGTTTCTACAGACCAGAACTTCTGACTCACAGTACCACTGAAGTTACTCAGCCAAGAACGAATACACCA +GGTAAACTAGTTGTGAAATCCTTTTTTAAAAACACAGATCAGCCAGGCTCGGTGGCTCACACCTGTAATC +CCAGCACTTTAGGAGGCTGAGGCGGGCAGATCACAAGGTCAGGAGATCAAGACCATCCTGGCTAACATGG +TGAAACCCCATGTCTACTAAAATACAAAAAAATTAGCCGGGCCTGGTGGCGGGCACCTGTAGTCCCAGCT +ACTCGGGAGGCTGAGGCAGGAGAATGGCATGAACCCAGGAGGCGGAGCTTGCAGTGAGCCGAGATCACGC +CATTGTACTCCAGCCTGGGCAGCAGAGCAAGACTCTCTCAGAAAAAAAAAAAAAAAAAAGCACAGATCAA +TACTTTGTAAGCTTTTAAAAGTTAATCTTTTAAAATTATGGAAGTCCCCCTTCCCCTGCCTCCCCAAAAA +ATCATTTGCAGAAGCTCAATCCAGCCACTCCTTGATTTATCAATGTGAACACCTGCAGTGCACAGATAAC +TCAAGCTTTGACCGTAGACCACTGCTATCCATGAGAAAGGCAGTGTGCCCCACAAATGCAAGCCACATGT +GTAATCGTCCACGTTCTAGTTCAGCCTGTAATCATTATAAAGAAATACACAGGGAAGTATTTCACGGTCT +TTATCCCATGTTCTTTATTTGACAGTTGGCGCGTCTTTCATACTCACAGCATATTTCAGTTCCAAGTAGC 
+CACATTTCAAACATTTCAACTGCTCGGGGATAGTACCTGCTATATGCATTTAGACCTTAATGTTTATTCA +TATGTATCGTGTTCCTAAAATGGCAAAGACTGTAACTCTGACCTGCACACAAATATCACAGCCTAATAGG +GAAGATCTAAGAAGTCTCTGTTCAATGAATGATGTTATTTTCTTTATTAGAGCTCTAAGTGTGCCTTTAT +TTCTTTCCTATCTTTTTTTTTTTTTTTTTTTTTTTTGAGACGGAGCCTCACTCTGTCATCCAGGCTGGAG +TGCAGTGGTGCAATCTCGGCTCACTGGGGCCTCCGCCTCCCGGGTTCAAGCAGCTCTCCCGCCTCAGCCT +CCCGAGTAGATGGGACTACAGCTGCACGCCACCACACCTGGGCTAATTTTTGTATTTTTAGTAGAGATGG +GGTTTCACCATGTCAGTCTGGCTGGTCTTGAACTCCTGACCTCAGGTGATCCACCTGCCCTGGCCTCCCA +AAGTGTTGGGATTATAGGCATGAGCCACTGCACCTGACCTCTTTCTTTTTGTTTGCCGTTGTGTAATGTC +AGAGGAAGTGACCACACTCTGTAGATCATCAGCCACCTTAGGAACTTCTGTTGCCAAGAACCAATAAATG +CCCATACCCTGTAGTGTGTAAATGCCCATACCCTGTAGTGTGAAGTTTTCTGTTGTTAGAAATAAGTGTT +AGGAATCAAGTATGAAATTTGTGTGTGTACTAGGTGTATACAGATTGGTGCAGTTAATCATGCTCACGAC +TACTAAGGTGAACAAATGTTTCAAGTTGGGTTCCTGGGTGTGCCCTAAAATACTTCTCCTTCAGCTCTCA +CAGCACCTTGTGGACATGAGTGAAGGTCAGTCAGTGTGCCAAATAGATTTTGTGTGGATTATGGCATGGA +AAGTGGCTGAGAAATTCTGTAGCAGGGTAACAAAATTATCTTGGTCCAGGAGTCCTGTAGTGGAGAAGAT +AAAAGTCAATGCTTAACTCATAGGTTAACCTCAGCATGCTTTTGATTTGGTCAAGCAATCAAAGTACTGG +TGAAGAATTGAAGATTGGAAGTGACACATTTTTTGCTCAGGGAAGTAATGGAGAAAGAAAAATCTTCCTG +GAGGTAGATCTTCAGTTTGGATTAGTCTGAACATGATGAAACCTGTAGAAACCTTCATTTCTCAAGACAA +AGCTCAAATTCAAGGTTGTGAGGAATGCAGTCACCACTTTTGTTAGGGGCAGTTGTGACAGTGAGTGACT +GCAAAAACTGAGAATGCGAAGCCTCTATTGTAAAAAGAATGTGCAAGTGCCATAGAAGTCACTGCAAGGA +TTGGGTGCTGGAGCAGTGCCGGACCTGCCATCGTCACCAGAGTGCAGCAGATTCACCAGAAGGAGAATCT +CCACTCTTGTCGTCACTAACAGCACTTGACTTTGTCCCATTCATAAAAGATCTTGGAAGGAATTAAAGGT +GCTTGGTGGTTCCTCATTCCAACAACACCTTACTTGGCCTGCCCGTGGAACATGCGTTAGTCTTTGGACA +GGACAACTCGAGTCACAGACCACAAGAGAAAAGAATTTTGTGGCCATCAGAATTGTCTGCTTAAACACAC +CACGGGTGCACAGTGTCCTCAGCTTGGTGAAAGGGAGATGTCACACGCGAGAAGGCAGCGGAGCCAGGCA +TGTGATGGAGTGGGAGGTGGCACCTGGCTCTGTGAGGAGGCTGTGAAGTCCTGCATGGGAGGAGCAAGGT +GGGGGAGGAGGGGGTGGGGGTGGGGCAGAAGAGGAGGCTGAGCAGTTAATAGAGGCGCTCGACCTTAGAG +GAGGAGAGTCCGAGGTCTTTATTGGTAGTATTCAAATGTGGTTCATCCAGAGTTATTTTCTGTGGCTGAA 
+TGGCCTACTCTGAAATCCACAGGGAAAAAACAACTCACATTCAACCCTTGAGATGCTAAGTTTTCTTTTA +AAGTAAAGGAAATGTTATAAGATTTTCTGAAACCACCAACCTTTAGCGATATTGTCAGACCTTCTCCAAC +ATTTTCCACTGCATTTGTCAGCAATCAGAGATGACCTCCACACCGAGGCGAACCCTCCACCCCCCGACCC +GTTTCCTCTTTCTCTCTTTCCCTCCTTTGCTCATCCAGAAACACTTCAGTCATCCTGCATTGTCTCCAAG +TTGACCTCCATCCTCATCCGCACCTCGTCCACACCTGAATACTCTGTCCACATCAGTGATATTCTACATG +TCTTTATAATAGCTTTTTTGTTTGTTTGTTTGTTTTTTAAATTTTGATGCTTAAAAGGCAATGGTTGTCT +TAGGGATAGGAAAACACACCCCACTGTCACTGTGAGTGAAGCTGAATAACGTCTCTAGGTCTTTTACCAT +TGCTTTCTTTTTTTTCTGAAGTGATTTTTCCTTTTATGTCCATTGCTTTAAGCCATTTGTGAAATTGCAC +AGTGATTTCTGGAGTGGGGACAGAAGGAAGGCGGTAGTAAAAGTCATTGGTGCTGTGGCCCAGTTGGCTG +GAGGAGGTGCAGGGCAGGGCGGCCTGCGCTGGGGCAGCTGGAGGAGCACAGATGTCCCCACGGGCAGGTG +GATGAGTTCTGAGAGCTGGAGGGCCGGTACAGTGTCCTCCATGGTTCCAGCTTGTGGGCTTGATCAGGCC +GTCACCTGCGGTGGCCACTGAGCTGGCGATGAACTCCGTGGCCTTGGAGTCGCCCTCGGCAGAGATGATG +GCCGCCGTCTTCTGCTGCTCAGCCCTTGCCACCACAGATCTGGCCCTCTCTTCTTCCTGCTGAGCCACCT +CTTTGGTTCCACTGCTTCTGCAAATTCCTTCCCGAAGGTCAGATCCAAGGTCACAGCATCCAGGAGGAGC +CCAAAGGTTGCTGCTTGCTCCGAAGTTAATTGCTCACCTGTCTGGAGCCCAGCTCTCCCTGCGTGATCAG +TTCTCCAGCGTCAGCCTGAGCCGCCCCCAGCTTGAGGAGCTCCGCAGTGATGGATGGCAGCACATTCTTC +ATTGGCTTCTCCAGTAATTGGAAGATGCAAGGACCTGGCCAGCAACAAGGTGGGAAGAGGACGCCCAGTG +TGATGGTGACAATCTTTGCTCACAGTGATGATTGGTGCATAATGTGGTCAAGAGCAGCAGTCAAAAATAA +TTGGTTTCTTTTCCCATGGGATGAGAAAGTGCGTTCCTTCCCCTATCACAGTGTCCTGAATGCCATGGAA +TTGGTCGAAGATGACAGACAGCTCTCTGTGCAGCATCACATTGTAGAAGGCAGAGTTCCCCACACCTCCT +GCAGAAGCTAAGGACAGGGCAGACTTGGCAGCTGTGTTTCCATCTGCTGGACCCACTCACGCCTGCGTCC +ACTCCAACCCCCCACATGAATTCCGTCCCTTTATCATTGATTTCTGATGCGAAAAGTCACTTTGATTAGT +GAAGTGTTGCTTTATGTAGATTTTTAGGCACACATCTGTTAGATAAAACGAGGATTGCCTGTAGAGAAAG +CAAATAAAAGCAAAGTCCCTCTTGTTGAAACTGGTGTGGGAGCCCTTGAACCTCATGCAGCTGGCCTCCC +TGCTCTGACCGAGTTCCCTGAGGGACTTCTCCAAGGAGGAATGTGTGAGAAGCACTGATTTTGAGCACTC +TTTCATGTGGACAAATTTCACTTTATTTACAATGTAAACTTAAATTTAAATTCTGTCTTTCTAGGCTGGG +CACGGTGGCTCACGCCTGTAATCCCAACACTTTGGGAGGCCGAGGCGGGCAGATCACTTGAGGTCAGGCA 
+TTCAAGACCAGCTTGGCCAACATGGTGAAACCCTGTCTCTACTAAAAATATAAAAATTAGCCGGGCGTGG +TGGCGGCTACCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCAGGAGGCGGA +GGTTGCTGTGAGCTGAGATCACGCCACTGCACTCCAGCCTGGGCGTCATAGCAAGACTCTTTCTCTAAAT +AAATAAATTAATTCTGTCTTTCTGCAGTTTTTCTGATATTTGGCAAGTACTGGAAATTATTATTTTCCTT +AAGACCCCAAATTTTCACACCAACATGGCACATGTATACATATGTAACAAACCTGCACGTTGTGCACATG +TACCCTAGAACTTAAAGTATGATAAAAAATAAATAAATTAATTAATTTAAAAAAAAAGACCCCAAATGTT +TGCTTTTAACAAAACTGAATTAAGAGAATCACTGCAGGCCGGGCGTGGTGGCTCACGCCTGTAATCCCAG +CACTTTGGGAGGCCGAGGCGGGCAGATCACGAGGTCAGGAGATCAAGACCTTCCTGGCTAACACCGTGAA +ACCCCATCTCTACTAAAAATACAAAAAGAAATTAGCTGGGCATGGTGGCGGGTGCCTGTAGTCCCAGCTA +CTCAGGAGGCTGAGGCAGGAGAATGGCGTGAACCTGGGAGGCGAAGCTTGCAGTGAGCCGAGATCGCGCC +ACTGCGCTCCAGCCTGGGCGACAGAGCAAGACTCTGTCTCAAAAAAATAAAACAAAACAAACAAAAAAGA +ATCACTGCAAACAAGAGTACTTTCTAGCAAAATCCATTCTGATTTGCAACAGCACTGATAAATAACATGG +TTATTGGGTTTCTTTTTGTTTTCCAGTCAAAGAAGATTGGAATGTCAGAATTACCAAGCTACGGAAGCAA +GTGGAAGAGATTTTTAATTTGAAATTTGGTAAGTAAAAGCCAGTATTTATGTCTTTAATAACATATCAAC +AAAGGGCCATGTCTGAATGAAGTATAGAAGTTCGGGACCAGCCGGGTGCAGTGGCTCACGCCTGTAATCG +CAGCACTTTGGGAGGCCAAGGCGGGCGGATCAGGAGGTCAGGAGATCGAGACCATCCTGGCGAACACGAT +GAAACCCCGTCTCTACTAAAAATACAGGAAAATTAGCCGGGCGTGGTGGCGGGCGCCTGTAGTCCCAGCT +ACTCGGGAGGCTGAGGCAGGGGAATGGCTTGAACCCCAGAGGCGGAGCTTGCAGTGAGCCAAAATCGCAC +CACTGCACTCCAGCCTGGGCGACAGAGTGAGACTCTGTCTCAAAAAAAAAAAAAAAAGAAGGAAAAAAAG +TTCAGGACCTAAAGAAGGAAGGTCCCAGAAACTGGGTTTCTGTTTCTTTCTACCACTACACTTGCTACTG +AAACCAAGCAGATGACTTCATTTCTCTTGGATTCCACTTTCTCACGTGTCAGAATGGGAGAGGAAGGGAG +GTGTTGGGATAAGTTTCAGTTCCACTGTTTGTACTTCTCGTCTGGCTAACACCCATGTGGCAAATAGGTT +TCATCATTTGTTTCAGCTTAGATTTGTTGACAGCGGTTTCCTGGAGTGCTGTCTTGAGAACGATTCTGAG +GAGGCTCAGCAAGAAAGAGTGTTTCAGTTGATTGGGTGTGTCTGTCATGGAGAAGGAAGTAAGGAGTGGG +CAGTGCTAGCAAAATTCCCGGGGCACTTCTGTCCATTATCTCAATACCTGGGGTTGACATTTCCTGTCTC +AGATCAGGAGTCCTGACTACCCTGCCTCTGACCACTCGAACTGAGTGCTGCTTAGCTGTATCGTAGACAC +CGCCTGTTTGTGAACAGACACCCTGCTTCTTGATGATAACACAAAGGCCAGCAGGGTCCACTGCTGTGTG 
+GAATGGCCTTCGGTCATTTCTGCCCAGAGCATAGAGGTCATTTTCACTAATAACATAACTCTCCTTTTGA +TTGAAAGTGTTAAAATGTTCCTCCTAAAAGCACTTATTTTTTAGGCTCGTTCTTCAGATTTGCCCCATAT +CCTAAGCAAAATGCCTTCAATATGAAGTGGATATTGCTTGACCGTAGGGAGCTTGTCCATACTGTACTCG +AGAATGTAGACACAAAGAAAGAATGTCTGTGTCAAATGTTATCCTCCGCAACTTAACGTTCTCTTGCACT +TTCAGCTCAAGCTCTTGGACTCACCGAGGCAGTAAAAGTACCATATCCTGTGTTTGAATCAAACCCGGAG +TTCTTGTATGTGGAAGGCTTGCCAGAGGGGATTCCCTTCCGAAGCCCTACCTGGTTTGGAATTCCACGAC +TTGAAAGGATCGTCCGCGGGAGTAATAAAATCAAGTTCGTTGTTAAAAAGTAAGTTCTTTTTGCCACTGT +AGTCGTTTCTGGAATCAAAACAATAAAATGACATTTCTGTTAAGATGTTTTTCAAGCTAGAGGTGACAGG +CGTGGCTGTAAGTCCTTGATGGCAAAGCCTGGCTGTGAGCTGCAGTCCGGGTACTGTCCATTGCCCCTGC +CCATACTCAATCATCTTCATCCGTGCAAGGAAAATAGTGACAATCGCCTCCGCGGAGTCACCGTGAGGCC +CCGTGGTGACTCGCAGTATGCCTGAGTACTTGAATGGAAGTTTAATTTGCTGGTTTTTATCTGCTCTGTT +ATTCCTGGTCAATCTTGAGAGCGACTAAATATTGATGATTGAGTTTCTTTCTTTTTTTTGAGACGGACTC +TTGCTCTGTCGCCCAGGCTTGAGTGCGGTGGCGCAATCTCAGCTCACTGCAACCTCCGCCTCCTGGGTTC +GAGCAATTCTCGTGCCTCAGCCTCCCGAGTAGCTGGGATTACAGACTTGCACCGCCACGCCTGGCTAATT +TTTGTATTTTTAGAATAGATGGAGCTTCACCGTGTTGGCTAGGCTGGTCTGGAATTCCTGACCTCAGGTG +ATCCACCCGCCTCAGCCTCCCAGAATGCTGGGATTACAGGCATGAGCCACCGTGCACGGCCTAATGATTG +ATTTTCTTATCTACATTTCTGCAGAATTTTAGTGGCTAAAGAAAGTACACTAGTGTTTTTTTTTTCTTTT +TTAATGAAAATAGTCACTTATTTACTCTTCATAAAATGGCTTACCAGTCATGGGAAGGAAAAGTCAGGGT +CTTTTGTCCACAACACTAGGAATTATTGTGTTACAGGATGATTCTGTGAATGCCTTTAGAAAAAAAACCC +TGATCACATCACGACCGTTTTACTAGCTGTGAGGCCACCACTGTGCTGGCTACTGTAACACTTCTTGGTT +TTCTTTCTAGACCTGAACTAGTTATTTCCTACTTGCCTCCTGGGATGGCTAGTAAAATAAACACTAAAGG +TAAGAGACGACGTGTACTCCGTATCCTATTTGAGCGCATTAAGACACTCTTTATCCGCATTCCTTAAATC +ATACGGAATCGGCCAATATCTCTGTGGGGCTTGGAGTTGTTTACTTTGCTTTCTAAGGTATTTTATTTAA +TGGAACTATTTTTCTTTTTTTTTTTTAAAAAAACAAAACATTTCATTGGAAATGTCACATTTGCAATCCC +AGTTCATTCATGTTTTCACAGCTTTGTTGAGCCCCTGTGGAGGATTACACTACAAGTTTAATTCTGCAGG +TTTCTTTGAAAAGAACCAAACTGTATGAATTTTAATACTCTGCTCTTATAAAGCATCTCCTTCTGTATTT +TTTTTTCCCTACAATATACAAAAGGAACTCTGCTTATTTTACAGCTTTGCAGTCCCCCAAAAGACCACGA 
+AGTCCTGGGAGTAATTCAAAGGTTCCTGAAATTGAGGTCACCGTGGAAGGTAAGGGCCAGTCCTCGGTAT +GTTTCTGTTCATTCTCTAGTTTACTAATTATGTCAGTTCACTAGCTATGTGTTTATACAGGTTTATACAG +ATGACGTTAACCAAACTAGCTTTTAACTGTGAACAGCTGCCTGTGATAAAGCTAAATTTCTGAATGCTTC +TAATTGAAAATTTTTAAATCCTTAATAGAAAACAGATAACATGGATCACATCATAAACTTCCCCCAAGAA +AAGTCCTAAAATGCTTTTCCTTTATCATAACAAATAAGGGTGCAGAGTCATGTGGAATGTTTTATCTTTT +TTTTCCTCTTCATTTTTTTCTATTGTTTTAAGGAAAGAAAGTTTAGGGTGGTTTAAACAGCAGTGCCTTG +AATAAAAGTATTTTCTTTTTCTTTTTTGAGACAGAGTCTGGCTCTCTTGCCCAGGGTGGAGTGCAGTGGC +ACAATCTCGGCTCACTGCAGCCTCCGCCTCCCGGGTTCAAGTGATTCTCCTGCCTCAGCCTCCCAGGTAG +CTGGGATTACAGGCGTCCACCACCAACTCCAGCTAATTTTTATATCTTTAGTAGAGACGGGGTTTTACCA +CGTTGGCCAGGCTGGTCTCGAACCCCTGACCTCAAGTGAGCCACCACGCCCAGCTGGTTCTTGTCTTTTC +AGGCTCACAGTGTTAAAATACTGCATACAATACTAAAGACAATCTCCAACCAACTGTCAAGTATGGATAT +TTCTTCCATCTCTTCATAAATGCAGAAGGAAATCACTGCATTAGGATCATTCTCTACATCCCAGGTGTCA +AGTTTTTAAATAAGAGTTTGCCGTACTGGTGTTGAGTGGCGCAGAAGTCCCTATTTAGTCATGTGGCACA +CTTTCAGAGAGCCTGCTTTCTCATAAGATAGCCAACAAGCATCCATGGACAACTTACACCTGATTAATTA +TTTCTGTTCAAAAATGCCAAGAATTAAAAATATATTGAACCCTTGACTTTCCTCCTTCCGGAAGAGATCA +GAGGAAGATCTCTCTTATATACAAGATGCCAGCCTTTCCTAAAGGGCAGAGCCAAGTTCACGGGGCCTGG +GAGGCCTGCTTTAAGAATACAAAATTAGGGCCAGGCGTGGAGGCTCACGCCTGTAATCCCAGCACTTTGG +GAGGCCGAGGCAGGCGGATCACCTGAGGTTGGGAGATCGAGACCAGCCTGACCAACATGGAGAAACCCCA +TCTCTACTGAAAATACAAAATTATCTGGGCCTGGTGGCGCATGCCTGTAATCCCAGCTACTCAGGAGGCT +GAGGCAAGACAATCGCTTGAAACCAGGAGGCAGAGGTTGTGGTGAGCCGAGATTGCGCCATTGCACTCCA +GCCTGGGCAACAAGAGCGAAACTTTGTCTCAAAAAAAAAAAAAAAAAAGAAGAAGAAGAAGAAGAGAAGA +AAAATGGGTTTCTGTAGAAATAGAAAAATGGTTTCTGTGGAACTCGCCGGAATGACAAGAATACCCATTT +CAATTTGAACCTAGCTTTGTTTTCTGGCACAGGCTGGACTTGAAGATTGTAACCTAGAGACACTTACGTC +ATTAAATGTGTGTTGATTATCATTATTTTTTCTGTCTTTTAGGCCCTAATAACAACAATCCTCAAACCTC +AGCTGTTCGAACCCCGACCCAGACTAACGGTTCTAACGTTCCCTTCAAGCCACGAGGGAGAGAGTTTTCC +TTTGGTAAGTAAGCGTTTTATTTTTCTTTCTTTCATAGTTTTAAATAGAATACGTTCATTATGTTTCATT +TTACCAGGTGAATTGTTCCTATTGGTGAGTCAGTATATATCAAAGGTTAAAGGAATAGCCCGTAGAGCCA 
+GGCTGCCTGAGTTCAGATCTGAGTTCAAATCCGATGCCACTTGGGCAGGCGAATACACCCTGCCTCAGTT +TCCCTTTGTGCAAAATGGTAATAACATTACTTTCTTTCTCATGAGGTTATGGTGAGGATTATAAATACTG +TCATTTAAAGTACTTAGAACCATACCTCGCATGTAGTAGAGACTATAATTTGTGGGGGCATTTTGGGGGT +TTTCATTTGTTTTTTCTTTATTTTAGCATTTGTTCACTTATTTGTAGTCAACATAAAAGCAATGTACTAA +AACAGTTTGCTTATGTATCAGGATGGGAGTAGTTGTCAAGTCATATCATGGTAAAAGATTATGCATTTTT +TATTTCTTCTTAAAGATGCACAAATAAGGCCAGGCGCAGTGGCTCACGCCTGTAATCCCAACACTTTGGG +AGGCCGAGGTGGGCGGATCACCTGAGGTTGGGAGTTCGAGACCAGCCTGACCAACATGGAGAAACCCCGT +GTCTACCAAAAATACAAAATTAGCCAGGCATGGTGGCGCATGCCTGTAATCCCAGCTACTCAGGAGGCTG +AGGCAGGAGAACTGCTTGAACTCAGGAGGTGGAGGTTGCAGTGAGCAGAGATCGCACCATTGCCCTCCAG +CCTGGGTAACAAGAGCGAAACTCTGTCTCAACAAAAAAAAAATAAGATGCACAAATAAGATGAAACACTC +CTGCCTGTGAATGTGTATTGATAAGTACTGTGATAAGTATTGCAATTTAACTCTCTTATGTTTTATATGC +AGAGGCCTGGAATGCCAAAATCACGGACCTAAAACAGAAAGTTGAAAATCTCTTCAATGAGAAATGTGGT +AAGTCTATTTTGAAACCTTCTTACTGCCACGCAGGGTGCCTTCATGGGAGTAGGAGGGAGCAGAGTGGGA +TACTGAGCGTTGGAGTATCCCTGTCATTGGCAACGTGTGGGTTTTTCGGGATTGAATGGCGTCATTTCGC +CCATGTTCTGTTTCGTGTACGCGGCTGTTTTGCAGGGGAAGCTCTTGGCCTTAAACAAGCTGTGAAGGTG +CCGTTCGCGTTATTTGAGTCTTTCCCGGAAGACTTTTATGTGGAAGGCTTACCTGAGGGTGTGCCATTCC +GAAGACCATCGACTTTTGGCATTCCGAGGCTGGAGAAGATACTCAGAAACAAAGCCAAAATTAAGTTCAT +CATTAAAAAGTAAGGAAACTGGATGAAGTGGGATTAGCATGAGTTGATTGTGTTTGACACTGGGAGATGG +GTATGTGGGTTTGTGTTTGTGGCAGGGCTGAATTAGCTCTGGAGTCAGAAAAACTGCTTTTTAAGCTGCA +CTCTTGTATATGTTTAAAACTTCTGGCCAGGTGCGATGGCTCACACCTGTAATCCCAGCACTTTGGGAGG +CCAAAGCAGGTGGATTACCTGAGGTCAGAAGTTCAAGACCAACCTGGCCAACATGATGAAACCCATCTCT +ACTAAAAGTACAAAAATTAGCTGGGCGTGGTGGCACGTGCCTGTAATCCCAGCTACTTGGGAGACTGAGG +CAGGAGAATCACTTGAACCCAGCAGGCAGAGGTTGCAATGAGCCAAGATCGTGCCACTGCACTCCAGCCT +GGGCAGCAAGAGCAAAACTCCATCTCAAAAACAAAAAAAAAAAAAAAAAAGAAGAAGAAATTTTCCTTAT +GCTGCTGGTAGAAGGGAAGGAGGACTCCACCATACTAGAGAGATAACGACTTTTGGTTGCTAAATCTACA +CAGCTATGTTGTTACTATTTGATGCTGCTGGAATCTGAGCAGCTGTTTAGTTTTTCAAGTCCTTTTATTT +CTCCAACTTAATAGAATATTTTCAATTTCTCCTGTTTGCTTGAACATCTGTGGATTAGGCTAACATAATT 
+GAAATAGATAGGAATGGGCCGGGTGCGGCCATAAATCCCAGCACTGTGGGAGGCTAAGGCAGGCAGATCA +CTTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAGCCCCATCTCTACTTAAAATACAAAAA +AAAAAAAAAAAAATAGCCAGGGGTGGTAGCTGGCACCTATAATCTCAGCTACTCAGGGAGGCTGAGACAG +GAGAATCACTTGAACCCAGGAGGCGGAGGAGATCGTGCCATTGCACTCCAGCCTGGATGACAGAGTAAGA +CTCTGTCTCAAAAAAAATTAAAAAAAGAAATAAGAATAATTGGAATTTTATAGTACATGCTAATGCACAT +GAATTATTTGAAATCAATTGAAATGAATTTTAGGTTTCATTTGCTTGGTGTCAGGCTTAATTTTGAGTCA +CCCAGGGATGCTCTCAAGCAAAGCTGGCCACTTTTTTTTTTTTTTTTTTTTTTTTTTTTGTGAGACAGAA +TCTCGCTCTGTACCCAAGACTGGAGTGCAGTGGTGTGTGATATCGGCTTACTGCAAATTCCGCCTCCTGG +GTTCAAGCGATTCTCCTGCCTCAGCCTCCAAAGTAGCTGGGATTACAGGCATGTGCCACCATGCCCAGCT +AATTTTTGTATTTTTAGTAGAGAAGGGGTTTCACCATGTTGGTCAGGCTAGTCTCAAACCCCTGACCTCA +TGATCCGCCTGCCTCGGCCTCCCAAAGTGCTGGGATTACAGGCGTGAGCCACCGCGCCCGGCCAATGGCC +ACCTTTTATGCTTTTGGCCAGGTCTGCAGAGGTTTGGCAGCCTCTAAGCCCTCTGCTTCTGCTTCCGGAG +TGAGAGCCTCAAGTGCTCCTGTTTTCACAGCCCATCTTGTGTTTCCATGCTGCGTTTGTGGGGCGTTTTC +TCTTTGCATCCTCAATTCTGGCGCTGATTCTTTTCAGGATACATTCTAACCTTGGAAAGAAGATGCCAAT +AAAACCAGTGTAGGGGAAGGGCAGCCAATGAGAGGCAGCAATGGATGTTAAATTTACAATTGTGGTTTGT +CTTTTTGGCTGTGGTTCTTTAGATAAGCATGTGATTTCTGTTCTTCATGACTAAGAATTGAATTTAGACT +TTACAGAGTTACTGGTTTGTAAATCTTTGAGTTGTTTAAATTTTAATGTTAGAGTTTTACTGTTTGATCA +GCACATTTTTTTTCTCTTTTGTCTATAGGCCCGAAATGTTTGAGACGGCGATTAAGGAGAGCACCTCCTC +TAAGAGCCCTCCCAGTGAGTGTATTTTCTGTATTTTCATTGCTATAGAACACACGCTCTTAGGCATGCAT +TGTGCAGCTGTGGTTTACAAACATTGTTTAAAGAATAGCCATTGAGGCCAGGCATGGTGGTTCACGCCTG +TAATCCCAGCACTTTGGGAGGCTGAGGTGGGTGGATCACCTGAGGTCAGGAGTTTGAGACCAGCCTGGCC +AACACGGCAAAACCCTGTCTCTACTAAAAATATAAAAAATTAGCCAGGCGTGGTGGCAGGCGCCTATAAT +CCCAGCTACTCAGGAGGCTGAGACAGCAGAATCACTTGAACCCGGGAGGCGGAAGTTGCAGTGAGCCGAG +ATCGCACCACTGCACTCCAGCCTGGACAACAGAGCAAGACTCATCTCAAAAAGAAAAAAGAAAACCCATT +CAAAGTTGTAGTTCAGGCTATTTCACAATTTTTAGTTATTTAATAATTGGGTGGCTCTGTGAGTGTGTGA +TGATCTCCTAAGTACCCCCAGGAAGAGAAAAACAGTGAAAGGCAGCCCTTCAGGGGGCATCATGTCTTCT +TGAAATGATCTACCATGGAAAGAAAGAACCTTGAACAGGACTGAGATAAAACAGTCTCAGACAGCATGAT 
+TATTAGAGATGTTGACCGTGTTTTCTACAAATTCTGATTTTAAATGTATATTTTCAGGAAAAATAAATTC +ATCACCCAATGTTAATACTACTGCATCAGGTGTTGAAGACCTTAACATCATTCAGGTGACAATTCCAGGT +ATGAGTTCTCTGCTTCCATCAGAGTTTTTGGGGGTCTCTTCAGGGCCTCTTGGACTTTCCCAACTAGAGA +GGTTTGCTCAGCTCATCCGGGGACTCGGATACATTACACGTTGCCTGGTGCTCCTCCCAGACTCCTACAG +CAGTTGCCTTCCAAATACAGTTGCTCTCCAAACAGCAACGGCAAGTTTTGGCACTTGGCATGAGCTAAAT +CCTGTTAACCACCCAAATGGCTAGTCTGTTCCTTAAAAAAAGTCTATATGAGACCGGGCACGGTGGCTCA +CGCCTGTAATCCCAGCACTTTGGGAGGCTGAGGCAGGAGGATCGCTTGAGGTCAGGAATTCAAAACCAGC +CTAGCCAACGTGGTGAAACCCTGACTCTACTAAACATACAAGAAAAATTAGCCAGGCCTGGTTGTGTGTG +GCTGTAATCCCAGCTACTTGGGAGGCTGAGGCAGGAGAAGTGCTTGAACCCGGGAGGTGGAGGTCGCAGT +GAGCCGAGATGGGGCCAGTGCACTCCAGCCTTGGTGACAGAGCCATACTCCATCTGGGAAAAAAAAAAAA +TCTATATAAAACAGCAGGTAAAGGTCTTTATAACAAGAATAAATTTGTAGCATTTTTAGTTAGGCATTAT +TTTAAACAATTTCAAATTTAATTAGCTCAAAGTGCTCAAATACTTAAATCATTAAAAAATGGAAAATGCT +TGAAAACATTACACAGAGCTCCTAAAAATTGGGATTAAAAGTGCATCATTGAGCAGCGGCTTGTACCTGT +AGTCTCGGCTACTTGGGAGGCTTAGGCAGGAGGATCGCTTGAGCCCAAGAGTTCAAGGCCAGTCTGGGCA +ACATCGTGAGATACCATCTCTTTACCATAAAAAAAAAAAAAAGCAGTATTTAAGTTTTGGGTTTTCTCTG +AACTGTTTCAGATGATGATAATGAAAGACTCTCGAAAGTTGAAAAAGCTAGACAGCTAAGAGAACAAGTG +AATGACCTCTTTAGTCGGAAATTTGGTAAGTTTTGCATTTGCAAAGTACAGTTGCTATAAGCAAAGAGAT +TTGTTTTAATAAGATCTTTTCAGCAGATGATGGTTGGATGGTTGTAATCCTATATAAAAGGAGTTAAAAT +TTAAAAGTGAGTTGTTTGTGCTAAATTTATTTAATAAGATACCATTAGTGTTACACTATTGATTGTCAGT +ATGAAAGTTAATGAATTTAAGGTTCACCATCTGGCAGTGTGGCTCACACTTGTAATCCCAGCACTTTGGG +AGCCCGAGGCAGGTGGATCAGTTGAGGCCAGGAGTTCGAGACCAGCCTGGCCAACATAGTGAAACCCTGT +CTCTACTAAAAAATTCAAAAAATTAGCCGGGTATGGTGGTGCACACCTGTAATCCCAGCTACTCAGGAGG +CTGTGGCACGAGAATTGCTTGAACCTGGGAGGCAGAGATTGCAGTGAGCTGAGATCAAGCCACCACACTC +CAACCTGGGTGACAGAGCAAGACTCTGTCTCAAAAAAAAAAAAAATTTAAGGTTCATATTTAAATTTGTT +TGAAGTGTACAAAGACCTTAGTTAACAGCAGAGACAGTCTTTAAAATAAATGTTGATCTTGTGCTTTTGA +CAGGCTGTTTAAGGCGGTGGGTTAATCTGTGCTGTTGTGCTGTTTGTCTCACTGTCCCTGTAGGTGAAGC +TATTGGTATGGGTTTTCCTGTGAAAGTTCCCTACAGGAAAATCACAATTAACCCTGGCTGTGTGGTAGTT 
+GATGGCATGCCCCCGGGGGTGTCCTTCAAAGCCCCCAGCTACCTGGAAATCAGCTCCATGAGAAGGATCT +TAGACTCTGCCGAGTTTATCAAATTCACGGTCATTAGGTAAGTGAGAGTTTCCTGCTTAGTCACAGGAGC +GAATCTGGAGCTCATGAGGCTGACTCTTCTAAAATGCAGCCACAGGTAGTCATCGAATCCGGCTTCCTAT +GCTGTGCAATCAACAAATCAAAATAACTTGTGTCATCATTAGAATGTCAGATGTGCTTCTACGAACTAAG +CTGACTCTTTTAATTCTTTGGCAAAGGGTTGGCAAACTAGGACTGTTTGCCAAATTCAGGCTGCCTCCTA +TTTTTAGAGTCTTCCTGAAACACAGCTACACCCGTATTATCCATGGCTGCTTTCCTGTCACAGTGAGTAA +GTAGCTGGGACTGAGAAGGCATGGCCTCCAAAGTCTAAAATATTTACTCTCTAGCACTTTGTAGAAAAAC +CTTAGCTAGGCACAGTGGCTAACGCCTGTAATCCCAGCATTTTGGGAGGCCAAGGCAGGCAGATCACCTG +AGGTCAGGAGTTCCAGACCAGCCTGGCCAACATGGTGAAATCCCATCTCTACAAAAATAAAACAGTTAGC +TGGGCATGATGGCGGGTGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGATAGTCGCTTGAACCCA +GGAGGTGGAGGTTGCAATGAGCCAAGATCATGCCACTGCACTCCAGCCTGGACGACAGAGTGAGACTCCA +TCACAAGAAAAAAAGAACTTTGTCAACCTCTGTCTTAGGGCGCCTTGTCACAGGCTTCGGGTCAGACGGA +TTCAACCTTGCATCAGCCATTTGTTAGCCAGGTCACTCTGTTCTTTGTCTGTAAATGAGATTGATCGTTG +TTCCCACTGAGAGTGTCAGCTCCTTCCCGTAGAGCAGGCATGATGATTGTACTCACCTCTGACACCATTG +TGAGTGCCACATTCCTTCCCACGTCCTTGTCACTGTAAGAGATGCCCACCTGAGCACCAACCCCAGGTTA +TCTTCCCCTTTGTCTTCCAGCCCCCCAGAAACAGCTACGACTCAACCTACCCAATCATTTCATCATCAGA +TTGCCACTGTCTCTAGTTCAGGTCTCTTGGAACTGGCACTCAGAAATCTCATAATAAATCCTCTTGAGGC +TTCTCATACACTCGTCTTCTTCCAATCTTCTTTCCCTCAAAATCTCATATTTTGGTTCCACTTCACCCAC +CGTCATTCTCCATATCACTCCCAGGAGTTAGGCAAAAAGCCCCTTCCGTTCTTCCGTATGTTAAACTTAG +AATCACTCTGTTCCCTGCTCTGCGTTTCTATTTTTTGTTTTTCCTCCATTTACTAGTAGCTTAACACTTT +CTAACAGTGTTCTTATTATTGATACGTATCTATCTCTTCCATAAGCTTATAAGGTCACGGATAATACTTC +TCATTGTAGTACGTAAATGACGTGGGCTAGATATGAGTTGAATAAACAGTTATACCTGTAAATTCTTACA +GAGTGAAAATAAATTGTTATACTTTACAATTTGTTTCTCTCTTTAGACCATTTCCAGGACTTGTGATTAA +TAACCGTGAGTATTTTGTGAAGTGTTTTGTTTTTGTTTTTTCCTGGGGTCTGACGTGTGTGCGTGTGAGT +GTGTATACATGCTTAACGTATATCACGTTACTTCACCTATGTCAGTAACCAGGCCAAATACTTGTTTTAG +CCCTCAGTAAAAACACCAGGCACTTCCTAGTTGTAAAATTATTCAAGCTTCTTAACTTCCTATCCTCGAT +GCACTTAATCATAAAATGGTAATAATAGCACCGATTTTGGGGGAGTCGTTCCAGTAGATGGAAAGCATCT 
+GGAACAGGTGTCAGCAAGCTCCTGCCGACGTGTATGCATAAAGTTTTATTGAAACCACCATCGTGTCCAT +TTGTTTATGGCAGAACTGAGAGATGGCAGCAGCAAGTGTGTGCCTGCAAAACCTAAAATATTTACTAATT +GGCTCTGCAAGAAAAGAGTTTGCATCCCCCTAACCTAGAACAGTGTCTAGCCTCTAGTATGTGTTTAGCC +TACAGTATGTGCTCAGCGAATACAATCTATATTTATTACTGCTTTTATGACTGTTATAATTACTGTGCTT +GGATTTCGTTACAAAGTAAGTCACAATGTGCCTGCTTCTGTTAGTATTTCAGCACAGTGCCTGGCACACA +TGGGGCTCTCAAATATTGCTGAGCGAGTGAACAAATGTCCTTTCAATTCCTTAACGTTGATGTCATTTTC +AATAGTATTTTGAGCCAAACTTAATTTTGCGAGTGTGTTTTGTTTTCTTAACTTTATTATTAAAAATGTA +TAAAAGTGAGGCCAGGCGTGGTGGCTCACGCCTGTAATCCCAGCAGTTTGGGAGGCTGAGGCAGGCGGAT +CAGTTGAGGTTGGGAGTTGGAGACCAGCCCGTCCAACATGGTGAAAGCCTGTCTCTACTAAAAATACAAA +AATCAGCTGGGTGTGGTGGCACGTGCCTGTAATCCCAGCTACTCAGGAGGCTGAGGCAGGAGAGTTGCTT +GAATCCTAGAGGTGGAGGTTGCAGTGAGCTGAGATCGTGCCATTGCACTCCAGCCTGGGCAACAAGAGCA +AAACTCTGTCCCAATAAATAAATAAATAAAATGTTCTCTTTTGTTCCTGTTCTTGTGCGGTAGTGTGGTA +TAGAGTTTTATGGTAATTACTGTGAATTAGTGATTCTGAGGGACATATCAGAACTCTGAGGTTTGTTTCC +TTCTCATCTTGAGGGAAACAGCAAATGTATGTTAAAATGCTTTTCCAAGGGAACAACACATCCTTACATT +ATTTAAACCAATCTGCTTCATTTTCAGAGCTGGTTGATCAGAGTGAGTCAGAAGGCCCCGTGATACAAGG +TGAGCGAGGCAGGGGAGGGCCCGGAGCTACTCCTGCCTGCACAGTGGCACAATGGCGTGCCTGCGTGTGG +CTTTGGCTCTCAGTCACCTGCCCTGAGGGGACTCAGTTACACAGCACACACATGCTTCTCTGTGGTTTTC +ACTCCTGGGTTTGACAGCTGATCAAAACATAAATTCAAGCTGTGGGTCCTGATTGAGAACTGGGGGCTGC +AGACCATTTGCACCCCCTATCCCAGCTCAGGCCTAACATCAGGAACCCCAGGATTAATGGGTAGGATGAA +ATGGCAGAGCAAGAGGGCCGTCACTTTAACCTGACTCTGCCATCCATTTCTAATGTCTGCCATAAGTCAG +TGAGCAAAATGTTCTTCAGTAGAAATGTACACATTGTGCTCTTAAAAAATTCCTTAAAAAACAAGTGGAA +TGGCCTGGTGCTGTGTGAGTCATTGAAAGTAATGAGACTGGGCGCGGTGGCTCACGCCTGTTATCCCAGC +ACTTTGGAAGGCTGAGAAGGGTAGATCACTTGAGATCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGC +AACCCCGTCTCTACTAAGAATACAAAAACTAGCCAGACGTGGTGGCGTGTGCCGGCTACTCAGGAGGCTG +AGGCAGGAGAACCGCTTGAGCCTGGGAGGCGGAGGTCGCAGTGAGCCAAGATCGTGCCACTGCACTCCAG +CCTGGGCAACAGAAAGGAGATTCTGTCTCAAAAAAACAAACATACGAAGAAAAACAAAAAAAGTAATGAA +AAGCTTTTATTAAAGGGAGTAAACAGAAGGATAAGGGAGAAAGCATAACTAAGGAGCTTGTTTTCATGGT 
+AGAGCTATGTTAAGACTCTGCTCTTTCAAACTTCAGTTGCATATGTGAACTTAGGACCACATTTGAAAAA +CAGAAATTTGAAAGTACACTTGGATAATCGTGTGCTCCATCTCAAGACCGTGAGCATTGTTTCATCATGC +ACCTGTGTTTGTACAGAGTCTAGAGGGCTTTTCTCCTCTTCCTCCTCCTGGGTTCTTTACATAGTATAAA +GCAGCTGTTGAACAATGTGGAAATCAGTCTCTGTGTTTCTCTTTAGAATCAGCTGAACCAAGCCAGTTGG +AAGTTCCAGCCACAGAAGGTAAAAGGGTGGGGTGGTCCTGCAAGTCCTTAAGACTTCTTCTTTCTTCTTC +TTCTTTTTTTTTTTTTTAAAGACAGAGACTTGCTCTGTCACCCAGGGTGGAGTGAGGTTGCGCGATCTCT +GCAACCTCCGCCTCCCGGGCTCAAGCAGTTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGATTACAGGCCT +GCACCACCATGCTTGGCTAATTTTTGTATTTTTAGTAGAAACGGGGTTTCACCATGTTGGCCAGGCTAGT +CTCAGACTTCTGACCTCAAGTGATCCGCCAGCCTTGGCCTCCAAAGTGCTCGGATTACAGGCGCGAGCCA +CCTTGCCCAGCCAAGACTTTTTTATCAGGACAAAGGATTGTGCATTTAAACTATTTCACTAGAACTGGGT +GGTGGTTTTGCTCTCTTTCTTCTGGGTGAATTGGATTTGCAGGTTATGCTGTTGAGTGATGACGCATAGC +TGCTTTTGCTCCATTTCCCCCAGATGACTTGGTAAATTCTCCGTGAATGACTCTGCTACATAACCTAGAT +AACCTAACGTGTGTCCTTTAAATGCATGTAAGCCAGAAGATGTATGTTACTTTGAAAACATAAGTAACAA +AATTTTGAATGTATTGCTAAAGAGATGTCTCTCTGAAGCTCTTTTGATGTTTGGTGTCTTGTCCTTCTTA +TTAAACCATATCTTAGTAAATAGTTTGGTACGAATGGATTTATCACTGAGCAGGTCTGCAAAATAATTAA +TCGGTACCGTTTTGTTTCTGTTGATAGAAATAAAAGAGACTGATGGAAGCTCTCAGATCAAGCAAGAACC +AGACCCCACGTGGTAGACCTCTTCCCTCCTAGGGTAAATCAGCTTCTGTGTCAGGGATGCTGTGTGGTGT +CCATCTGAACCCCCTGCATACGCGTAGCTAATGTGATCTCCCCACTTTCACATAAGATGGTGGCCCTGCC +TTCAGGGAATGTGGGAGCCAGGTGGGAGCCTTCCCGGATATTTAAGCTAGAAGATTCTACAGGGAGATTC +TCCTTGGATCAATATATGTCTCTCAGTCAAAGATGTAAAAGCACTTTTGCCTTAAAAAGAATGTTCTGTT +TCTAAATAGAGTCAACGTTGTCCTCCTCATTGGAATTCACTATGAGTCAGAATCATTAGACTGACTTTTT +TTTTTCCATAGTAATAGTATTTTGCAGAGTCTCACAGAGCTGCAGATCTTTTGTTCATCTTGCAGAGTTA +ACAAGTCTGATCCTGTTAGTCCAGATTTCTTAAATTTGGCCAAGTTATAATAGGAGCAGTAGCTTGAGAC +CCGAAGTCAGGAAACTTTGACAATGGATTTTTTTTTTTAATCCAGAGACTTGTACTGGAATTTGCCTTAC +CCTGTCAGCTCATGGACTTAAGGTTTCATCCCGCTTTATGAGTGCTTCTGAATCCAAGTCATTGTTACCT +GAATTTGCAAATTAAGTTGTGATATTCGTGACTGTTAAATTCCTGTAATTAGATTAACCTCTTTGCTTGC +TTGTTTGTTTTCTCTCCTATTTTAGCTTAAAGTATCAGTGGTTGAGAAGAGCTTTTCGGACCTGTTACTA 
+CCCCAAGCTGTGTAATATACTTGTATAACAGAAATACCTTCTATACAAACCTTTTTTTCTACTTTTAGAT +AGAAATGTCTACTTTTTCAGCAGTTCTGTGAATTAAAGAGCAGAGTGACTGTGGGTCTGGAATGGCTGGT +GTACTTGGGAATGTACTATCAGGATTTTACAGCAATGCTGGGAAATGACAGGGAAAATGACAGGAATGAA +TCTCACCAGATTTTTTATGTACTCAGCAGAGCCTTGAGTTACGGTGTTTATTTTCCAATCAAGTGAAGAT +ATCTCCTACTTCTCCTACTGGAACATCTCAGCTTCTGCAGTGAAGAAAAATTCCTGTGATAGTTCAGTTC +TTTAGTTTTTCTATTTGAAAAAAAAAAATCATTTAAATGATCCTTTGTTCACGGCTCTCCTTAATGACTG +AGTGAACAGTTCCTATCTGTATATTTGACTAAACCTTTTCCTAAGCTATCTCTCATGGTTCCTATGTTTT +TTTATCATAATTAAAAGCAAAACCATCTGGATCACCTAACAGTCAGAGGTCAGTATCTCAGCGTGTGAAT +TATAGAGGAAATACAGAGAGAACCTCTTCCACTTTTACTTTTCGTCCAAATAAAATGCATGGTGTACCAG +AAGTTGAAGATCGGGTTGAGGATTGGGGCTAGCTCGATGACACTAAGGCCCCAACATCGCGGGACCTGCT +GTGGCGCGGATTCTTAGGAACGCTGTTCTAGCCGGCCCCCTCTCCAGGGGTCGCCGTGGCCGGCATTATT +TCCTAGTTCTTCTTGTAACCCTGAGGTGCCAGCGCGGGGAGTGAGGAGGGGTCAGGGGGCTAAGGATGCA +ACCTCTGACGTTCTGCGCCTTCCTAGGAGAGTCTTACATGTGTTGAGATTTCACAAGCAATGCGAGTTGT +AAAATACCAGCTCTACAAGAAGCTAGGCTCTGTGACGGCATAGTTTTCAGTAGCTTTATCACAATATTCA +CAATGGAGAATTATATGACATGGTAGCAGAAATAGGCCCTTTTATGTGTTGCTTCTATTTTACCTCAAAT +TGTAGATATAGGGTAATCAATAAAATCCATCCATGCCTTTCACACACTAA diff --git a/test/csq/ENST00000573035-bt2212/ENST00000573035-bt2212.fa.fai b/test/csq/ENST00000573035-bt2212/ENST00000573035-bt2212.fa.fai new file mode 100644 index 000000000..8aecdcdf7 --- /dev/null +++ b/test/csq/ENST00000573035-bt2212/ENST00000573035-bt2212.fa.fai @@ -0,0 +1 @@ +7 49960 18 70 71 diff --git a/test/csq/ENST00000573035-bt2212/ENST00000573035-bt2212.gff b/test/csq/ENST00000573035-bt2212/ENST00000573035-bt2212.gff new file mode 100644 index 000000000..aefa5258a --- /dev/null +++ b/test/csq/ENST00000573035-bt2212/ENST00000573035-bt2212.gff @@ -0,0 +1,4 @@ +7 ensembl_havana gene 100 49960 . + . ID=gene:ENSG00000263001;Name=GTF2I;biotype=protein_coding +7 ensembl_havana mRNA 100 49960 . + . ID=transcript:ENST00000573035;Parent=gene:ENSG00000263001;Name=GTF2I-212;biotype=protein_coding +7 ensembl_havana CDS 300 377 . 
+ 2 Parent=transcript:ENST00000573035 +7 ensembl_havana CDS 4125 4184 . + 2 Parent=transcript:ENST00000573035 diff --git a/test/csq/ENST00000573035-bt2212/complex-event.txt b/test/csq/ENST00000573035-bt2212/complex-event.txt new file mode 100644 index 000000000..a6c0abba4 --- /dev/null +++ b/test/csq/ENST00000573035-bt2212/complex-event.txt @@ -0,0 +1,3 @@ +374 CAAG CT frameshift&splice_region|GTF2I|ENST00000573035|protein_coding +374 CAAG CT frameshift&splice_region|GTF2I|ENST00000573035|protein_coding + diff --git a/test/csq/ENST00000573035-bt2212/complex-event.vcf b/test/csq/ENST00000573035-bt2212/complex-event.vcf new file mode 100644 index 000000000..b6c99e283 --- /dev/null +++ b/test/csq/ENST00000573035-bt2212/complex-event.vcf @@ -0,0 +1,7 @@ +##fileformat=VCFv4.2 +##contig= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +7 374 . CAAG CT . . EXP=frameshift&splice_region|GTF2I|ENST00000573035|protein_coding diff --git a/test/gtcheck.1.2.out b/test/gtcheck.1.2.out new file mode 100644 index 000000000..791166ea2 --- /dev/null +++ b/test/gtcheck.1.2.out @@ -0,0 +1 @@ +DCv2 s1 s1 0 0.000000e+00 2 0 diff --git a/test/gtcheck.12.out b/test/gtcheck.12.out index 0ae8b70d8..36e479292 100644 --- a/test/gtcheck.12.out +++ b/test/gtcheck.12.out @@ -1,10 +1,10 @@ DCv2 B A 5.733631e-01 1.126897e+00 2 2 -DCv2 C A 4.938053e+00 4.337503e-01 2 2 -DCv2 C B 2.791391e+00 4.337503e-01 2 2 -DCv2 D A 5.022610e+00 0.000000e+00 2 2 -DCv2 D B 5.178533e+00 0.000000e+00 2 2 -DCv2 D C 4.938053e+00 0.000000e+00 2 2 -DCv2 E A 7.325195e+00 0.000000e+00 2 2 -DCv2 E B 5.178533e+00 0.000000e+00 2 2 -DCv2 E C 2.635468e+00 6.931472e-01 2 2 -DCv2 E D 2.720025e+00 3.566749e-01 2 2 +DCv2 C A 4.938053e+00 8.675006e-01 2 1 +DCv2 C B 2.791391e+00 8.675006e-01 2 1 +DCv2 D A 5.022610e+00 0.000000e+00 2 0 +DCv2 D B 5.178533e+00 0.000000e+00 2 0 +DCv2 D C 4.938053e+00 0.000000e+00 2 0 +DCv2 E A 7.325195e+00 0.000000e+00 2 0 +DCv2 E B 5.178533e+00 0.000000e+00 2 0 +DCv2 E C 2.635468e+00 
1.386294e+00 2 1 +DCv2 E D 2.720025e+00 7.133499e-01 2 1 diff --git a/test/gtcheck.3.1.out b/test/gtcheck.3.1.out index 8475d2416..00ae6ab82 100644 --- a/test/gtcheck.3.1.out +++ b/test/gtcheck.3.1.out @@ -1,3 +1,3 @@ -DCv2 A D 2.302585e+00 0.000000e+00 1 1 -DCv2 A E 4.605170e+00 0.000000e+00 1 1 -DCv2 D E 2.302585e+00 0.000000e+00 1 1 +DCv2 A D 2.302585e+00 0.000000e+00 1 0 +DCv2 A E 4.605170e+00 0.000000e+00 1 0 +DCv2 D E 2.302585e+00 0.000000e+00 1 0 diff --git a/test/gtcheck.6.1.out b/test/gtcheck.6.1.out index dabf26c69..2c7dcdcc5 100644 --- a/test/gtcheck.6.1.out +++ b/test/gtcheck.6.1.out @@ -1,2 +1,2 @@ DCv2 A B 0.000000e+00 5.753641e-01 1 1 -DCv2 B C 9.210340e+00 0.000000e+00 1 1 +DCv2 B C 9.210340e+00 0.000000e+00 1 0 diff --git a/test/gtcheck.ntop.1.out b/test/gtcheck.ntop.1.out index 414a95963..0cb03fc3c 100644 --- a/test/gtcheck.ntop.1.out +++ b/test/gtcheck.ntop.1.out @@ -1,6 +1,6 @@ -DCv2 smpl x1 9.210340e+01 1.351550e-01 6 6 -DCv2 smpl x2 1.842068e+01 7.931820e-01 6 6 +DCv2 smpl x1 9.210340e+01 8.109302e-01 6 1 +DCv2 smpl x2 1.842068e+01 9.518185e-01 6 5 DCv2 smpl x3 0.000000e+00 9.518185e-01 5 5 -DCv2 smpl x4 5.526204e+01 3.662041e-01 6 6 +DCv2 smpl x4 5.526204e+01 7.324082e-01 6 3 DCv2 smpl x5 0.000000e+00 9.634573e-01 6 6 -DCv2 smpl x6 7.368272e+01 2.310491e-01 6 6 +DCv2 smpl x6 7.368272e+01 6.931472e-01 6 2 diff --git a/test/mendelian.8.out b/test/mendelian.8.out index 28f9df8a7..aa1fbc70c 100644 --- a/test/mendelian.8.out +++ b/test/mendelian.8.out @@ -7,6 +7,7 @@ sites_missing 6 # number of sites with at least one trio GT missing sites_merr 12 # number of sites with at least one Mendelian error sites_good 14 # number of sites with at least one good trio ngood 14 +ngood_alt 11 nmerr 12 nmissing 6 nfail 0 diff --git a/test/merge.11.1.out b/test/merge.11.1.out new file mode 100644 index 000000000..c9bc1f928 --- /dev/null +++ b/test/merge.11.1.out @@ -0,0 +1,11 @@ +##fileformat=VCFv4.2 +##FILTER= +##reference=file://ref/Zmays_493_APGv4.fa 
+##contig= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 24 25 26 27 28 29 +1 5695120 . C T 863.255 . . GT:AD ./.:0,0 0/0:389,0 0/0:475,0 0/0:597,0 0/0:262,0 0/0:1428,0 0/0:26,0 0/0:1279,0 0/0:820,0 0/0:616,0 0/0:123,0 0/0:1026,0 +1 6070233 . G C,A 10355.5 . . GT:AD 0/0:918,1,0 1/1:0,19,0 1/1:1,355,0 0/2:181,1,136 1/1:0,634,0 0/0:1219,0,0 1/1:0,365,. 1/1:1,428,. 0/0:751,0,. 1/1:1,837,. 0/0:308,1,. 1/1:0,1113,. +1 7988614 . C T 1143.54 . . GT:AD 0/0:441,. 0/0:278,. 0/0:722,. 0/0:614,. 0/0:321,. 0/0:515,. 0/0:2,0 0/0:600,0 0/0:769,0 0/0:450,0 0/0:300,0 0/0:166,0 +1 11301504 . C . 22.0947 . . GT:AD 0/0:81 0/0:136 ./.:0 0/0:161 0/0:417 0/0:241 0/0:178 0/0:226 0/0:32 0/0:138 ./.:0 0/0:353 diff --git a/test/merge.11.a.vcf b/test/merge.11.a.vcf new file mode 100644 index 000000000..b2ec9d097 --- /dev/null +++ b/test/merge.11.a.vcf @@ -0,0 +1,10 @@ +##fileformat=VCFv4.2 +##reference=file://ref/Zmays_493_APGv4.fa +##contig= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 +1 5695120 . C T 665.34 . . GT:AD ./.:0,0 0/0:389,0 0/0:475,0 0/0:597,0 0/0:262,0 0/0:1428,0 +1 6070233 . G C,A 10355.5 . . GT:AD 0/0:918,1,0 1/1:0,19,0 1/1:1,355,0 0/2:181,1,136 1/1:0,634,0 0/0:1219,0,0 +1 7988614 . C . 22.0485 . . GT:AD 0/0:441 0/0:278 0/0:722 0/0:614 0/0:321 0/0:515 +1 11301504 . C . 22.0947 . . GT:AD 0/0:81 0/0:136 ./.:0 0/0:161 0/0:417 0/0:241 diff --git a/test/merge.11.b.vcf b/test/merge.11.b.vcf new file mode 100644 index 000000000..ef449718b --- /dev/null +++ b/test/merge.11.b.vcf @@ -0,0 +1,10 @@ +##fileformat=VCFv4.2 +##reference=file://ref/Zmays_493_APGv4.fa +##contig= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 24 25 26 27 28 29 +1 5695120 . C T 863.255 . . GT:AD 0/0:26,0 0/0:1279,0 0/0:820,0 0/0:616,0 0/0:123,0 0/0:1026,0 +1 6070233 . G C 4730.75 . . GT:AD 1/1:0,365 1/1:1,428 0/0:751,0 1/1:1,837 0/0:308,1 1/1:0,1113 +1 7988614 . C T 1143.54 . . 
GT:AD 0/0:2,0 0/0:600,0 0/0:769,0 0/0:450,0 0/0:300,0 0/0:166,0 +1 11301504 . C . 22.0179 . . GT:AD 0/0:178 0/0:226 0/0:32 0/0:138 ./.:0 0/0:353 diff --git a/test/norm.3.2.out b/test/norm.3.2.out new file mode 100644 index 000000000..98c40f9ce --- /dev/null +++ b/test/norm.3.2.out @@ -0,0 +1,7 @@ +##fileformat=VCFv4.2 +##FILTER= +##reference=norm.3.fa +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 9 . TCA T . . . +1 11 . N C . . . diff --git a/test/norm.3.out b/test/norm.3.out index dae608538..ab35b19c7 100644 --- a/test/norm.3.out +++ b/test/norm.3.out @@ -4,3 +4,4 @@ ##contig= #CHROM POS ID REF ALT QUAL FILTER INFO 1 9 . TCA T . . . +1 11 . A C . . . diff --git a/test/norm.3.vcf b/test/norm.3.vcf index fd8891fd0..4871d1d3c 100644 --- a/test/norm.3.vcf +++ b/test/norm.3.vcf @@ -3,3 +3,4 @@ ##contig= #CHROM POS ID REF ALT QUAL FILTER INFO 1 10 . NAC N . . . +1 11 . N C . . . diff --git a/test/norm.5.2.out b/test/norm.5.2.out index fd57823a1..7302f4fae 100644 --- a/test/norm.5.2.out +++ b/test/norm.5.2.out @@ -5,5 +5,5 @@ ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B 1 511 . CAGCA CAG . . . GT 1|0 .|1 -1 511 . CAGCA CAA . . . GT .|. 1|. -1 511 . CAGCA CCA . . . GT .|. .|. +1 511 . CAGCA CAA . . . GT .|0 1|. +1 511 . CAGCA CCA . . . GT .|0 .|. diff --git a/test/norm.join-missing-ploidy.1.out b/test/norm.join-missing-ploidy.1.out new file mode 100644 index 000000000..6cb78c21f --- /dev/null +++ b/test/norm.join-missing-ploidy.1.out @@ -0,0 +1,8 @@ +##fileformat=VCFv4.1 +##FILTER= +##fileDate=20190708 +##FORMAT= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE_ONE SAMPLE_TWO +chr1 100 . A T,C . . . GT 0/1 2 +chr1 200 . A C,T . . . 
GT 0/2 1 diff --git a/test/norm.join-missing-ploidy.vcf b/test/norm.join-missing-ploidy.vcf new file mode 100644 index 000000000..f55a5796f --- /dev/null +++ b/test/norm.join-missing-ploidy.vcf @@ -0,0 +1,9 @@ +##fileformat=VCFv4.1 +##fileDate=20190708 +##FORMAT= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE_ONE SAMPLE_TWO +chr1 100 . A T . . . GT 0/1 ./. +chr1 100 . A C . . . GT . 1 +chr1 200 . A C . . . GT . 1 +chr1 200 . A T . . . GT 0/1 ./. diff --git a/test/norm.rmdup.3.1.out b/test/norm.rmdup.3.1.out new file mode 100644 index 000000000..e4f39c84e --- /dev/null +++ b/test/norm.rmdup.3.1.out @@ -0,0 +1,9 @@ +##fileformat=VCFv4.1 +##FILTER= +##INFO= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 1 . A . . SVLEN=-1000 +1 1 . A . . SVLEN=-2000 +1 1 . A . . SVLEN=-3000 +1 18 . AC A . . . diff --git a/test/norm.rmdup.3.2.out b/test/norm.rmdup.3.2.out new file mode 100644 index 000000000..6a0be8e53 --- /dev/null +++ b/test/norm.rmdup.3.2.out @@ -0,0 +1,7 @@ +##fileformat=VCFv4.1 +##FILTER= +##INFO= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 1 . A . . SVLEN=-1000 +1 18 . AC A . . . diff --git a/test/norm.rmdup.3.fa b/test/norm.rmdup.3.fa new file mode 100644 index 000000000..6e8492b14 --- /dev/null +++ b/test/norm.rmdup.3.fa @@ -0,0 +1,2 @@ +>1 +ACGTACGTCCCGTACGTACCCCCGT diff --git a/test/norm.rmdup.3.vcf b/test/norm.rmdup.3.vcf new file mode 100644 index 000000000..3ae4455e1 --- /dev/null +++ b/test/norm.rmdup.3.vcf @@ -0,0 +1,15 @@ +##fileformat=VCFv4.1 +##INFO= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 10 . C . . SVLEN=-1000 +1 10 . C . . SVLEN=-1000 +1 10 . C . . SVLEN=-2000 +1 10 . C . . SVLEN=-1000 +1 10 . C . . SVLEN=-3000 +1 10 . C . . SVLEN=-2000 +1 10 . C . . SVLEN=-1000 +1 10 . C . . SVLEN=-3000 +1 10 . C . . SVLEN=-2000 +1 20 . CCC CC . . . +1 20 . CC C . . . 
diff --git a/test/norm.sort.1.out b/test/norm.sort.1.out new file mode 100644 index 000000000..79aca96d2 --- /dev/null +++ b/test/norm.sort.1.out @@ -0,0 +1,7 @@ +##fileformat=VCFv4.1 +##FILTER= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +chr19 50359054 . C G . . . +chr19 50359054 . C T . . . +chr19 50359054 . C A . . . diff --git a/test/norm.sort.2.out b/test/norm.sort.2.out new file mode 100644 index 000000000..cbd4aef01 --- /dev/null +++ b/test/norm.sort.2.out @@ -0,0 +1,7 @@ +##fileformat=VCFv4.1 +##FILTER= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +chr19 50359054 . C A . . . +chr19 50359054 . C G . . . +chr19 50359054 . C T . . . diff --git a/test/norm.sort.vcf b/test/norm.sort.vcf new file mode 100644 index 000000000..1c9c9ba38 --- /dev/null +++ b/test/norm.sort.vcf @@ -0,0 +1,4 @@ +##fileformat=VCFv4.1 +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +chr19 50359054 . C G,T,A . . . diff --git a/test/norm.split.5.1.out b/test/norm.split.5.1.out new file mode 100644 index 000000000..eb33c3122 --- /dev/null +++ b/test/norm.split.5.1.out @@ -0,0 +1,10 @@ +##fileformat=VCFv4.2 +##FILTER= +##reference=ref.fasta +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B +1 11 . A C . . . GT 1|0 0|1 +1 11 . A T . . . GT .|0 0|. +1 22 . A C . . . GT .|0 0|. +1 22 . A T . . . GT 1|0 0|1 diff --git a/test/norm.split.5.vcf b/test/norm.split.5.vcf new file mode 100644 index 000000000..74898c25b --- /dev/null +++ b/test/norm.split.5.vcf @@ -0,0 +1,7 @@ +##fileformat=VCFv4.2 +##reference=ref.fasta +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B +1 11 . A C,T . . . GT 1|0 0|1 +1 22 . A C,T . . . 
GT 2|0 0|2 diff --git a/test/norm.split.merge.5.out b/test/norm.split.merge.5.out new file mode 100644 index 000000000..461e7bde4 --- /dev/null +++ b/test/norm.split.merge.5.out @@ -0,0 +1,12 @@ +##fileformat=VCFv4.2 +##FILTER= +##contig= +##contig= +##contig= +##reference=file:ref.fa +#CHROM POS ID REF ALT QUAL FILTER INFO +chr1 29291 . C T . . . +chr1 29291 . C G . . . +chr1 29291 . C A . . . +chr2 29292 . T C,TCCCTCTCCTTTCTCCTCTCTAGCC,TCTCTTTCTCACTGTCTCTCTAGCC,TCCCTCTCCTTTCTCCTCTCTAGC,TCCATCTGTATCCTCTCTAAGC,TCCCTCTCCTTTCTCCTCAGCC,TCCCTCTCCCTTTCTCCTCTCTAGCC,TCCTCTCCTTTCTCCTCTACCGC,TCCCTCTCCTTTCTCTCTCTAGCC,TCCCTCTCCTTTCTCCTCTAGCC,TCCCTCTCCTTTTCCTCCCCAGCC,TCCCTCTCCTTCTCCTCTCTAGCC,TCCCTCTCCCTTCTCCTCTCTCAC . . . +chr3 29292 . T CGTA,TCCCTCTCCTTTCTCCTCTCTAGCC,TCTCTTTCTCACTGTCTCTCTAGCC,TCCCTCTCCTTTCTCCTCTCTAGC,TCCATCTGTATCCTCTCTAAGC,TCCCTCTCCTTTCTCCTCAGCC,TCCCTCTCCCTTTCTCCTCTCTAGCC,TCCTCTCCTTTCTCCTCTACCGC,TCCCTCTCCTTTCTCTCTCTAGCC,TCCCTCTCCTTTCTCCTCTAGCC,TCCCTCTCCTTTTCCTCCCCAGCC,TCCCTCTCCTTCTCCTCTCTAGCC,TCCCTCTCCCTTCTCCTCTCTCAC . . . diff --git a/test/norm.symbolic.3.1.out b/test/norm.symbolic.3.1.out new file mode 100644 index 000000000..72dd5da99 --- /dev/null +++ b/test/norm.symbolic.3.1.out @@ -0,0 +1,8 @@ +##fileformat=VCFv4.2 +##FILTER= +##INFO= +##INFO= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 11 . G GGATTACA . PASS SVTYPE=DUP;SVLEN=7 +1 11 . G . PASS SVTYPE=DUP;SVLEN=7 diff --git a/test/norm.symbolic.3.fa b/test/norm.symbolic.3.fa new file mode 100644 index 000000000..a31e5d17e --- /dev/null +++ b/test/norm.symbolic.3.fa @@ -0,0 +1,2 @@ +>1 +ACTCACAAAGGGATTACAGATTACAGATTACAGGGGATCCAGAT diff --git a/test/norm.symbolic.3.vcf b/test/norm.symbolic.3.vcf new file mode 100644 index 000000000..34b2967a3 --- /dev/null +++ b/test/norm.symbolic.3.vcf @@ -0,0 +1,7 @@ +##fileformat=VCFv4.2 +##INFO= +##INFO= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 26 . G GATTACAG . PASS SVTYPE=DUP;SVLEN=7 +1 26 . G . 
PASS SVTYPE=DUP;SVLEN=7 diff --git a/test/query.98.2.out b/test/query.98.2.out new file mode 100644 index 000000000..d6b83888b --- /dev/null +++ b/test/query.98.2.out @@ -0,0 +1,2 @@ +#CHROM POS C:SAMPLE D:SAMPLE C:DP D:DP C:GT D:GT +4 3258449 C D 1 0 1/1 0/0 diff --git a/test/query.filter.id.3.out b/test/query.filter.id.3.out new file mode 100644 index 000000000..2f741322e --- /dev/null +++ b/test/query.filter.id.3.out @@ -0,0 +1 @@ +ss124;abc diff --git a/test/query.filter.id.3.txt b/test/query.filter.id.3.txt new file mode 100644 index 000000000..8baef1b4a --- /dev/null +++ b/test/query.filter.id.3.txt @@ -0,0 +1 @@ +abc diff --git a/test/query.filter.id.4.out b/test/query.filter.id.4.out new file mode 100644 index 000000000..499ba0288 --- /dev/null +++ b/test/query.filter.id.4.out @@ -0,0 +1,2 @@ +rs123 +. diff --git a/test/remove-overlaps.1.out b/test/remove-overlaps.1.1.out similarity index 85% rename from test/remove-overlaps.1.out rename to test/remove-overlaps.1.1.out index b688dd263..a9503d36e 100644 --- a/test/remove-overlaps.1.out +++ b/test/remove-overlaps.1.1.out @@ -5,3 +5,5 @@ #CHROM POS ID REF ALT QUAL FILTER INFO 1 789241 . C G . . . 1 789243 . C CA . . . +1 790000 . C G . . . +1 900000 . C G . . . diff --git a/test/remove-overlaps.1.2.out b/test/remove-overlaps.1.2.out new file mode 100644 index 000000000..ee72197eb --- /dev/null +++ b/test/remove-overlaps.1.2.out @@ -0,0 +1,27 @@ +##fileformat=VCFv4.2 +##FILTER= +##reference=file:///usr/bio-ref/GRCh38.81/GRCh38.fa +##contig= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 100000 . CC G . . overlap +1 100001 . C G . . overlap +1 789241 . C G . . . +1 789242 . C G . . overlap +1 789242 . C A . . overlap +1 789243 . C CA . . . +1 789243 . C CCA . . overlap +1 789244 . C A . . overlap +1 789244 . C CA . . overlap +1 789244 . C T . . overlap +1 789244 . C CCA . . overlap +1 789245 . C T,CA,TTT . . overlap +1 789245 . C T,A . . overlap +1 789245 . C T . . overlap +1 789245 . C A,T . . 
overlap +1 789245 . C T . . overlap +1 789245 . C CA,TTT,T . . overlap +1 790000 . C G . . . +1 800000 . CC G . . overlap +1 800001 . C G . . overlap +1 900000 . C G . . . diff --git a/test/remove-overlaps.1.3.out b/test/remove-overlaps.1.3.out new file mode 100644 index 000000000..c0753099a --- /dev/null +++ b/test/remove-overlaps.1.3.out @@ -0,0 +1,4 @@ +1 789241 +1 789243 +1 790000 +1 900000 diff --git a/test/remove-overlaps.1.4.out b/test/remove-overlaps.1.4.out new file mode 100644 index 000000000..ffda32be6 --- /dev/null +++ b/test/remove-overlaps.1.4.out @@ -0,0 +1,22 @@ +##fileformat=VCFv4.2 +##FILTER= +##reference=file:///usr/bio-ref/GRCh38.81/GRCh38.fa +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 100000 . CC G . . . +1 100001 . C G . . . +1 789242 . C G . . . +1 789242 . C A . . . +1 789243 . C CCA . . . +1 789244 . C A . . . +1 789244 . C CA . . . +1 789244 . C T . . . +1 789244 . C CCA . . . +1 789245 . C T,CA,TTT . . . +1 789245 . C T,A . . . +1 789245 . C T . . . +1 789245 . C A,T . . . +1 789245 . C T . . . +1 789245 . C CA,TTT,T . . . +1 800000 . CC G . . . +1 800001 . C G . . . diff --git a/test/remove-overlaps.1.5.out b/test/remove-overlaps.1.5.out new file mode 100644 index 000000000..8a2167bbc --- /dev/null +++ b/test/remove-overlaps.1.5.out @@ -0,0 +1,27 @@ +##fileformat=VCFv4.2 +##FILTER= +##reference=file:///usr/bio-ref/GRCh38.81/GRCh38.fa +##contig= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 100000 . CC G . . . +1 100001 . C G . . . +1 789241 . C G . . . +1 789242 . C G . . DUP +1 789242 . C A . . DUP +1 789243 . C CA . . DUP +1 789243 . C CCA . . DUP +1 789244 . C A . . DUP +1 789244 . C CA . . DUP +1 789244 . C T . . DUP +1 789244 . C CCA . . DUP +1 789245 . C T,CA,TTT . . DUP +1 789245 . C T,A . . DUP +1 789245 . C T . . DUP +1 789245 . C A,T . . DUP +1 789245 . C T . . DUP +1 789245 . C CA,TTT,T . . DUP +1 790000 . C G . . . +1 800000 . CC G . . . +1 800001 . C G . . . +1 900000 . C G . . . 
diff --git a/test/remove-overlaps.1.6.out b/test/remove-overlaps.1.6.out new file mode 100644 index 000000000..c7cf90557 --- /dev/null +++ b/test/remove-overlaps.1.6.out @@ -0,0 +1,27 @@ +##fileformat=VCFv4.2 +##FILTER= +##reference=file:///usr/bio-ref/GRCh38.81/GRCh38.fa +##contig= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 100000 . CC G . . unique +1 100001 . C G . . unique +1 789241 . C G . . unique +1 789242 . C G . . . +1 789242 . C A . . . +1 789243 . C CA . . . +1 789243 . C CCA . . . +1 789244 . C A . . . +1 789244 . C CA . . . +1 789244 . C T . . . +1 789244 . C CCA . . . +1 789245 . C T,CA,TTT . . . +1 789245 . C T,A . . . +1 789245 . C T . . . +1 789245 . C A,T . . . +1 789245 . C T . . . +1 789245 . C CA,TTT,T . . . +1 790000 . C G . . unique +1 800000 . CC G . . unique +1 800001 . C G . . unique +1 900000 . C G . . unique diff --git a/test/remove-overlaps.vcf b/test/remove-overlaps.1.vcf similarity index 86% rename from test/remove-overlaps.vcf rename to test/remove-overlaps.1.vcf index 647a955f1..c0207fd76 100644 --- a/test/remove-overlaps.vcf +++ b/test/remove-overlaps.1.vcf @@ -19,3 +19,7 @@ 1 789245 . C A,T . . . 1 789245 . C T . . . 1 789245 . C CA,TTT,T . . . +1 790000 . C G . . . +1 800000 . CC G . . . +1 800001 . C G . . . +1 900000 . C G . . . diff --git a/test/remove-overlaps.2.1.out b/test/remove-overlaps.2.1.out new file mode 100644 index 000000000..dc4ec4791 --- /dev/null +++ b/test/remove-overlaps.2.1.out @@ -0,0 +1,12 @@ +##fileformat=VCFv4.2 +##FILTER= +##reference=file:///usr/bio-ref/GRCh38.81/GRCh38.fa +##contig= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 100000 . CCCC C 5 . rmme +1 100001 . CCCC C 9 . . +1 100002 . CCCC C 7 . rmme +1 100005 . C T 9 . . +1 200001 . CCCC C 9 . . +1 200005 . C T 9 . . 
diff --git a/test/remove-overlaps.2.out b/test/remove-overlaps.2.vcf similarity index 51% rename from test/remove-overlaps.2.out rename to test/remove-overlaps.2.vcf index 5a8b7bd3b..9594631b7 100644 --- a/test/remove-overlaps.2.out +++ b/test/remove-overlaps.2.vcf @@ -1,8 +1,10 @@ ##fileformat=VCFv4.2 -##FILTER= ##reference=file:///usr/bio-ref/GRCh38.81/GRCh38.fa ##contig= #CHROM POS ID REF ALT QUAL FILTER INFO -1 100000 . CC G . . . -1 100001 . C G . . . -1 789241 . C G . . . +1 100000 . CCCC C 5 . . +1 100001 . CCCC C 9 . . +1 100002 . CCCC C 7 . . +1 100005 . C T 9 . . +1 200001 . CCCC C 9 . . +1 200005 . C T 9 . . diff --git a/test/remove-overlaps.3.1.out b/test/remove-overlaps.3.1.out new file mode 100644 index 000000000..6a9443ad9 --- /dev/null +++ b/test/remove-overlaps.3.1.out @@ -0,0 +1,28 @@ +##fileformat=VCFv4.2 +##FILTER= +##INFO= +##contig= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +chr 398016 . T . . DP=74 +chr 398017 . GGGGACCCTTCCCTGTGCCTGGCAGAGCGGTGCAATGCA G, 27.74 . DP=93;rmme +chr 398036 . T C, 161.77 . DP=79 +chr 398037 . G . . DP=114;rmme +chr 398038 . G . . DP=114;rmme +chr 398039 . C . . DP=114;rmme +chr 398040 . A . . DP=114;rmme +chr 398041 . G . . DP=114;rmme +chr 398042 . A . . DP=114;rmme +chr 398043 . G . . DP=114;rmme +chr 398044 . C . . DP=114;rmme +chr 398045 . G A, 209.77 . DP=89 +chr 398046 . G . . DP=109;rmme +chr 398047 . T . . DP=109;rmme +chr 398048 . G . . DP=109;rmme +chr 398049 . C . . DP=109;rmme +chr 398050 . A . . DP=109;rmme +chr 398051 . A . . DP=109;rmme +chr 398052 . T . . DP=109;rmme +chr 398053 . G . . DP=109;rmme +chr 398054 . C . . DP=109;rmme +chr 398055 . A . . DP=98;rmme diff --git a/test/remove-overlaps.3.2.out b/test/remove-overlaps.3.2.out new file mode 100644 index 000000000..a6d115090 --- /dev/null +++ b/test/remove-overlaps.3.2.out @@ -0,0 +1,28 @@ +##fileformat=VCFv4.2 +##FILTER= +##INFO= +##contig= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +chr 398016 . T . . DP=74 +chr 398017 . 
GGGGACCCTTCCCTGTGCCTGGCAGAGCGGTGCAATGCA G, 27.74 . DP=93;rmme +chr 398036 . T C, 161.77 . DP=79 +chr 398037 . G . . DP=114 +chr 398038 . G . . DP=114 +chr 398039 . C . . DP=114 +chr 398040 . A . . DP=114 +chr 398041 . G . . DP=114 +chr 398042 . A . . DP=114 +chr 398043 . G . . DP=114 +chr 398044 . C . . DP=114 +chr 398045 . G A, 209.77 . DP=89 +chr 398046 . G . . DP=109 +chr 398047 . T . . DP=109 +chr 398048 . G . . DP=109 +chr 398049 . C . . DP=109 +chr 398050 . A . . DP=109 +chr 398051 . A . . DP=109 +chr 398052 . T . . DP=109 +chr 398053 . G . . DP=109 +chr 398054 . C . . DP=109 +chr 398055 . A . . DP=98 diff --git a/test/remove-overlaps.3.vcf b/test/remove-overlaps.3.vcf new file mode 100644 index 000000000..4f5e35c0b --- /dev/null +++ b/test/remove-overlaps.3.vcf @@ -0,0 +1,26 @@ +##fileformat=VCFv4.2 +##INFO= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +chr 398016 . T . . DP=74 +chr 398017 . GGGGACCCTTCCCTGTGCCTGGCAGAGCGGTGCAATGCA G, 27.74 . DP=93 +chr 398036 . T C, 161.77 . DP=79 +chr 398037 . G . . DP=114 +chr 398038 . G . . DP=114 +chr 398039 . C . . DP=114 +chr 398040 . A . . DP=114 +chr 398041 . G . . DP=114 +chr 398042 . A . . DP=114 +chr 398043 . G . . DP=114 +chr 398044 . C . . DP=114 +chr 398045 . G A, 209.77 . DP=89 +chr 398046 . G . . DP=109 +chr 398047 . T . . DP=109 +chr 398048 . G . . DP=109 +chr 398049 . C . . DP=109 +chr 398050 . A . . DP=109 +chr 398051 . A . . DP=109 +chr 398052 . T . . DP=109 +chr 398053 . G . . DP=109 +chr 398054 . C . . DP=109 +chr 398055 . A . . 
DP=98 diff --git a/test/split-vep.12.3.out b/test/split-vep.12.3.out new file mode 100644 index 000000000..0eddfda70 --- /dev/null +++ b/test/split-vep.12.3.out @@ -0,0 +1,8 @@ +#POS Allele Consequence IMPACT SYMBOL Gene Feature_type Feature BIOTYPE EXON INTRON HGVSc HGVSp cDNA_position CDS_position Protein_position Amino_acids Codons Existing_variation ALLELE_NUM DISTANCE STRAND FLAGS VARIANT_CLASS SYMBOL_SOURCE HGNC_ID CANONICAL TSL APPRIS CCDS ENSP SWISSPROT TREMBL UNIPARC SOURCE GENE_PHENO SIFT PolyPhen DOMAINS miRNA HGVS_OFFSET AF AFR_AF AMR_AF EAS_AF EUR_AF SAS_AF AA_AF EA_AF gnomAD_AF gnomAD_AFR_AF gnomAD_AMR_AF gnomAD_ASJ_AF gnomAD_EAS_AF gnomAD_FIN_AF gnomAD_NFE_AF gnomAD_OTH_AF gnomAD_SAS_AF MAX_AF MAX_AF_POPS CLIN_SIG SOMATIC PHENO PUBMED MOTIF_NAME MOTIF_POS HIGH_INF_POS MOTIF_SCORE_CHANGE LoF LoF_filter LoF_flags LoF_info CADD_PHRED CADD_RAW gnomAD2.1 gnomAD2.1_AF_raw gnomAD2.1_AF_popmax gnomAD2.1_AF_afr gnomAD2.1_AF_amr gnomAD2.1_AF_asj gnomAD2.1_AF_eas gnomAD2.1_AF_fin gnomAD2.1_AF_nfe gnomAD2.1_AF_oth gnomAD2.1_AF_sas +14464 T non_coding_transcript_exon_variant MODIFIER WASH7P ENSG00000227232 Transcript ENST00000423562 unprocessed_pseudogene 10/10 . ENST00000423562.1:n.1568T>A . 1568 . . . . rs546169444 1 . -1 . SNV HGNC 38034 . . . . . . . . . . . . . . . 0.0958 0.0144 0.1138 0.005 0.1859 0.1943 . . . . . . . . . . . 0.1943 SAS . . . . . . . . . . . . . . . . . . . . . . . . . +14464 T non_coding_transcript_exon_variant MODIFIER WASH7P ENSG00000227232 Transcript ENST00000438504 unprocessed_pseudogene 12/12 . ENST00000438504.2:n.1682T>A . 1682 . . . . rs546169444 1 . -1 . SNV HGNC 38034 YES . . . . . . . . . . . . . . 0.0958 0.0144 0.1138 0.005 0.1859 0.1943 . . . . . . . . . . . 0.1943 SAS . . . . . . . . . . . . . . . . . . . . . . . . . +14464 T downstream_gene_variant MODIFIER DDX11L1 ENSG00000223972 Transcript ENST00000456328 processed_transcript . . . . . . . . . rs546169444 1 55 1 . SNV HGNC 37102 YES . . . . . . . . . . . . . . 
0.0958 0.0144 0.1138 0.005 0.1859 0.1943 . . . . . . . . . . . 0.1943 SAS . . . . . . . . . . . . . . . . . . . . . . . . . +14464 T non_coding_transcript_exon_variant MODIFIER WASH7P ENSG00000227232 Transcript ENST00000488147 unprocessed_pseudogene 11/11 . ENST00000488147.1:n.1291T>A . 1291 . . . . rs546169444 1 . -1 . SNV HGNC 38034 . . . . . . . . . . . . . . . 0.0958 0.0144 0.1138 0.005 0.1859 0.1943 . . . . . . . . . . . 0.1943 SAS . . . . . . . . . . . . . . . . . . . . . . . . . +14464 T non_coding_transcript_exon_variant MODIFIER WASH7P ENSG00000227232 Transcript ENST00000538476 unprocessed_pseudogene 13/13 . ENST00000538476.1:n.1530T>A . 1530 . . . . rs546169444 1 . -1 . SNV HGNC 38034 . . . . . . . . . . . . . . . 0.0958 0.0144 0.1138 0.005 0.1859 0.1943 . . . . . . . . . . . 0.1943 SAS . . . . . . . . . . . . . . . . . . . . . . . . . +14464 T non_coding_transcript_exon_variant MODIFIER WASH7P ENSG00000227232 Transcript ENST00000541675 unprocessed_pseudogene 9/9 . ENST00000541675.1:n.1315T>A . 1315 . . . . rs546169444 1 . -1 . SNV HGNC 38034 . . . . . . . . . . . . . . . 0.0958 0.0144 0.1138 0.005 0.1859 0.1943 . . . . . . . . . . . 0.1943 SAS . . . . . . . . . . . . . . . . . . . . . . . . . +14464 T regulatory_region_variant MODIFIER . . RegulatoryFeature ENSR00000000002 open_chromatin_region . . . . . . . . . rs546169444 1 . . . SNV . . . . . . . . . . . . . . . . . 0.0958 0.0144 0.1138 0.005 0.1859 0.1943 . . . . . . . . . . . 0.1943 SAS . . . . . . . . . . . . . . . . . . . . . . . . . diff --git a/test/test-regidx.c b/test/test-regidx.c index a12aa129e..eed36ab74 100644 --- a/test/test-regidx.c +++ b/test/test-regidx.c @@ -33,12 +33,14 @@ #include #include #include +#include #include #include "regidx.h" static int verbose = 0; -void debug(const char *format, ...) +void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) +debug(const char *format, ...) { if ( verbose<2 ) return; va_list ap; @@ -46,7 +48,8 @@ void debug(const char *format, ...) 
vfprintf(stderr, format, ap); va_end(ap); } -void info(const char *format, ...) +void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) +info(const char *format, ...) { if ( verbose<1 ) return; va_list ap; @@ -54,7 +57,8 @@ void info(const char *format, ...) vfprintf(stderr, format, ap); va_end(ap); } -void error(const char *format, ...) +void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) HTS_NORETURN +error(const char *format, ...) { va_list ap; va_start(ap, format); diff --git a/test/test.pl b/test/test.pl index a455a5381..cf1e81fdf 100755 --- a/test/test.pl +++ b/test/test.pl @@ -1,6 +1,6 @@ #!/usr/bin/env perl # -# Copyright (C) 2012-2023 Genome Research Ltd. +# Copyright (C) 2012-2024 Genome Research Ltd. # # Author: Petr Danecek # @@ -59,6 +59,7 @@ run_test(\&test_vcf_isec,$opts,in=>['isec-miss.1.1','isec-miss.1.2','isec-miss.1.3'],out=>'isec-miss.1.1.out',args=>'-R {PATH}/isec-miss.1.regs.txt -n +1'); run_test(\&test_vcf_isec,$opts,in=>['isec-miss.2.1','isec-miss.2.2','isec-miss.2.3'],out=>'isec-miss.2.1.out',args=>'-n +1 -r 20:100,20:140,12:55,20:140,20:100'); run_test(\&test_vcf_isec,$opts,in=>['isec-miss.2.1','isec-miss.2.2','isec-miss.2.3'],out=>'isec-miss.2.1.out',args=>'-R {PATH}/isec-miss.1.regs.txt -n +1'); +run_test(\&test_vcf_merge,$opts,in=>['merge.11.a','merge.11.b'],out=>'merge.11.1.out',args=>''); run_test(\&test_vcf_merge,$opts,in=>['merge.join.a','merge.join.b'],out=>'merge.join.1.out',args=>'-i AF:join'); run_test(\&test_vcf_merge,$opts,in=>['merge.LPL.a'],out=>'merge.LPL.0.out',args=>'--force-single'); run_test(\&test_vcf_merge,$opts,in=>['merge.LPL.a','merge.LPL.b','merge.LPL.c'],out=>'merge.LPL.1.out',args=>'--force-samples'); @@ -240,6 +241,10 @@ run_test(\&test_vcf_query,$opts,in=>'query.smpl',out=>'query.smpl.6.out',args=>q[-l -S {PATH}/query.smpl.txt]); run_test(\&test_vcf_query,$opts,in=>'query.filter.id',out=>'query.filter.id.1.out',args=>q[-f'%ID\\n' -i'ID~"s12"']); 
run_test(\&test_vcf_query,$opts,in=>'query.filter.id',out=>'query.filter.id.2.out',args=>q[-f'%ID\\n' -i'ID="rs123"']); +run_test(\&test_vcf_query,$opts,in=>'query.filter.id',out=>'query.filter.id.3.out',args=>q[-f'%ID\\n' -i'ID="abc"']); +run_test(\&test_vcf_query,$opts,in=>'query.filter.id',out=>'query.filter.id.3.out',args=>q[-f'%ID\\n' -i'ID=@].$$opts{path}.q[/query.filter.id.3.txt']); +run_test(\&test_vcf_query,$opts,in=>'query.filter.id',out=>'query.filter.id.4.out',args=>q[-f'%ID\\n' -i'ID!="abc"']); +run_test(\&test_vcf_query,$opts,in=>'query.filter.id',out=>'query.filter.id.4.out',args=>q[-f'%ID\\n' -i'ID!=@].$$opts{path}.q[/query.filter.id.3.txt']); run_test(\&test_vcf_query,$opts,in=>'filter.12',out=>'query.85.out',args=>q[-i'FILTER="A"' -f'%FILTER\\n']); run_test(\&test_vcf_query,$opts,in=>'filter.12',out=>'query.86.out',args=>q[-i'FILTER~"A"' -f'%FILTER\\n']); run_test(\&test_vcf_query,$opts,in=>'filter.12',out=>'query.87.out',args=>q[-i'FILTER="A;B"' -f'%FILTER\\n']); @@ -254,8 +259,14 @@ run_test(\&test_vcf_query,$opts,in=>'query.header',out=>'query.97.out',args=>q[-H -f'%CHROM %POS[ %SAMPLE %DP %GT]']); run_test(\&test_vcf_query,$opts,in=>'query.header',out=>'query.98.out',args=>q[-H -f'%CHROM %POS[ %SAMPLE][ %DP][ %GT]\\n']); run_test(\&test_vcf_query,$opts,in=>'query.header',out=>'query.98.out',args=>q[-H -f'%CHROM %POS[ %SAMPLE][ %DP][ %GT]']); +run_test(\&test_vcf_query,$opts,in=>'query.header',out=>'query.98.2.out',args=>q[-HH -f'%CHROM %POS[ %SAMPLE][ %DP][ %GT]']); run_test(\&test_vcf_query,$opts,in=>'query.filter-or',out=>'query.filter-or.1.out',args=>q[-f'[%SAMPLE %DP\\n]' -i'DP=1 || DP=2']); run_test(\&test_vcf_query,$opts,in=>'query.filter-or',out=>'query.filter-or.2.out',args=>q[-f'[%SAMPLE %DP\\n]' -i'DP=1 | DP=2']); +run_test(\&test_vcf_norm,$opts,in=>'norm.sort',out=>'norm.sort.1.out',args=>'-m -'); +run_test(\&test_vcf_norm,$opts,in=>'norm.sort',out=>'norm.sort.2.out',args=>'-m - -S lex'); 
+run_test(\&test_vcf_norm,$opts,in=>'norm.join-missing-ploidy',out=>'norm.join-missing-ploidy.1.out',args=>'-m +both'); +run_test(\&test_vcf_norm,$opts,in=>'norm.split.5',out=>'norm.split.5.1.out',args=>'-m - --multi-overlaps .'); +run_test(\&test_vcf_norm,$opts,in=>'norm.symbolic.3',out=>'norm.symbolic.3.1.out',fai=>'norm.symbolic.3',args=>''); run_test(\&test_vcf_norm,$opts,in=>'norm',out=>'norm.out',fai=>'norm',args=>'-cx'); run_test(\&test_vcf_norm,$opts,in=>'norm.split',out=>'norm.split.out',args=>'-m-'); run_test(\&test_vcf_norm,$opts,in=>'norm.split.2',out=>'norm.split.2.out',args=>'-m-'); @@ -281,9 +292,12 @@ run_test(\&test_vcf_norm,$opts,in=>'norm.rmdup.2',out=>'norm.rmdup.2.2.out',args=>'-d any'); run_test(\&test_vcf_norm,$opts,in=>'norm.rmdup.2',out=>'norm.rmdup.2.2.out',args=>'-d both'); run_test(\&test_vcf_norm,$opts,in=>'norm.rmdup.2',out=>'norm.rmdup.2.2.out',args=>'-d snps'); +run_test(\&test_vcf_norm,$opts,in=>'norm.rmdup.3',fai=>'norm.rmdup.3',out=>'norm.rmdup.3.1.out',args=>'-d exact'); +run_test(\&test_vcf_norm,$opts,in=>'norm.rmdup.3',fai=>'norm.rmdup.3',out=>'norm.rmdup.3.2.out',args=>'-d all'); run_test(\&test_vcf_norm,$opts,in=>'norm.2',fai=>'norm.2',out=>'norm.2.out',args=>'-c s -a'); run_test(\&test_vcf_norm,$opts,in=>'norm.iupac',fai=>'norm.iupac',out=>'norm.iupac.out',args=>'-c s'); run_test(\&test_vcf_norm,$opts,in=>'norm.3',fai=>'norm.3',out=>'norm.3.out',args=>'-c s'); +run_test(\&test_vcf_norm,$opts,in=>'norm.3',fai=>'norm.3',out=>'norm.3.2.out',args=>q[-c s -i'alt="N"']); run_test(\&test_vcf_norm,$opts,in=>'atomize.split.1',out=>'atomize.split.1.0.out',args=>['-a --old-rec-tag OLD_REC','-m -any --force']); run_test(\&test_vcf_norm,$opts,in=>'atomize.split.1',out=>'atomize.split.1.1.out',args=>['-m -any --old-rec-tag OLD_REC --force','-a']); run_test(\&test_vcf_norm,$opts,in=>'atomize.split.1',out=>'atomize.split.1.1.out',args=>'-m -any --old-rec-tag OLD_REC --force -a'); @@ -293,6 +307,9 @@ 
run_test(\&test_vcf_norm,$opts,in=>'atomize.split.2',out=>'atomize.split.2.2.out',args=>'--atomize --atom-overlaps . --old-rec-tag OLD_REC'); run_test(\&test_vcf_norm,$opts,in=>'atomize.split.3',out=>'atomize.split.3.1.out',args=>'--atomize --atom-overlaps .'); run_test(\&test_vcf_norm,$opts,in=>'atomize.split.4',out=>'atomize.split.4.1.out',args=>'--atomize --atom-overlaps . --old-rec-tag OLD_REC'); +run_test(\&test_vcf_norm,$opts,in=>'atomize.split.4',out=>'atomize.split.4.2.out',args=>q[--atomize --atom-overlaps . --old-rec-tag OLD_REC -i 'ILEN=0']); +run_test(\&test_vcf_norm,$opts,in=>'atomize.split.5',out=>'atomize.split.5.1.out',args=>q[--atomize --old-rec-tag OLD_REC --atom-overlaps .]); +run_test(\&test_vcf_norm,$opts,in=>'atomize.split.5',out=>'atomize.split.5.2.out',args=>q[--atomize --old-rec-tag OLD_REC]); run_test(\&test_vcf_norm,$opts,in=>'norm.4',out=>'norm.4.1.out',args=>'-m +both'); run_test(\&test_vcf_norm,$opts,in=>'norm.4',out=>'norm.4.2.out',args=>'-m +any'); run_test(\&test_vcf_norm,$opts,in=>'norm.5',out=>'norm.5.1.out',args=>'-m - --multi-overlaps 0'); @@ -309,7 +326,9 @@ run_test(\&test_vcf_norm,$opts,in=>'norm.split.merge',out=>'norm.split.merge.1.out',args=>['-m -','-m +both']); run_test(\&test_vcf_norm,$opts,in=>'norm.split.merge',out=>'norm.split.merge.2.out',args=>['-m -','-m +indels']); run_test(\&test_vcf_norm,$opts,in=>'norm.split.merge',out=>'norm.split.merge.3.out',args=>['-m -','-m +snps']); +run_test(\&test_vcf_norm,$opts,in=>'norm.split.merge',out=>'norm.split.merge.3.out',args=>['-m -','-m +snps']); run_test(\&test_vcf_norm,$opts,in=>'norm.split.merge',out=>'norm.split.merge.4.out',args=>['-m -','-m +any']); +run_test(\&test_vcf_norm,$opts,in=>'norm.split.merge',out=>'norm.split.merge.5.out',args=>q[-m - -i 'type="snp"']); run_test(\&test_vcf_norm,$opts,in=>'norm.merge.4',out=>'norm.merge.4.1.out',args=>'-m +any'); run_test(\&test_vcf_norm,$opts,in=>'norm.merge.4',out=>'norm.merge.4.2.out',args=>'-m +both'); 
run_test(\&test_vcf_view,$opts,in=>'filter.string.1',out=>'filter.string.1.1.out',args=>q[-i 'INFO/TAG=@{PATH}/filter.string.1.txt']); @@ -501,6 +520,10 @@ run_test(\&test_vcf_sort,$opts,in=>'sort',out=>'sort.out',args=>q[-m 0],fmt=>'%CHROM\\t%POS\\t%REF,%ALT\\n'); run_test(\&test_vcf_sort,$opts,in=>'sort',out=>'sort.out',args=>q[-m 1000],fmt=>'%CHROM\\t%POS\\t%REF,%ALT\\n'); run_test(\&test_vcf_regions,$opts,in=>'regions'); +run_test(\&test_vcf_annotate,$opts,in=>'annotate.escape.1',tab=>'annotate.escape.1',out=>'annotate.escape.1.1.out',args=>q[-c CHROM,POS,ISTR,FMT/FSTR]); +run_test(\&test_vcf_annotate,$opts,in=>'annotate.match.1',tab=>'annotate.match.1',out=>'annotate.match.1.1.out',args=>q[-c CHROM,POS,-,-,SCORE,~X,-,- -i'STR={X}']); +run_test(\&test_vcf_annotate,$opts,in=>'annotate.match.1',tab=>'annotate.match.1',out=>'annotate.match.1.2.out',args=>q[-c CHROM,POS,REF,ALT,SCORE,-,~X,- -i'INT={X}']); +run_test(\&test_vcf_annotate,$opts,in=>'annotate.match.1',tab=>'annotate.match.1',out=>'annotate.match.1.2.out',args=>q[-c CHROM,POS,REF,ALT,SCORE,-,-,~X -i'FLT={X}']); run_test(\&test_vcf_annotate,$opts,in=>'annotate',tab=>'annotate',out=>'annotate.out',args=>'-c CHROM,POS,REF,ALT,ID,QUAL,INFO/T_INT,INFO/T_FLOAT,INDEL'); run_test(\&test_vcf_annotate,$opts,in=>'annotate',tab=>'annotate2',out=>'annotate2.out',args=>'-c CHROM,POS,-,T_STR'); run_test(\&test_vcf_annotate,$opts,in=>'annotate',tab=>'annotate2',out=>'annotate22.out',args=>'-c CHROM,FROM,TO,T_STR'); @@ -556,7 +579,7 @@ run_test(\&test_vcf_annotate,$opts,in=>'annotate23',tab=>'annotate23',out=>'annotate31.out',args=>'-c CHROM,POS,~ID,REF,ALT,INFO/END'); run_test(\&test_vcf_annotate,$opts,in=>'annotate24.dst',vcf=>'annotate24.src',out=>'annotate24.1.out',args=>'-c XX'); run_test(\&test_vcf_annotate,$opts,in=>'annotate25',tab=>'annotate25',out=>'annotate25.1.out',args=>'-c CHROM,POS,ID,REF,ALT,~INFO/END'); 
-run_test(\&test_vcf_annotate,$opts,in=>'annotate26',tab=>'annotate26',out=>'annotate26.1.out',args=>'-c CHROM,POS,~POS'); +run_test(\&test_vcf_annotate,$opts,in=>'annotate26',tab=>'annotate26',out=>'annotate26.1.out',args=>'-c CHROM,POS,-POS'); run_test(\&test_vcf_annotate,$opts,in=>'annotate.missing',tab=>'annotate.missing',out=>'annotate.missing.1.out',args=>'-c CHROM,POS,REF,ALT,TSTR,TFLT,TINT'); run_test(\&test_vcf_annotate,$opts,in=>'annotate.missing',tab=>'annotate.missing',out=>'annotate.missing.2.out',args=>'-c CHROM,POS,REF,ALT,.TSTR,.TFLT,.TINT'); run_test(\&test_vcf_annotate,$opts,in=>'annotate.missing',tab=>'annotate.missing',out=>'annotate.missing.3.out',args=>'-c CHROM,POS,REF,ALT,.+TSTR,.+TFLT,.+TINT'); @@ -688,13 +711,21 @@ run_test(\&test_vcf_plugin,$opts,in=>'trio-dnm/trio-dnm.9',out=>'trio-dnm/trio-dnm.9.1.out',cmd=>'+trio-dnm2',args=>"-p 1X:proband,father,mother --use-NAIVE | $$opts{bin}/bcftools query -f'[\\t%DNM]\\n'"); run_test(\&test_vcf_plugin,$opts,in=>'trio-dnm/trio-dnm.9',out=>'trio-dnm/trio-dnm.9.2.out',cmd=>'+trio-dnm2',args=>"-p 2X:proband,father,mother --use-NAIVE | $$opts{bin}/bcftools query -f'[\\t%DNM]\\n'"); run_test(\&test_vcf_plugin,$opts,in=>'trio-dnm/trio-dnm.10',out=>'trio-dnm/trio-dnm.10.1.out',cmd=>'+trio-dnm2',args=>"-p proband,father,mother --with-pAD | $$opts{bin}/bcftools query -f'[\\t%DNM][\\t%VAF]\\n'"); -run_test(\&test_vcf_plugin,$opts,in=>'trio-dnm/trio-dnm.11',out=>'trio-dnm/trio-dnm.11.1.out',cmd=>'+trio-dnm2',args=>"-p proband,father,mother | $$opts{bin}/bcftools query -f'[\\t%DNM][\\t%VAF]\\n'"); -run_test(\&test_vcf_plugin,$opts,in=>'trio-dnm/trio-dnm.11',out=>'trio-dnm/trio-dnm.11.2.out',cmd=>'+trio-dnm2',args=>"-p 1X:proband,father,mother --strictly-novel | $$opts{bin}/bcftools query -f'[\\t%DNM][\\t%VAF]\\n'"); +run_test(\&test_vcf_plugin,$opts,in=>'trio-dnm/trio-dnm.11',out=>'trio-dnm/trio-dnm.11.1.out',cmd=>'+trio-dnm2',args=>"-p proband,father,mother | $$opts{bin}/bcftools query -f'%CHROM:%POS 
DNM=[%DNM ]\\tAD=[%AD ]\\tQS=[%QS ]\\tVAF=[%VAF ]\\n'"); +run_test(\&test_vcf_plugin,$opts,in=>'trio-dnm/trio-dnm.11',out=>'trio-dnm/trio-dnm.11.2.out',cmd=>'+trio-dnm2',args=>"-p 1X:proband,father,mother --strictly-novel | $$opts{bin}/bcftools query -f'%CHROM:%POS DNM=[%DNM ]\\tAD=[%AD ]\\tQS=[%QS ]\\tVAF=[%VAF ]\\n'"); run_test(\&test_vcf_plugin,$opts,in=>'gvcfz',out=>'gvcfz.1.out',cmd=>'+gvcfz',args=>qq[-g 'PASS:GT!="alt"' -a | $$opts{bin}/bcftools query -f'%POS\\t%REF\\t%ALT\\t%END[\\t%GT][\\t%DP][\\t%GQ][\\t%RGQ]\\n']); run_test(\&test_vcf_plugin,$opts,in=>'gvcfz',out=>'gvcfz.2.out',cmd=>'+gvcfz',args=>qq[-g 'PASS:GQ>10; FLT:-' -a | $$opts{bin}/bcftools query -f'%POS\\t%REF\\t%ALT\\t%FILTER\\t%END[\\t%GT][\\t%DP][\\t%GQ][\\t%RGQ]\\n']); run_test(\&test_vcf_plugin,$opts,in=>'gvcfz.2',out=>'gvcfz.2.1.out',cmd=>'+gvcfz',args=>qq[-g 'PASS:GT!="alt"' -a | $$opts{bin}/bcftools query -f'%POS\\t%REF\\t%ALT\\t%FILTER\\t%END[\\t%GT][\\t%DP]\\n']); -run_test(\&test_vcf_plugin,$opts,in=>'remove-overlaps',out=>'remove-overlaps.1.out',cmd=>'+remove-overlaps',args=>''); -run_test(\&test_vcf_plugin,$opts,in=>'remove-overlaps',out=>'remove-overlaps.2.out',cmd=>'+remove-overlaps',args=>'-d'); +run_test(\&test_vcf_plugin,$opts,in=>'remove-overlaps.1',out=>'remove-overlaps.1.1.out',cmd=>'+remove-overlaps',args=>'-m overlap'); +run_test(\&test_vcf_plugin,$opts,in=>'remove-overlaps.1',out=>'remove-overlaps.1.2.out',cmd=>'+remove-overlaps',args=>'-m overlap -M overlap'); +run_test(\&test_vcf_plugin,$opts,in=>'remove-overlaps.1',out=>'remove-overlaps.1.3.out',cmd=>'+remove-overlaps',args=>'-m overlap -O t'); +run_test(\&test_vcf_plugin,$opts,in=>'remove-overlaps.1',out=>'remove-overlaps.1.4.out',cmd=>'+remove-overlaps',args=>'-m overlap --reverse'); +run_test(\&test_vcf_plugin,$opts,in=>'remove-overlaps.1',out=>'remove-overlaps.1.5.out',cmd=>'+remove-overlaps',args=>'-m dup -M DUP'); 
+run_test(\&test_vcf_plugin,$opts,in=>'remove-overlaps.1',out=>'remove-overlaps.1.6.out',cmd=>'+remove-overlaps',args=>'-m dup -M unique --reverse'); +run_test(\&test_vcf_plugin,$opts,in=>'remove-overlaps.2',out=>'remove-overlaps.2.1.out',cmd=>'+remove-overlaps',args=>q[-m 'min(QUAL)' -M rmme]); +run_test(\&test_vcf_plugin,$opts,in=>'remove-overlaps.3',out=>'remove-overlaps.3.1.out',cmd=>'+remove-overlaps',args=>q[-m 'min(QUAL)' -M rmme]); +run_test(\&test_vcf_plugin,$opts,in=>'remove-overlaps.3',out=>'remove-overlaps.3.1.out',cmd=>'+remove-overlaps',args=>q[-m 'min(QUAL)' -M rmme --missing 0]); +run_test(\&test_vcf_plugin,$opts,in=>'remove-overlaps.3',out=>'remove-overlaps.3.2.out',cmd=>'+remove-overlaps',args=>q[-m 'min(QUAL)' -M rmme --missing DP]); run_test(\&test_vcf_plugin,$opts,in=>'split-vep',out=>'split-vep.1.out',cmd=>'+split-vep',args=>qq[-c Consequence -s worst:missense+ | $$opts{bin}/bcftools query -f'%POS\\t%Consequence\\n']); run_test(\&test_vcf_plugin,$opts,in=>'split-vep',out=>'split-vep.2.out',cmd=>'+split-vep',args=>qq[-c Consequence -s worst:missense+ | $$opts{bin}/bcftools query -f'%POS\\t%Consequence\\n' -i'Consequence!="."']); run_test(\&test_vcf_plugin,$opts,in=>'split-vep',out=>'split-vep.2.out',cmd=>'+split-vep',args=>qq[-s worst:missense+ -f'%POS\\t%Consequence\\n']); @@ -713,6 +744,7 @@ run_test(\&test_vcf_plugin,$opts,in=>'split-vep',out=>'split-vep.11.out',cmd=>'+split-vep',args=>qq[-t 1:14464 -f '%POS\\t%CSQ\\n' -A tab]); run_test(\&test_vcf_plugin,$opts,in=>'split-vep',out=>'split-vep.12.out',cmd=>'+split-vep',args=>qq[-t 1:14464 -f '%POS\\t%CSQ\\n' -A tab -d]); run_test(\&test_vcf_plugin,$opts,in=>'split-vep',out=>'split-vep.12.2.out',cmd=>'+split-vep',args=>qq[-t 1:14464 -f '%POS\\t%CSQ\\n' -A tab -d -H]); +run_test(\&test_vcf_plugin,$opts,in=>'split-vep',out=>'split-vep.12.3.out',cmd=>'+split-vep',args=>qq[-t 1:14464 -f '%POS\\t%CSQ\\n' -A tab -d -HH]); 
run_test(\&test_vcf_plugin,$opts,in=>'split-vep.4',out=>'split-vep.13.out',cmd=>'+split-vep',args=>qq[-f '%POS\\t%BCSQ\\n' -a BCSQ -A tab -d]); run_test(\&test_vcf_plugin,$opts,in=>'split-vep.4',out=>'split-vep.13.out',cmd=>'+split-vep',args=>qq[-f '%POS\\t%BCSQ\\n' -A tab -d]); run_test(\&test_vcf_plugin,$opts,in=>'split-vep',out=>'split-vep.14.out',cmd=>'+split-vep',args=>qq[-c gnomAD_NFE_AF:real,ALLELE_NUM:int | $$opts{bin}/bcftools query -f'%POS\\t%gnomAD_NFE_AF\\t%ALLELE_NUM\\n']); @@ -842,6 +874,11 @@ run_test(\&test_vcf_convert_gvcf,$opts,in=>'convert.gvcf',out=>'convert.gvcf.out',fa=>'gvcf.fa',args=>'--gvcf2vcf -i\'FILTER="PASS"\''); run_test(\&test_vcf_convert_tsv2vcf,$opts,in=>'convert.23andme',out=>'convert.23andme.vcf',args=>'-c ID,CHROM,POS,AA -s SAMPLE1',fai=>'23andme'); run_test(\&test_vcf_convert_tsv2vcf,$opts,in=>'convert.tsv',out=>'convert.tsv.vcf',args=>'-c -,CHROM,POS,REF,ALT',fai=>'23andme'); +run_test(\&test_vcf_consensus,$opts,in=>'consensus.overlaps.1',out=>'consensus.overlaps.1.1.out',fa=>'consensus.overlaps.1.fa',args=>'-s A'); +run_test(\&test_vcf_consensus,$opts,in=>'consensus.overlaps.1',out=>'consensus.overlaps.1.2.out',fa=>'consensus.overlaps.1.fa',args=>'-s B'); +run_test(\&test_vcf_consensus,$opts,in=>'consensus.overlaps.1',out=>'consensus.overlaps.1.3.out',fa=>'consensus.overlaps.1.fa',args=>'-s A -a N'); +run_test(\&test_vcf_consensus,$opts,in=>'consensus.overlaps.1',out=>'consensus.overlaps.1.4.out',fa=>'consensus.overlaps.1.fa',args=>'-s B -a N'); +run_test(\&test_vcf_consensus,$opts,in=>'consensus.beyond',out=>'consensus.beyond.1.out',fa=>'consensus.beyond.fa',args=>''); run_test(\&test_vcf_consensus,$opts,in=>'consensus',out=>'consensus.1.out',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-s -'); run_test(\&test_vcf_consensus_chain,$opts,in=>'consensus',out=>'consensus.1.chain',chain=>'consensus.1.chain',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-s -'); 
run_test(\&test_vcf_consensus,$opts,in=>'consensus',out=>'consensus.2.out',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-H 1'); @@ -901,37 +938,37 @@ run_test(\&test_vcf_consensus,$opts,in=>'consensus.22',out=>'consensus22.1.out',fa=>'consensus.22.fa',args=>'--regions-overlap 0'); run_test(\&test_vcf_consensus,$opts,in=>'consensus.22',out=>'consensus22.3.out',fa=>'consensus.22.fa',args=>'--regions-overlap 1'); run_test(\&test_vcf_consensus,$opts,in=>'consensus.22',out=>'consensus22.3.out',fa=>'consensus.22.fa',args=>'--regions-overlap 2'); -run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.1.out',args=>q[-r17:100-150],test_list=>1); -run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.2.out',args=>q[-a DP,DV -r17:100-600]); # test files from samtools mpileup test suite -run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1)],out=>'mpileup/mpileup.3.out',args=>q[-B --ff 0x14 -r17:1050-1060]); # test file converted to vcf from samtools mpileup test suite -run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.4.out',args=>q[-a DP,DPR,DV,DP4,INFO/DPR,SP -r17:100-600]); #test files from samtools mpileup test suite +run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.1.out',args=>q[-r17:100-150 -a -AD],test_list=>1); +run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.2.out',args=>q[-a DP,DV -r17:100-600 -a -AD]); # test files from samtools mpileup test suite +run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1)],out=>'mpileup/mpileup.3.out',args=>q[-B --ff 0x14 -r17:1050-1060 -a -AD]); # test file converted to vcf from samtools mpileup test suite +run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.4.out',args=>q[-a DP,DPR,DV,DP4,INFO/DPR,SP,-AD -r17:100-600]); #test files from samtools mpileup test suite 
run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.5.out',args=>q[-a DP,AD,ADF,ADR,SP,INFO/AD,INFO/ADF,INFO/ADR -r17:100-600]); -run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.6.out',args=>q[-a DP,DV -r17:100-600 --gvcf 0,2,5]); -run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.6.out',args=>q[-a DP,DV -r17:100-200,17:201-300,17:301-400,17:401-500,17:501-600 --gvcf 0,2,5]); -run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.7.out',args=>q[-r17:100-150 -s HG00101,HG00102]); -run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.7.out',args=>q[-r17:100-150 -S {PATH}/mplp.samples]); -run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.8.out',args=>q[-r17:100-150 -s ^HG00101,HG00102]); -run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.8.out',args=>q[-r17:100-150 -S ^{PATH}/mplp.samples]); -run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.9.out',args=>q[-t17:100-150 -S {PATH}/mplp.9.samples]); -run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.10.out',args=>q[-t17:100-150 -G {PATH}/mplp.10.samples]); -run_test(\&test_mpileup,$opts,in=>[qw(mpileup.3)],out=>'mpileup/mpileup.11.out',args=>q[]); -run_test(\&test_mpileup,$opts,in=>[qw(mpileup.3 mpileup.4)],out=>'mpileup/mpileup.11.out',args=>q[-s HG00102]); -run_test(\&test_mpileup,$opts,in=>[qw(mpileup.3 mpileup.4)],out=>'mpileup/mpileup.11.out',args=>q[-s ^HG99999]); -run_test(\&test_mpileup,$opts,in=>[qw(mpileup.3 mpileup.4)],out=>'mpileup/mpileup.11.out',args=>q[-G {PATH}/mplp.11.rgs]); -run_test(\&test_mpileup,$opts,in=>[qw(mpileup.3 mpileup.4)],out=>'mpileup/mpileup.11.out',args=>q[-G {PATH}/mplp.11.rgs]); 
+run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.6.out',args=>q[-a DP,DV,-AD -r17:100-600 --gvcf 0,2,5]); +run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.6.out',args=>q[-a DP,DV,-AD -r17:100-200,17:201-300,17:301-400,17:401-500,17:501-600 --gvcf 0,2,5]); +run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.7.out',args=>q[-a -AD -r17:100-150 -s HG00101,HG00102]); +run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.7.out',args=>q[-a -AD -r17:100-150 -S {PATH}/mplp.samples]); +run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.8.out',args=>q[-a -AD -r17:100-150 -s ^HG00101,HG00102]); +run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.8.out',args=>q[-a -AD -r17:100-150 -S ^{PATH}/mplp.samples]); +run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.9.out',args=>q[-a -AD -t17:100-150 -S {PATH}/mplp.9.samples]); +run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.10.out',args=>q[-a -AD -t17:100-150 -G {PATH}/mplp.10.samples]); +run_test(\&test_mpileup,$opts,in=>[qw(mpileup.3)],out=>'mpileup/mpileup.11.out',args=>q[-a -AD]); +run_test(\&test_mpileup,$opts,in=>[qw(mpileup.3 mpileup.4)],out=>'mpileup/mpileup.11.out',args=>q[-a -AD -s HG00102]); +run_test(\&test_mpileup,$opts,in=>[qw(mpileup.3 mpileup.4)],out=>'mpileup/mpileup.11.out',args=>q[-a -AD -s ^HG99999]); +run_test(\&test_mpileup,$opts,in=>[qw(mpileup.3 mpileup.4)],out=>'mpileup/mpileup.11.out',args=>q[-a -AD -G {PATH}/mplp.11.rgs]); +run_test(\&test_mpileup,$opts,in=>[qw(mpileup.3 mpileup.4)],out=>'mpileup/mpileup.11.out',args=>q[-a -AD -G {PATH}/mplp.11.rgs]); run_test(\&test_mpileup,$opts,in=>[qw(indel-AD.1)],out=>'mpileup/indel-AD.1.out',ref=>'indel-AD.1.fa',args=>q[-a AD]); 
run_test(\&test_mpileup,$opts,in=>[qw(indel-AD.1)],out=>'mpileup/indel-AD.1cns.out',ref=>'indel-AD.1.fa',args=>q[-a AD --indels-cns]); run_test(\&test_mpileup,$opts,in=>[qw(indel-AD.2)],out=>'mpileup/indel-AD.2.out',ref=>'indel-AD.2.fa',args=>q[-a AD -r 11:75]); run_test(\&test_mpileup,$opts,in=>[qw(indel-AD.2)],out=>'mpileup/indel-AD.3.out',ref=>'indel-AD.2.fa',args=>q[-a AD -r 11:75 --ambig-reads incAD]); run_test(\&test_mpileup,$opts,in=>[qw(indel-AD.2)],out=>'mpileup/indel-AD.4.out',ref=>'indel-AD.2.fa',args=>q[-a AD -r 11:75 --ambig-reads incAD0]); -run_test(\&test_mpileup,$opts,in=>[qw(mpileup-SCR)],out=>'mpileup/mpileup-SCR.out',ref=>'mpileup-SCR.fa',args=>q[-a INFO/SCR,FMT/SCR]); -run_test(\&test_mpileup,$opts,in=>[qw(mpileup-filter)],out=>'mpileup/mpileup-filter.1.out',ref=>'mpileup-SCR.fa',args=>q[-t 1:100 --skip-all-set PAIRED,PROPER_PAIR,MREVERSE]); -run_test(\&test_mpileup,$opts,in=>[qw(mpileup-filter)],out=>'mpileup/mpileup-filter.1.out',ref=>'mpileup-SCR.fa',args=>q[-t 1:100 --skip-any-set READ1]); -run_test(\&test_mpileup,$opts,in=>[qw(mpileup-filter)],out=>'mpileup/mpileup-filter.2.out',ref=>'mpileup-SCR.fa',args=>q[-t 1:100 --skip-all-unset READ1]); -run_test(\&test_mpileup,$opts,in=>[qw(mpileup-filter)],out=>'mpileup/mpileup-filter.2.out',ref=>'mpileup-SCR.fa',args=>q[-t 1:100 --skip-any-unset READ1]); -run_test(\&test_mpileup,$opts,in=>[qw(annot-NMBZ.1)],ref=>'annot-NMBZ.1.fa',out=>'mpileup/annot-NMBZ.1.1.out',args=>q[-a INFO/NMBZ -r chr19:69-99]); -run_test(\&test_mpileup,$opts,in=>[qw(annot-NMBZ.2)],ref=>'annot-NMBZ.2.fa',out=>'mpileup/annot-NMBZ.2.1.out',args=>q[-a INFO/NMBZ -r chr6:75]); -run_test(\&test_mpileup,$opts,in=>[qw(annot-NMBZ.3.1 annot-NMBZ.3.2)],ref=>'annot-NMBZ.3.fa',out=>'mpileup/annot-NMBZ.3.1.out',args=>q[-a INFO/NMBZ -r chr16:75]); +run_test(\&test_mpileup,$opts,in=>[qw(mpileup-SCR)],out=>'mpileup/mpileup-SCR.out',ref=>'mpileup-SCR.fa',args=>q[-a -AD,INFO/SCR,FMT/SCR]); 
+run_test(\&test_mpileup,$opts,in=>[qw(mpileup-filter)],out=>'mpileup/mpileup-filter.1.out',ref=>'mpileup-SCR.fa',args=>q[-a -AD -t 1:100 --skip-all-set PAIRED,PROPER_PAIR,MREVERSE]); +run_test(\&test_mpileup,$opts,in=>[qw(mpileup-filter)],out=>'mpileup/mpileup-filter.1.out',ref=>'mpileup-SCR.fa',args=>q[-a -AD -t 1:100 --skip-any-set READ1]); +run_test(\&test_mpileup,$opts,in=>[qw(mpileup-filter)],out=>'mpileup/mpileup-filter.2.out',ref=>'mpileup-SCR.fa',args=>q[-a -AD -t 1:100 --skip-all-unset READ1]); +run_test(\&test_mpileup,$opts,in=>[qw(mpileup-filter)],out=>'mpileup/mpileup-filter.2.out',ref=>'mpileup-SCR.fa',args=>q[-a -AD -t 1:100 --skip-any-unset READ1]); +run_test(\&test_mpileup,$opts,in=>[qw(annot-NMBZ.1)],ref=>'annot-NMBZ.1.fa',out=>'mpileup/annot-NMBZ.1.1.out',args=>q[-a -AD,INFO/NMBZ -r chr19:69-99]); +run_test(\&test_mpileup,$opts,in=>[qw(annot-NMBZ.2)],ref=>'annot-NMBZ.2.fa',out=>'mpileup/annot-NMBZ.2.1.out',args=>q[-a -AD,INFO/NMBZ -r chr6:75]); +run_test(\&test_mpileup,$opts,in=>[qw(annot-NMBZ.3.1 annot-NMBZ.3.2)],ref=>'annot-NMBZ.3.fa',out=>'mpileup/annot-NMBZ.3.1.out',args=>q[-a -AD,INFO/NMBZ -r chr16:75]); run_test(\&test_csq,$opts,in=>'csq',out=>'csq.1.out',cmd=>'-f {PATH}/csq.fa -g {PATH}/csq.gff3'); run_test(\&test_csq,$opts,in=>'csq',out=>'csq.1.out',cmd=>'-f {PATH}/csq.fa -g {PATH}/csq.chr.gff3'); run_test(\&test_csq,$opts,in=>'csq.2',out=>'csq.2.out',cmd=>'-f {PATH}/csq.fa -g {PATH}/csq.2.gff',tbcsq=>1); @@ -952,6 +989,7 @@ run_test(\&test_roh,$opts,in=>'roh.1',out=>'roh.1.3.out',args=>q[ -G30 --AF-dflt 0.4 -r 1:100174876-100318245 --ignore-homref]); run_test(\&test_roh,$opts,in=>'roh.1',out=>'roh.1.3.out',args=>q[ -G30 --AF-dflt 0.4 -r 1:100174876-100318245 --ignore-homref --include-noalt]); run_test(\&test_roh,$opts,in=>'roh.1',out=>'roh.1.4.out',args=>q[ -G30 --AF-dflt 0.4 -r 1:100174876-100318245 --include-noalt]); +run_test(\&test_gtcheck,$opts,in=>'gtcheck.1',gts=>'gtcheck.1.gts',out=>'gtcheck.1.2.out',args=>q[-e 0 --no-HWE-prob]); 
run_test(\&test_gtcheck,$opts,in=>'gtcheck.1',gts=>'gtcheck.1.gts',out=>'gtcheck.1.out',args=>q[-e 0]); run_test(\&test_gtcheck,$opts,in=>'gtcheck.1',gts=>'gtcheck.1.gts',out=>'gtcheck.1.out',args=>q[-e 0 -u GT,GT]); run_test(\&test_gtcheck,$opts,in=>'gtcheck.1',gts=>'gtcheck.1.gts',out=>'gtcheck.1.out',args=>q[-e 0 -u GT,PL]); @@ -1811,7 +1849,8 @@ sub test_vcf_plugin { my ($opts,%args) = @_; if ( !$$opts{test_plugins} ) { return; } - $ENV{BCFTOOLS_PLUGINS} = "$$opts{bin}/plugins"; + # Sadly, this does not work: + # $ENV{BCFTOOLS_PLUGINS} = "$$opts{bin}/plugins"; if ( !exists($args{args}) ) { $args{args} = ''; } my $wpath = $$opts{path}; if ($^O =~ /^msys/) { @@ -1828,11 +1867,11 @@ sub test_vcf_plugin { for my $file (@{$args{index}}) { bgzip_tabix_vcf($opts,$file); } } - test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools $args{cmd} $$opts{tmp}/$args{in}.vcf.gz $args{args} | grep -v ^##bcftools_"); + test_cmd($opts,%args,cmd=>"export BCFTOOLS_PLUGINS=$$opts{bin}/plugins; $$opts{bin}/bcftools $args{cmd} $$opts{tmp}/$args{in}.vcf.gz $args{args} | grep -v ^##bcftools_"); cmd("$$opts{bin}/bcftools view -Ob $$opts{tmp}/$args{in}.vcf.gz > $$opts{tmp}/$args{in}.bcf"); cmd("$$opts{bin}/bcftools index -f $$opts{tmp}/$args{in}.bcf"); - test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools $args{cmd} $$opts{tmp}/$args{in}.bcf $args{args} | grep -v ^##bcftools_", exp_fix=>1); + test_cmd($opts,%args,cmd=>"export BCFTOOLS_PLUGINS=$$opts{bin}/plugins; $$opts{bin}/bcftools $args{cmd} $$opts{tmp}/$args{in}.bcf $args{args} | grep -v ^##bcftools_", exp_fix=>1); } sub test_vcf_concat { @@ -2094,14 +2133,13 @@ sub test_plugin_split { my ($opts,%args) = @_; if ( !$$opts{test_plugins} ) { return; } - $ENV{BCFTOOLS_PLUGINS} = "$$opts{bin}/plugins"; my ($package, $filename, $line, $test)=caller(0); $test =~ s/^.+:://; if ( !exists($args{args}) ) { $args{args} = ''; } $args{args} =~ s/{PATH}/$$opts{path}/g; - cmd("$$opts{bin}/bcftools +split $$opts{path}/$args{in}.vcf -o $$opts{tmp}/$args{tmp} 
$args{args}"); + cmd("export BCFTOOLS_PLUGINS=$$opts{bin}/plugins; $$opts{bin}/bcftools +split $$opts{path}/$args{in}.vcf -o $$opts{tmp}/$args{tmp} $args{args}"); opendir(my $dh,"$$opts{tmp}/$args{tmp}") or failed($opts,$test,"Cannot read $$opts{tmp}/$args{tmp}: $!"); my @files = sort grep { !(/^\./) } readdir($dh); @@ -2110,7 +2148,7 @@ sub test_plugin_split my $files = join(' ',@files); test_cmd($opts,%args, cmd=> - "$$opts{bin}/bcftools +split $$opts{path}/$args{in}.vcf -o $$opts{tmp}/$args{tmp} $args{args} " . + "export BCFTOOLS_PLUGINS=$$opts{bin}/plugins; $$opts{bin}/bcftools +split $$opts{path}/$args{in}.vcf -o $$opts{tmp}/$args{tmp} $args{args} " . " && cd $$opts{tmp}/$args{tmp} " . " && for f in $files; do echo \$f; $$opts{bin}/bcftools query -l \$f; $$opts{bin}/bcftools view -H \$f; done" ); @@ -2119,21 +2157,20 @@ sub test_plugin_scatter { my ($opts,%args) = @_; if ( !$$opts{test_plugins} ) { return; } - $ENV{BCFTOOLS_PLUGINS} = "$$opts{bin}/plugins"; my ($package, $filename, $line, $test)=caller(0); $test =~ s/^.+:://; if ( !exists($args{args}) ) { $args{args} = ''; } $args{args} =~ s/{PATH}/$$opts{path}/g; - cmd("$$opts{bin}/bcftools +scatter $$opts{path}/$args{in}.vcf -o $$opts{tmp}/$args{tmp} $args{args}"); + cmd("export BCFTOOLS_PLUGINS=$$opts{bin}/plugins; $$opts{bin}/bcftools +scatter $$opts{path}/$args{in}.vcf -o $$opts{tmp}/$args{tmp} $args{args}"); opendir(my $dh,"$$opts{tmp}/$args{tmp}") or failed($opts,$test,"Cannot read $$opts{tmp}/$args{tmp}: $!"); my @files = sort grep { !(/^\./) } readdir($dh); closedir($dh) or failed($opts,$test,"Close failed: $$opts{tmp}/$args{tmp}"); my $files = join(' ',@files); - test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools +scatter $$opts{path}/$args{in}.vcf -o $$opts{tmp}/$args{tmp} $args{args} && cd $$opts{tmp}/$args{tmp} && cat $files | grep -v ^##"); + test_cmd($opts,%args,cmd=>"export BCFTOOLS_PLUGINS=$$opts{bin}/plugins; $$opts{bin}/bcftools +scatter $$opts{path}/$args{in}.vcf -o $$opts{tmp}/$args{tmp} 
$args{args} && cd $$opts{tmp}/$args{tmp} && cat $files | grep -v ^##"); } sub test_roh { diff --git a/test/trio-dnm/trio-dnm.11.1.out b/test/trio-dnm/trio-dnm.11.1.out index 864183963..2400afe42 100644 --- a/test/trio-dnm/trio-dnm.11.1.out +++ b/test/trio-dnm/trio-dnm.11.1.out @@ -1,3 +1,6 @@ - -0.0953102 . . 100 100 0 - -5.01188e-10 . . 29 0 0 - -20.5943 . . 79 56 0 +chr1:10000057 DNM=0 . . AD=30,30 0,26 0,26 QS=1116,1116 0,921 0,921 VAF=50 0 0 +chr1:10000057 DNM=-0.0953102 . . AD=0,30 0,26 38,0 QS=0,1116 0,921 1364,0 VAF=100 100 0 +chr1:10697377 DNM=-5.01188e-10 . . AD=4,10 0,61 0,69 QS=240,566 0,3553 0,409 VAF=29 0 0 +chr1:141907033 DNM=-inf . . AD=53,8 79,0 128,7 QS=2673,257 3268,0 5087,438 VAF=13 0 5 +chrX:10000804 DNM=-20.5943 . . AD=5,19 7,9 10,0 QS=174,671 255,323 370,0 VAF=79 56 0 +chrX:141907033 DNM=-5.98923 . . AD=53,8 79,0 128,7 QS=2673,257 3268,0 5087,238 VAF=13 0 5 diff --git a/test/trio-dnm/trio-dnm.11.2.out b/test/trio-dnm/trio-dnm.11.2.out index 8269ea0a7..766ee7719 100644 --- a/test/trio-dnm/trio-dnm.11.2.out +++ b/test/trio-dnm/trio-dnm.11.2.out @@ -1,3 +1,6 @@ - -inf . . 0 0 100 - -inf . . 29 0 0 - -inf . . 79 56 0 +chr1:10000057 DNM=-inf . . AD=30,30 0,26 0,26 QS=1116,1116 0,921 0,921 VAF=50 0 0 +chr1:10000057 DNM=-inf . . AD=0,30 0,26 38,0 QS=0,1116 0,921 1364,0 VAF=0 0 100 +chr1:10697377 DNM=-inf . . AD=4,10 0,61 0,69 QS=240,566 0,3553 0,409 VAF=29 0 0 +chr1:141907033 DNM=-inf . . AD=53,8 79,0 128,7 QS=2673,257 3268,0 5087,438 VAF=13 0 5 +chrX:10000804 DNM=-inf . . AD=5,19 7,9 10,0 QS=174,671 255,323 370,0 VAF=79 56 0 +chrX:141907033 DNM=-inf . . AD=53,8 79,0 128,7 QS=2673,257 3268,0 5087,238 VAF=13 0 5 diff --git a/test/trio-dnm/trio-dnm.11.vcf b/test/trio-dnm/trio-dnm.11.vcf index cf3d7a91d..4fd8a9c80 100644 --- a/test/trio-dnm/trio-dnm.11.vcf +++ b/test/trio-dnm/trio-dnm.11.vcf @@ -7,6 +7,9 @@ ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT proband father mother -chr1 10000057 . C T . . . 
GT:PL:AD:QS 1/1:255,90,0:0,30:0,1116 1/1:255,78,0:0,26:0,921 0/0:0,114,255:38,0:1364,0 -chr1 10697377 . C T . . . GT:PL:AD:QS 0/1:255,0,173:4,10:240,566 1/1:255,184,0:0,61:0,3553 1/1:255,208,0:0,69:0,409 -chrX 10000804 . C G . . . GT:PL:AD:QS 0/1:255,0,85:5,19:174,671 0/1:202,0,157:7,9:255,323 0/0:0,30,227:10,0:370,0 +chr1 10000057 . C T . . . PL:AD:QS 90,0,255:30,30:1116,1116 255,78,0:0,26:0,921 255,78,0:0,26:0,921 +chr1 10000057 . C T . . . PL:AD:QS 255,90,0:0,30:0,1116 255,78,0:0,26:0,921 0,114,255:38,0:1364,0 +chr1 10697377 . C T . . . PL:AD:QS 255,0,173:4,10:240,566 255,184,0:0,61:0,3553 255,208,0:0,69:0,409 +chr1 141907033 . G A . . . PL:AD:QS 54,0,255:53,8:2673,257 0,238,255:79,0:3268,0 0,188,255:128,7:5087,438 +chrX 10000804 . C G . . . PL:AD:QS 255,0,85:5,19:174,671 202,0,157:7,9:255,323 0,30,227:10,0:370,0 +chrX 141907033 . G A . . . PL:AD:QS 54,0,255:53,8:2673,257 0,238,255:79,0:3268,0 0,188,255:128,7:5087,238 diff --git a/variantkey.h b/variantkey.h index 82cb9d174..3c7959674 100644 --- a/variantkey.h +++ b/variantkey.h @@ -92,7 +92,7 @@ static inline uint8_t encode_numeric_chrom(const char *chrom, size_t size) { if ((chrom[i] > '9') || (chrom[i] < '0')) { - return 0; // NA: a character that is not a numebr was found. + return 0; // NA: a character that is not a number was found. } v = ((v * 10) + (chrom[i] - '0')); } diff --git a/vcfannotate.c b/vcfannotate.c index bc2800143..b66c8cf51 100644 --- a/vcfannotate.c +++ b/vcfannotate.c @@ -1,6 +1,6 @@ /* vcfannotate.c -- Annotate and edit VCF/BCF files. - Copyright (C) 2013-2023 Genome Research Ltd. + Copyright (C) 2013-2024 Genome Research Ltd. 
Author: Petr Danecek @@ -104,6 +104,19 @@ typedef struct _annot_col_t } annot_col_t; +typedef struct +{ + char *name; // column name + int ht_type; // type, one of BCF_HT_STR,BCF_HT_INT,BCF_HT_REAL + int icol; // index of the annotation column to use + union { // memory area with the current annotation value to pass to filter_test_ext + int i; + float f; + char *s; + }; +} +ext_t; + // Logic of the filters: include or exclude sites which match the filters? #define FLT_INCLUDE 1 #define FLT_EXCLUDE 2 @@ -125,7 +138,7 @@ typedef struct _args_t regitr_t *tgt_itr; int tgt_is_bed; - filter_t *filter; + filter_t *filter, *filter_ext; // only one is initialized, the latter contains external values to set dynamically on the fly char *filter_str; int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE int keep_sites; @@ -149,6 +162,11 @@ typedef struct _args_t convert_t *set_ids; int set_ids_replace; + // external values for dynamic -i/-e expressions + int n_ext; + ext_t *ext; + void **ext_ptr; + int nsmpl_annot; int *sample_map, nsample_map, sample_is_file; // map[idst] -> isrc uint8_t *src_smpl_pld, *dst_smpl_pld; // for Number=G format fields @@ -617,7 +635,7 @@ static int setter_pos(args_t *args, bcf1_t *line, annot_col_t *col, void *data) char *tmp; int pos = strtol(tab->cols[col->icol], &tmp, 10); if ( tmp==tab->cols[col->icol] ) - error("Could not parse ~POS at %s:%"PRId64" .. [%s]\n",bcf_seqname(args->hdr,line),(int64_t)line->pos+1,tab->cols[col->icol]); + error("Could not parse -POS at %s:%"PRId64" .. 
[%s]\n",bcf_seqname(args->hdr,line),(int64_t)line->pos+1,tab->cols[col->icol]); line->pos = pos - 1; return 0; } @@ -1168,6 +1186,29 @@ void khash_str2int_clear_free(void *_hash) if (kh_exist(hash, k)) free((char*)kh_key(hash, k)); kh_clear(str2int, hash); } +static const char *escape_string(const char *str, char needle[], char **rmme, size_t *len) +{ + kstring_t tmp = {0,0,0}; + const char *bp = str, *ep = str; + while ( *ep ) + { + int i = 0; + while ( needle[i] && needle[i]!=*ep ) i++; + if ( !needle[i] ) { ep++; continue; } + kputsn(bp,ep-bp,&tmp); + ksprintf(&tmp,"%%%X",*ep); + bp = ++ep; + } + if ( !tmp.l ) + { + *len = strlen(str); + return str; + } + kputs(bp,&tmp); + *len = tmp.l; + *rmme = tmp.s; + return tmp.s; +} static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { if ( (col->replace & REPLACE_MISSING) && col->number!=BCF_VL_A && col->number!=BCF_VL_R ) @@ -1181,13 +1222,17 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d if ( col->replace & SET_OR_APPEND ) col->merge_method=MM_UNIQUE; annot_line_t *tab = (annot_line_t*) data; + const char *escaped = NULL; + char *rmme = NULL; - int len = 0; + size_t len = 0; if ( tab ) { - len = strlen(tab->cols[col->icol]); - if ( !len ) return 0; - if ( len==1 && tab->cols[col->icol][0]=='.' && col->merge_method!=MM_APPEND_MISSING && !(col->replace & CARRY_OVER_MISSING) ) return 1; + char *str = tab->cols[col->icol]; + if ( !str || !*str ) return 0; + if ( !str[1] && str[0]=='.' 
&& col->merge_method!=MM_APPEND_MISSING && !(col->replace & CARRY_OVER_MISSING) ) return 1; + char needle[] = {';','=',0}; + escaped = escape_string(tab->cols[col->icol],needle,&rmme,&len); } if ( col->merge_method!=MM_FIRST ) @@ -1201,8 +1246,12 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d if ( col->merge_method==MM_UNIQUE ) { if ( !col->mm_str_hash ) col->mm_str_hash = (khash_t(str2int)*)khash_str2int_init(); - if ( khash_str2int_has_key(col->mm_str_hash, tab->cols[col->icol]) ) return 1; - khash_str2int_inc(col->mm_str_hash, strdup(tab->cols[col->icol])); + if ( khash_str2int_has_key(col->mm_str_hash, escaped) ) + { + free(rmme); + return 1; + } + khash_str2int_inc(col->mm_str_hash, strdup(escaped)); } if ( (col->replace & SET_OR_APPEND) && !col->mm_kstr.l ) @@ -1214,17 +1263,20 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d } if ( col->mm_kstr.l ) kputc(',',&col->mm_kstr); - kputs(tab->cols[col->icol], &col->mm_kstr); + kputs(escaped, &col->mm_kstr); + free(rmme); return 1; } - if ( col->mm_kstr.l ) { hts_expand(char,col->mm_kstr.l+1,args->mtmps,args->tmps); memcpy(args->tmps,col->mm_kstr.s,col->mm_kstr.l+1); } else + { + free(rmme); return 0; + } // flush the line if ( col->merge_method==MM_UNIQUE ) @@ -1235,13 +1287,13 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d { assert(tab); hts_expand(char,len+1,args->mtmps,args->tmps); - memcpy(args->tmps,tab->cols[col->icol],len+1); - + memcpy(args->tmps,escaped,len+1); if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_string(args,line,col,tab->nals,tab->als); } - - return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps); + int ret = bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps); + free(rmme); + return ret; } static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { @@ -1675,11 +1727,18 @@ static int 
setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void if ( col->icol+args->nsmpl_annot > tab->ncols ) error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + char needle[] = {':',0}; int ismpl; for (ismpl=0; ismplnsmpl_annot; ismpl++) - args->tmpp[ismpl] = tab->cols[col->icol + ismpl]; - - return core_setter_format_str(args,line,col,args->tmpp); + { + size_t len; + char *rmme = NULL; + const char *str = escape_string(tab->cols[col->icol + ismpl],needle,&rmme,&len); + args->tmpp[ismpl] = rmme ? rmme : strdup(str); + } + int ret = core_setter_format_str(args,line,col,args->tmpp); + for (ismpl=0; ismplnsmpl_annot; ismpl++) free(args->tmpp[ismpl]); + return ret; } static int determine_ploidy(int nals, int *vals, int nvals1, uint8_t *smpl, int nsmpl) { @@ -2211,7 +2270,23 @@ static void init_columns(args_t *args) kputsn(ss, se-ss, &str); if ( !str.s[0] || !strcasecmp("-",str.s) ) ; else if ( !strcasecmp("CHROM",str.s) ) args->chr_idx = icol; - else if ( !strcasecmp("POS",str.s) ) args->beg_idx = icol; + else if ( !strcasecmp("POS",str.s) ) + { + if ( replace==REPLACE_NON_MISSING && !args->tgts_is_vcf ) + { + args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); + annot_col_t *col = &args->cols[args->ncols-1]; + memset(col,0,sizeof(*col)); + col->icol = icol; + col->replace = replace; + col->setter = setter_pos; + col->hdr_key_src = strdup(str.s); + col->hdr_key_dst = strdup(str.s); + args->match_end = icol; + } + else + args->beg_idx = icol; + } else if ( !strcasecmp("FROM",str.s) || !strcasecmp("BEG",str.s) ) args->beg_idx = icol; else if ( !strcasecmp("TO",str.s) || !strcasecmp("END",str.s) ) args->end_idx = icol; else if ( !strcasecmp("REF",str.s) ) @@ -2270,9 +2345,23 @@ static void init_columns(args_t *args) col->hdr_key_dst = strdup(str.s); args->match_end = icol; } - else if ( !strcasecmp("~POS",str.s) && !args->tgts_is_vcf ) + else 
if ( !strcasecmp("~POS",str.s) ) + { + error("Error: the use of ~POS has been deprecated, use -POS to transfer the column POS.\n"); + } + else if ( str.s[0]=='~' ) { - if ( args->tgts_is_vcf ) error("Error: cannot use ~POS, position can be replaced only from a tab-delimited file\n"); + args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); + annot_col_t *col = &args->cols[args->ncols-1]; + memset(col,0,sizeof(*col)); + col->icol = icol; + col->replace = MATCH_VALUE; + col->setter = NULL; + col->hdr_key_src = strdup(str.s+1); + } + else if ( !strcasecmp("-POS",str.s) && !args->tgts_is_vcf ) + { + if ( args->tgts_is_vcf ) error("Error: cannot use -POS, position can be replaced only from a tab-delimited file\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; memset(col,0,sizeof(*col)); @@ -2832,6 +2921,106 @@ static void rename_annots_push(args_t *args, char *src, char *dst) ksprintf(&str,"%s %s",src,dst); args->rename_annots_map[ args->rename_annots_nmap - 1 ] = str.s; } +static void init_filters(args_t *args) +{ + // Check if the -i/-e expressions contain external values that should be determined + // on the fly from the annotation file. The expressions can be given as + // TAG={NAME} + // TAG={str:NAME} + // TAG={int:NAME} + // TAG={float:NAME} + kstring_t str = {0,0,0}; + char *src = strdup(args->filter_str); + int len = 0; + while (1) + { + char *beg = strchr(src+len,'{'); + if ( !beg ) break; + + // check if "{" appears inside quotes, in such case do not modify + char skip = 0; + char *tmp = src; + while ( tmpfilter_str); // unbalanced quotation; todo: check for escape char + len = tmp - src + 1; + skip = 1; + } + if ( skip ) continue; + + char *end = ++beg; + while ( *end && *end!='}' ) end++; + if ( !*end ) error("Could not parse the expression: %s\n",args->filter_str); + *end = 0; + + // explicit typing? 
+ int type = -1; + tmp = beg; + while ( *tmp && *tmp!=':' ) tmp++; + if ( *tmp ) + { + *tmp = 0; + if ( !strcasecmp(beg,"str") ) type = BCF_HT_STR; + else if ( !strcasecmp(beg,"int") ) type = BCF_HT_INT; + else if ( !strcasecmp(beg,"float") ) type = BCF_HT_REAL; + } + args->n_ext++; + args->ext = (ext_t*)realloc(args->ext,sizeof(*args->ext)*args->n_ext); + ext_t *ext = &args->ext[args->n_ext-1]; + ext->ht_type = type; + ext->name = strdup(beg); + if ( beg-1 > src ) kputsn(src,beg-1-src,&str); + if ( type==-1 ) kputs("{}",&str); + else if ( type==BCF_HT_STR ) kputs("{str}",&str); + else if ( type==BCF_HT_INT ) kputs("{int}",&str); + else if ( type==BCF_HT_REAL ) kputs("{float}",&str); + len = str.l; + kputs(end+1,&str); + free(src); + src = strdup(str.s); + str.l = 0; + } + args->filter = filter_init(args->hdr, src); + free(src); + free(str.s); + + int i,j,n_ext; + const int *ext_type = filter_ext_types(args->filter, &n_ext); + if ( n_ext != args->n_ext ) + error("Failed to parse the expression, unexpected number of dynamic variables (%d vs %d): %s\n",n_ext,args->n_ext,args->filter_str); + + if ( !args->n_ext ) return; + + if ( !args->tgts ) + error("Error: dynamic variables in -i/-e expressions can be currently used only with tab-delimited file, not with VCF (todo)\n"); + + // contains external values + args->ext_ptr = malloc(sizeof(*args->ext_ptr)*args->n_ext); + for (i=0; in_ext; i++) args->ext[i].ht_type = ext_type[i]; + args->filter_ext = args->filter; + args->filter = NULL; + + // set the column idx + if ( args->ncols ) + { + for (i=0; in_ext; i++) + { + for (j=0; jncols; j++) + { + if ( strcmp(args->ext[i].name,args->cols[j].hdr_key_src) ) continue; + args->ext[i].icol = args->cols[j].icol; + break; + } + if ( j==args->ncols ) error("No such column: %s\n",args->ext[i].name); + } + } +} static void init_data(args_t *args) { @@ -2900,7 +3089,7 @@ static void init_data(args_t *args) args->vcmp = vcmp_init(); if ( args->filter_str ) - args->filter = 
filter_init(args->hdr, args->filter_str); + init_filters(args); if ( args->mark_sites ) { @@ -2938,6 +3127,13 @@ static void init_data(args_t *args) static void destroy_data(args_t *args) { int i; + for (i=0; in_ext; i++) + { + free(args->ext[i].name); + if ( args->ext[i].ht_type!=BCF_HT_STR ) continue; + } + free(args->ext_ptr); + free(args->ext); for (i=0; inrm; i++) free(args->rm[i].key); free(args->rm); if ( args->hdr_out ) bcf_hdr_destroy(args->hdr_out); @@ -2992,8 +3188,8 @@ static void destroy_data(args_t *args) free(args->dst_smpl_pld); if ( args->set_ids ) convert_destroy(args->set_ids); - if ( args->filter ) - filter_destroy(args->filter); + if ( args->filter ) filter_destroy(args->filter); + if ( args->filter_ext ) filter_destroy(args->filter_ext); if (args->out_fh) { if ( args->write_index ) @@ -3072,7 +3268,7 @@ static void buffer_annot_lines(args_t *args, bcf1_t *line, int start_pos, int en } else i++; } - if ( args->ref_idx==-1 && args->nalines ) return; + if ( !args->filter_ext && args->ref_idx==-1 && args->nalines ) return; while ( !bcf_sr_regions_overlap(args->tgts, bcf_seqname(args->hdr,line), start_pos,end_pos) ) { @@ -3084,7 +3280,7 @@ static void buffer_annot_lines(args_t *args, bcf1_t *line, int start_pos, int en tmp->start = args->tgts->start; tmp->end = args->tgts->end; parse_annot_line(args, args->tgts->line.s, tmp); - if ( args->ref_idx != -1 ) + if ( args->filter_ext || args->ref_idx != -1 ) { int iseq = args->tgts->iseq; if ( bcf_sr_regions_next(args->tgts)<0 || args->tgts->iseq!=iseq ) break; @@ -3123,172 +3319,181 @@ static int strstr_match(char *a, char *b) } return 0; } -static void annotate(args_t *args, bcf1_t *line) +static int annotate_from_regidx(args_t *args, bcf1_t *line) { - args->current_rec = line; - - int i, j; - for (i=0; inrm; i++) - args->rm[i].handler(args, line, &args->rm[i]); - + int j; int has_overlap = 0; - if ( args->tgt_idx ) + + for (j=0; jncols; j++) args->cols[j].done = 0; + if ( regidx_overlap(args->tgt_idx, 
bcf_seqname(args->hdr,line),line->pos,line->pos+line->rlen-1, args->tgt_itr) ) { - for (j=0; jncols; j++) args->cols[j].done = 0; - if ( regidx_overlap(args->tgt_idx, bcf_seqname(args->hdr,line),line->pos,line->pos+line->rlen-1, args->tgt_itr) ) + hts_pos_t vcf_end = line->pos + line->rlen - 1; + while ( regitr_overlap(args->tgt_itr) ) { - hts_pos_t vcf_end = line->pos + line->rlen - 1; - while ( regitr_overlap(args->tgt_itr) ) - { - annot_line_t *tmp = &args->alines[0]; - tmp->rid = line->rid; - tmp->start = args->tgt_itr->beg; - tmp->end = args->tgt_itr->end; + annot_line_t *tmp = &args->alines[0]; + tmp->rid = line->rid; + tmp->start = args->tgt_itr->beg; + tmp->end = args->tgt_itr->end; - // Check min overlap - int len_ann = tmp->end - tmp->start + 1; - int len_vcf = line->rlen; - int isec = (tmp->end < vcf_end ? tmp->end : vcf_end) - (tmp->start > line->pos ? tmp->start : line->pos) + 1; - assert( isec > 0 ); - if ( args->min_overlap_ann && args->min_overlap_ann > (float)isec/len_ann ) continue; - if ( args->min_overlap_vcf && args->min_overlap_vcf > (float)isec/len_vcf ) continue; + // Check min overlap + int len_ann = tmp->end - tmp->start + 1; + int len_vcf = line->rlen; + int isec = (tmp->end < vcf_end ? tmp->end : vcf_end) - (tmp->start > line->pos ? tmp->start : line->pos) + 1; + assert( isec > 0 ); + if ( args->min_overlap_ann && args->min_overlap_ann > (float)isec/len_ann ) continue; + if ( args->min_overlap_vcf && args->min_overlap_vcf > (float)isec/len_vcf ) continue; - parse_annot_line(args, regitr_payload(args->tgt_itr,char*), tmp); + parse_annot_line(args, regitr_payload(args->tgt_itr,char*), tmp); - // If a plain BED file is provided and we are asked to just mark overlapping sites, there are - // no additional columns. 
Not sure if there can be any side effects for ill-formatted BED files - // with variable number of columns - if ( !args->ncols && args->mark_sites ) has_overlap = 1; + // If a plain BED file is provided and we are asked to just mark overlapping sites, there are + // no additional columns. Not sure if there can be any side effects for ill-formatted BED files + // with variable number of columns + if ( !args->ncols && args->mark_sites ) has_overlap = 1; - for (j=0; jncols; j++) - { - if ( args->cols[j].done==1 ) continue; - int ret = args->cols[j].setter(args,line,&args->cols[j],tmp); - if ( ret < 0 ) - error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - if ( ret==0 ) - args->cols[j].done = 1; - has_overlap = 1; - } + for (j=0; jncols; j++) + { + if ( args->cols[j].done==1 ) continue; + int ret = args->cols[j].setter(args,line,&args->cols[j],tmp); + if ( ret < 0 ) + error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + if ( ret==0 ) + args->cols[j].done = 1; + has_overlap = 1; } } - for (j=0; jncols; j++) + } + for (j=0; jncols; j++) + { + if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue; + if ( !args->cols[j].setter ) continue; + if ( args->cols[j].setter(args,line,&args->cols[j],NULL) < 0 ) + error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + } + return has_overlap; +} +static int pass_filter_test_ext(args_t *args, bcf1_t *line, annot_line_t *ann) +{ + char *tmp; + int i; + for (i=0; in_ext; i++) + { + int j = args->ext[i].icol; + if ( args->ext[i].ht_type==BCF_HT_STR ) args->ext_ptr[i] = args->ext[i].s = ann->cols[j]; + else if ( args->ext[i].ht_type==BCF_HT_INT ) { - if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue; - if ( !args->cols[j].setter ) continue; - if ( 
args->cols[j].setter(args,line,&args->cols[j],NULL) < 0 ) - error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + args->ext[i].i = strtol(ann->cols[j],&tmp,10); + if ( *tmp ) + { + if ( strcmp(".",ann->cols[j]) ) error("Error: could not parse the annotation file, expected an integer, found \"%s\"\n",ann->cols[j]); + args->ext_ptr[i] = NULL; + } + else + args->ext_ptr[i] = &args->ext[i].i; + } + else if ( args->ext[i].ht_type==BCF_HT_REAL ) + { + args->ext[i].f = strtod(ann->cols[j],&tmp); + if ( *tmp ) + { + if ( strcmp(".",ann->cols[j]) ) error("Error: could not parse the annotation file, expected a float, found \"%s\"\n",ann->cols[j]); + args->ext_ptr[i] = NULL; + } + else + args->ext_ptr[i] = &args->ext[i].f; } } - else if ( args->tgts ) - { - // Buffer annotation lines. When multiple ALT alleles are present in the annotation file, at least one - // must match some of the VCF alleles. If the append-missing mode is set (and REF+ALT is requested), the - // buffered lines will annotate the VCF respecting the order in ALT and when no matching line is found - // for an ALT, missing value is appended instead. - int end_pos = line->pos + line->rlen - 1; - buffer_annot_lines(args, line, line->pos, end_pos); + int pass = filter_test_ext(args->filter_ext,line,NULL,(const void**)args->ext_ptr); + if ( args->filter_logic==FLT_EXCLUDE ) pass = pass ? 0 : 1; + return pass; +} +static int annotate_from_tab(args_t *args, bcf1_t *line) +{ + int i,j; + int has_overlap = 0; - args->nsrt_alines = 0; - hts_expand(uint32_t,args->nalines,args->msrt_alines,args->srt_alines); - if ( args->nalines >= 0xffff || line->n_allele >= 0xffff ) - error("Error: too many alleles or annotation lines in the buffer at %s:%"PRId64" (todo:skip?)\n",bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + // Buffer annotation lines. 
When multiple ALT alleles are present in the annotation file, at least one + // must match some of the VCF alleles. If the append-missing mode is set (and REF+ALT is requested), the + // buffered lines will annotate the VCF respecting the order in ALT and when no matching line is found + // for an ALT, missing value is appended instead. + int end_pos = line->pos + line->rlen - 1; + buffer_annot_lines(args, line, line->pos, end_pos); - kstring_t match_end = {0,0,0}; - if ( args->match_end>=0 && bcf_get_info_int32(args->hdr,line,"END",&args->tmpi,&args->mtmpi)==1 ) - kputw(args->tmpi[0],&match_end); + args->nsrt_alines = 0; + hts_expand(uint32_t,args->nalines,args->msrt_alines,args->srt_alines); + if ( args->nalines >= 0xffff || line->n_allele >= 0xffff ) + error("Error: too many alleles or annotation lines in the buffer at %s:%"PRId64" (todo:skip?)\n",bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - // Find matching lines - for (i=0; inalines; i++) + kstring_t match_end = {0,0,0}; + if ( args->match_end>=0 && bcf_get_info_int32(args->hdr,line,"END",&args->tmpi,&args->mtmpi)==1 ) + kputw(args->tmpi[0],&match_end); + + // Find matching lines + for (i=0; inalines; i++) + { + if ( line->pos > args->alines[i].end || end_pos < args->alines[i].start ) continue; + if ( args->ref_idx != -1 ) // REF+ALT matching requested { - if ( line->pos > args->alines[i].end || end_pos < args->alines[i].start ) continue; - if ( args->ref_idx != -1 ) // REF+ALT matching requested + if ( line->pos!=args->alines[i].start || vcmp_set_ref(args->vcmp, line->d.allele[0], args->alines[i].als[0]) < 0 ) continue; // refs are not compatible + for (j=1; jalines[i].nals; j++) { - if ( line->pos!=args->alines[i].start || vcmp_set_ref(args->vcmp, line->d.allele[0], args->alines[i].als[0]) < 0 ) continue; // refs are not compatible - for (j=1; jalines[i].nals; j++) + int ialt; + if ( line->n_allele==1 && args->alines[i].als[j][0]=='.' 
&& args->alines[i].als[j][1]==0 ) // match: no ALT allele in VCF and annot file has "." + ialt = 0; + else { - int ialt; - if ( line->n_allele==1 && args->alines[i].als[j][0]=='.' && args->alines[i].als[j][1]==0 ) // match: no ALT allele in VCF and annot file has "." - ialt = 0; - else - { - ialt = vcmp_find_allele(args->vcmp, line->d.allele+1, line->n_allele - 1, args->alines[i].als[j]); - if ( ialt < 0 ) continue; - ialt++; - } - if ( args->match_id>=0 && !strstr_match(line->d.id,args->alines[i].cols[args->match_id]) ) continue; - if ( args->match_end>=0 && match_end.l && strcmp(match_end.s,args->alines[i].cols[args->match_end]) ) continue; - args->srt_alines[args->nsrt_alines++] = (ialt<<16) | i; - has_overlap = 1; - break; + ialt = vcmp_find_allele(args->vcmp, line->d.allele+1, line->n_allele - 1, args->alines[i].als[j]); + if ( ialt < 0 ) continue; + ialt++; } + if ( args->match_id>=0 && !strstr_match(line->d.id,args->alines[i].cols[args->match_id]) ) continue; + if ( args->match_end>=0 && match_end.l && strcmp(match_end.s,args->alines[i].cols[args->match_end]) ) continue; + if ( args->filter_ext && !pass_filter_test_ext(args,line,&args->alines[i]) ) continue; + args->srt_alines[args->nsrt_alines++] = (ialt<<16) | i; + has_overlap = 1; + break; } - else // overlap, REF+ALT matching not requested + } + else if ( args->filter_ext ) + { + if ( pass_filter_test_ext(args,line,&args->alines[i]) ) { args->srt_alines[args->nsrt_alines++] = (0xffff<<16) | i; has_overlap = 1; } } + else // overlap, REF+ALT matching not requested + { + args->srt_alines[args->nsrt_alines++] = (0xffff<<16) | i; + has_overlap = 1; + } + } - free(match_end.s); + free(match_end.s); + if ( !has_overlap && args->filter_ext && !args->keep_sites ) return has_overlap; - // Sort lines if needed + // Sort lines if needed + if ( args->has_append_mode ) + { + // insertion sort by VCF ALT index (top bits) and alines index (low bits) + uint32_t tmp; + for (i=1; insrt_alines; i++) + for (j=i; j>0 && 
args->srt_alines[j] < args->srt_alines[j-1]; j--) + tmp = args->srt_alines[j], args->srt_alines[j] = args->srt_alines[j-1], args->srt_alines[j-1] = tmp; + } + // Annotate + for (j=0; jncols; j++) args->cols[j].done = 0; + int ialt_exp = 1; + for (i=0; insrt_alines; i++) + { + int ialt = args->srt_alines[i] >> 16; + int ilin = args->srt_alines[i] & 0xffff; if ( args->has_append_mode ) { - // insertion sort by VCF ALT index (top bits) and alines index (low bits) - uint32_t tmp; - for (i=1; insrt_alines; i++) - for (j=i; j>0 && args->srt_alines[j] < args->srt_alines[j-1]; j--) - tmp = args->srt_alines[j], args->srt_alines[j] = args->srt_alines[j-1], args->srt_alines[j-1] = tmp; - } - // Annotate - for (j=0; jncols; j++) args->cols[j].done = 0; - int ialt_exp = 1; - for (i=0; insrt_alines; i++) - { - int ialt = args->srt_alines[i] >> 16; - int ilin = args->srt_alines[i] & 0xffff; - if ( args->has_append_mode ) - { - if ( ialt_exp > ialt ) continue; // multiple annotation lines for the same position - if ( ialt_exp < ialt ) - { - // REF+ALT matching requested, append-missing mode: insert "." 
if no annotation line was found for the ALT - while ( ialt_exp++ < ialt ) - { - for (j=0; jncols; j++) - { - if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue; - if ( args->cols[j].done==1 ) continue; - if ( !args->cols[j].setter ) continue; - int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing); - if ( ret < 0 ) - error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - if ( ret==0 ) - args->cols[j].done = 1; - } - } - } - } - for (j=0; jncols; j++) - { - if ( args->cols[j].done==1 ) continue; - if ( !args->cols[j].setter ) continue; - int ret = args->cols[j].setter(args,line,&args->cols[j],&args->alines[ilin]); - if ( ret < 0 ) - error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - if ( ret==0 ) - args->cols[j].done = 1; - } - ialt_exp = ialt + 1; - } - if ( args->nsrt_alines ) - { - // In the append-missing mode fill missing values to all trailing ALTs, but only if at least one - // record was found. Otherwise leave the row will be left without annotation. - if ( args->has_append_mode && ialt_exp < line->n_allele ) + if ( ialt_exp > ialt ) continue; // multiple annotation lines for the same position + if ( ialt_exp < ialt ) { - while ( ialt_exp++ < line->n_allele ) + // REF+ALT matching requested, append-missing mode: insert "." 
if no annotation line was found for the ALT + while ( ialt_exp++ < ialt ) { for (j=0; jncols; j++) { @@ -3303,41 +3508,97 @@ static void annotate(args_t *args, bcf1_t *line) } } } - // Flush - for (j=0; jncols; j++) - { - if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue; - if ( !args->cols[j].setter ) continue; - int ret = args->cols[j].setter(args,line,&args->cols[j],NULL); - if ( ret < 0 ) - error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - } } + for (j=0; jncols; j++) + { + if ( args->cols[j].done==1 ) continue; + if ( !args->cols[j].setter ) continue; + int ret = args->cols[j].setter(args,line,&args->cols[j],&args->alines[ilin]); + if ( ret < 0 ) + error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + if ( ret==0 ) + args->cols[j].done = 1; + } + ialt_exp = ialt + 1; } - else if ( args->files->nreaders == 2 ) + if ( args->nsrt_alines ) { - if ( bcf_sr_has_line(args->files,1) ) + // In the append-missing mode fill missing values to all trailing ALTs, but only if at least one + // record was found. Otherwise leave the row will be left without annotation. 
+ if ( args->has_append_mode && ialt_exp < line->n_allele ) { - bcf1_t *aline = bcf_sr_get_line(args->files,1); - for (j=0; jncols; j++) + while ( ialt_exp++ < line->n_allele ) { - if ( !args->cols[j].setter ) continue; - if ( args->cols[j].setter(args,line,&args->cols[j],aline) ) - error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + for (j=0; jncols; j++) + { + if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue; + if ( args->cols[j].done==1 ) continue; + if ( !args->cols[j].setter ) continue; + int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing); + if ( ret < 0 ) + error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + if ( ret==0 ) + args->cols[j].done = 1; + } } - - has_overlap = 1; } - } - else if ( args->ncols ) - { + // Flush for (j=0; jncols; j++) { + if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue; if ( !args->cols[j].setter ) continue; - if ( args->cols[j].setter(args,line,&args->cols[j],NULL) ) + int ret = args->cols[j].setter(args,line,&args->cols[j],NULL); + if ( ret < 0 ) error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); } } + return has_overlap; +} +static int annotate_from_vcf(args_t *args, bcf1_t *line) +{ + if ( !bcf_sr_has_line(args->files,1) ) return 0; + int j; + bcf1_t *aline = bcf_sr_get_line(args->files,1); + for (j=0; jncols; j++) + { + if ( !args->cols[j].setter ) continue; + if ( args->cols[j].setter(args,line,&args->cols[j],aline) ) + error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + } + return 1; +} +static int annotate_from_self(args_t *args, bcf1_t *line) +{ + int j; + for (j=0; jncols; j++) + { + if ( !args->cols[j].setter ) continue; + if ( 
args->cols[j].setter(args,line,&args->cols[j],NULL) ) + error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + } + return 0; +} +static int annotate_line(args_t *args, bcf1_t *line) +{ + args->current_rec = line; + + int i; + for (i=0; inrm; i++) + args->rm[i].handler(args, line, &args->rm[i]); + + int has_overlap = 0; + if ( args->tgt_idx ) + has_overlap = annotate_from_regidx(args,line); + + else if ( args->tgts ) + has_overlap = annotate_from_tab(args,line); + + else if ( args->files->nreaders == 2 ) + has_overlap = annotate_from_vcf(args,line); + + else if ( args->ncols ) + has_overlap = annotate_from_self(args,line); + if ( args->set_ids ) { args->tmpks.l = 0; @@ -3362,6 +3623,8 @@ static void annotate(args_t *args, bcf1_t *line) else bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,has_overlap?0:1); } + + return has_overlap; } static void usage(args_t *args) @@ -3604,7 +3867,8 @@ int main_vcfannotate(int argc, char *argv[]) continue; } } - annotate(args, line); + int keep = annotate_line(args, line); + if ( args->filter_ext && !args->keep_sites && !keep ) continue; if ( bcf_write1(args->out_fh, args->hdr_out, line)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args->output_fname); } destroy_data(args); diff --git a/vcfbuf.c b/vcfbuf.c index 3d822948b..22390d0fa 100644 --- a/vcfbuf.c +++ b/vcfbuf.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2016-2022 Genome Research Ltd. + Copyright (c) 2016-2024 Genome Research Ltd. 
Author: Petr Danecek @@ -29,6 +29,7 @@ #include #include #include +#include #include "bcftools.h" #include "vcfbuf.h" #include "rbuf.h" @@ -61,28 +62,61 @@ typedef struct } prune_t; + +#define MARK_OVERLAP 1 +#define MARK_DUP 2 +#define MARK_EXPR 3 + +#define MARK_MISSING_SCALAR 0 // actual value to use +#define MARK_MISSING_MAX_DP 1 // max overlap_t.value scaled by INFO/DP + +// temporary internal structure for iterative overlap removal by mark_t.expr typedef struct { - int active; + double value; // the sort value + int rmme, idx; // mark for removal, index in vcfbuf_t.rbuf + int dp; // with MARK_MISSING_MAX_DP, INFO/DP is used extrapolate missing QUAL + kbitset_t *bset; // mark which records it overlaps with, given as 0-based indexes to vcfbuf_t.rbuf + bcf1_t *rec; } -rmdup_t; - +overlap_t; typedef struct { - int active, rid, end; + // modes + int mode; + char *expr; + + // sites marked according to expr, returned to the caller via vcfbuf_get() + rbuf_t rbuf; + uint8_t *mark; + int last; + + // MARK_OVERLAP + int overlap_rid, overlap_end; + + // MARK_EXPR + int nbuf; + overlap_t *buf, **buf_ptr; + int missing_expr; // the value to use when min(QUAL) encounters a missing value + float missing_value; // the default missing value + float max_qual; // with MARK_MISSING_MAX_DP + int max_qual_dp; // + int ntmpi; // temporary int array and the allocated memory + int32_t *tmpi; } -overlap_t; +mark_t; struct _vcfbuf_t { - int win, dummy; + int win, // maximum number of sites in the buffer, either number of sites (<0) or bp (<0) + dummy; // the caller maintains the buffer via push/peek/flush bcf_hdr_t *hdr; vcfrec_t *vcf; rbuf_t rbuf; ld_t ld; prune_t prune; - overlap_t overlap; - rmdup_t rmdup; + mark_t mark; + enum { clean, dirty } status; }; vcfbuf_t *vcfbuf_init(bcf_hdr_t *hdr, int win) @@ -90,7 +124,8 @@ vcfbuf_t *vcfbuf_init(bcf_hdr_t *hdr, int win) vcfbuf_t *buf = (vcfbuf_t*) calloc(1,sizeof(vcfbuf_t)); buf->hdr = hdr; buf->win = win; - buf->overlap.rid = -1; + 
buf->status = clean; + buf->mark.overlap_rid = -1; int i; for (i=0; ild.max[i] = HUGE_VAL; rbuf_init(&buf->rbuf, 0); @@ -106,38 +141,119 @@ void vcfbuf_destroy(vcfbuf_t *buf) free(buf->prune.farr); free(buf->prune.vrec); free(buf->prune.ac); + free(buf->prune.af_tag); free(buf->prune.idx); + free(buf->mark.mark); + free(buf->mark.expr); + for (i=0; imark.nbuf; i++) kbs_destroy(buf->mark.buf[i].bset); + free(buf->mark.buf); + free(buf->mark.buf_ptr); + free(buf->mark.tmpi); free(buf); } -void vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, void *value) +int vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, ...) { - if ( key==LD_FILTER1 ) { buf->ld.filter1 = *((int*)value); return; } - if ( key==LD_RAND_MISSING ) { buf->ld.rand_missing = *((int*)value); return; } - if ( key==LD_MAX_R2 ) { buf->ld.max[VCFBUF_LD_IDX_R2] = *((double*)value); return; } - if ( key==LD_MAX_LD ) { buf->ld.max[VCFBUF_LD_IDX_LD] = *((double*)value); return; } - if ( key==LD_MAX_HD ) { buf->ld.max[VCFBUF_LD_IDX_HD] = *((double*)value); return; } - - if ( key==VCFBUF_DUMMY ) { buf->dummy = *((int*)value); return; } - if ( key==VCFBUF_NSITES ) + va_list args; + switch (key) { - buf->prune.max_sites = *((int*)value); - if ( !buf->prune.mode ) buf->prune.mode = PRUNE_MODE_MAX_AF; - return; + case LD_FILTER1: + va_start(args, key); + buf->ld.filter1 = va_arg(args,int); + va_end(args); + return 0; + + case LD_RAND_MISSING: + va_start(args, key); + buf->ld.rand_missing = va_arg(args,int); + va_end(args); + return 0; + + case LD_MAX_R2: + va_start(args, key); + buf->ld.max[VCFBUF_LD_IDX_R2] = va_arg(args,double); + va_end(args); + return 0; + + case LD_MAX_LD: + va_start(args, key); + buf->ld.max[VCFBUF_LD_IDX_LD] = va_arg(args,double); + va_end(args); + return 0; + + case LD_MAX_HD: + va_start(args, key); + buf->ld.max[VCFBUF_LD_IDX_HD] = va_arg(args,double); + va_end(args); + return 0; + + case VCFBUF_DUMMY: + va_start(args, key); + buf->dummy = va_arg(args,int); + va_end(args); + return 0; + + case 
PRUNE_NSITES: + va_start(args, key); + buf->prune.max_sites = va_arg(args,int); + if ( !buf->prune.mode ) buf->prune.mode = PRUNE_MODE_MAX_AF; + va_end(args); + return 0; + + case PRUNE_NSITES_MODE: + va_start(args, key); + char *mode = va_arg(args,char*); + va_end(args); + if ( !strcasecmp(mode,"maxAF") ) buf->prune.mode = PRUNE_MODE_MAX_AF; + else if ( !strcasecmp(mode,"1st") ) buf->prune.mode = PRUNE_MODE_1ST; + else if ( !strcasecmp(mode,"rand") ) buf->prune.mode = PRUNE_MODE_RAND; + else error("The mode \"%s\" is not recognised\n",mode); + return 0; + + case PRUNE_AF_TAG: + va_start(args, key); + buf->prune.af_tag = strdup(va_arg(args,char*)); + va_end(args); + return 0; + + case MARK: + va_start(args, key); + buf->mark.expr = strdup(va_arg(args,char*)); + if ( !strcasecmp(buf->mark.expr,"overlap") ) buf->mark.mode = MARK_OVERLAP; + else if ( !strcasecmp(buf->mark.expr,"dup") ) buf->mark.mode = MARK_DUP; + else buf->mark.mode = MARK_EXPR; + va_end(args); + return 0; + + case MARK_MISSING_EXPR: + va_start(args, key); + char *expr = va_arg(args,char*); + if ( !strcasecmp(expr,"0") ) + { + buf->mark.missing_expr = MARK_MISSING_SCALAR; + buf->mark.missing_value = 0; + } + else if ( !strcasecmp(expr,"DP") ) + { + if ( buf->mark.mode!=MARK_EXPR ) error("Only the combination of --mark 'min(QUAL)' with --missing DP is currently supported\n"); + buf->mark.missing_expr = MARK_MISSING_MAX_DP; + } + else + error("todo: MARK_MISSING_EXPR=%s\n",expr); + va_end(args); + return 0; } - if ( key==VCFBUF_AF_TAG ) { buf->prune.af_tag = *((char**)value); return; } - if ( key==VCFBUF_OVERLAP_WIN ) { buf->overlap.active = *((int*)value); return; } - if ( key==VCFBUF_RMDUP) { buf->rmdup.active = *((int*)value); return; } + return 0; +} - if ( key==VCFBUF_NSITES_MODE ) - { - char *mode = *((char**)value); - if ( !strcasecmp(mode,"maxAF") ) buf->prune.mode = PRUNE_MODE_MAX_AF; - else if ( !strcasecmp(mode,"1st") ) buf->prune.mode = PRUNE_MODE_1ST; - else if ( !strcasecmp(mode,"rand") ) 
buf->prune.mode = PRUNE_MODE_RAND; - else error("The mode \"%s\" is not recognised\n",mode); - return; - } +void *vcfbuf_get(vcfbuf_t *buf, vcfbuf_opt_t key, ...) +{ + va_list args; + va_start(args, key); + if ( key==MARK ) + return &buf->mark.last; + va_end(args); + return NULL; } int vcfbuf_nsites(vcfbuf_t *buf) @@ -147,8 +263,12 @@ int vcfbuf_nsites(vcfbuf_t *buf) bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec) { - rbuf_expand0(&buf->rbuf, vcfrec_t, buf->rbuf.n+1, buf->vcf); + // make sure the caller is using the buffer correctly and calls vcfbuf_flush() + // before placing next vcfbuf_push() call + assert(buf->status!=dirty); + if ( !buf->dummy ) buf->status = dirty; + rbuf_expand0(&buf->rbuf, vcfrec_t, buf->rbuf.n+1, buf->vcf); int i = rbuf_append(&buf->rbuf); if ( !buf->vcf[i].rec ) buf->vcf[i].rec = bcf_init1(); @@ -163,6 +283,7 @@ bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec) bcf1_t *vcfbuf_peek(vcfbuf_t *buf, int idx) { + buf->status = clean; int i = rbuf_kth(&buf->rbuf, idx); return i<0 ? NULL : buf->vcf[i].rec; } @@ -195,6 +316,7 @@ static int cmpint_desc(const void *_a, const void *_b) static void _prune_sites(vcfbuf_t *buf, int flush_all) { + int nbuf = flush_all ? 
buf->rbuf.n : buf->rbuf.n - 1; int nprune = nbuf - buf->prune.max_sites; @@ -266,37 +388,75 @@ static void _prune_sites(vcfbuf_t *buf, int flush_all) rbuf_remove_kth(&buf->rbuf, vcfrec_t, buf->prune.idx[i], buf->vcf); } -static int _rmdup_can_flush(vcfbuf_t *buf, int flush_all) +static int mark_dup_can_flush_(vcfbuf_t *buf, int flush_all) { - if ( flush_all ) return 1; + int flush = flush_all; + mark_t *mark = &buf->mark; + if ( buf->status==dirty ) + { + // a new site was just added by vcfbuf_push() + rbuf_expand0(&mark->rbuf, uint8_t, buf->rbuf.n, mark->mark); + int i = rbuf_append(&mark->rbuf); + mark->mark[i] = 0; + + if ( buf->rbuf.n==1 ) goto flush; - if ( buf->rbuf.n==1 ) return 0; + // there is at least one previous site, check if it's a duplicate + int k1 = rbuf_kth(&buf->rbuf, -1); + int k2 = rbuf_kth(&buf->rbuf, -2); + vcfrec_t *rec1 = &buf->vcf[k1]; + vcfrec_t *rec2 = &buf->vcf[k2]; - int k1 = rbuf_kth(&buf->rbuf, -1); - int k2 = rbuf_kth(&buf->rbuf, -2); + int is_dup = 1; + if ( rec1->rec->rid!=rec2->rec->rid ) is_dup = 0; + else if ( rec1->rec->pos!=rec2->rec->pos ) is_dup = 0; - vcfrec_t *rec1 = &buf->vcf[k1]; - vcfrec_t *rec2 = &buf->vcf[k2]; + if ( is_dup ) + { + // it is, mark the last two sites as duplicates + int k1 = rbuf_kth(&mark->rbuf, -1); + int k2 = rbuf_kth(&mark->rbuf, -2); + mark->mark[k1] = 1; + mark->mark[k2] = 1; + goto flush; + } - if ( rec1->rec->rid!=rec2->rec->rid ) return 1; - if ( rec1->rec->pos!=rec2->rec->pos ) return 1; + // the last site is not a duplicate with the previous, all sites but the last one can be flushed + flush = 1; + } + else if ( buf->rbuf.n > 1 ) flush = 1; - return 0; +flush: + if ( !flush ) return 0; + + int i = rbuf_shift(&mark->rbuf); + mark->last = mark->mark[i]; + return 1; } -static int _overlap_can_flush(vcfbuf_t *buf, int flush_all) +static int mark_overlap_helper_(vcfbuf_t *buf, int flush_all) { - if ( flush_all ) { buf->overlap.rid = -1; return 1; } + if ( buf->status!=dirty ) return flush_all; - 
int i = rbuf_last(&buf->rbuf); - vcfrec_t *last = &buf->vcf[i]; - if ( buf->overlap.rid != last->rec->rid ) buf->overlap.end = 0; + int flush = flush_all; + mark_t *mark = &buf->mark; + // a new site was just added by vcfbuf_push() + buf->status = clean; + + rbuf_expand0(&mark->rbuf, uint8_t, buf->rbuf.n, mark->mark); + int i = rbuf_append(&mark->rbuf); + mark->mark[i] = 0; + + // determine beg and end of the last record that was just added + i = rbuf_last(&buf->rbuf); + vcfrec_t *last = &buf->vcf[i]; + if ( mark->overlap_rid != last->rec->rid ) mark->overlap_end = 0; int beg_pos = last->rec->pos; int end_pos = last->rec->pos + last->rec->rlen - 1; // Assuming left-aligned indels. In case it is a deletion, the real variant - // starts one base after. If an insertion, the overlap with previous zero length. + // starts one base after. If an insertion, the overlap with previous is zero int imin = last->rec->rlen; for (i=0; irec->n_allele; i++) { @@ -306,24 +466,175 @@ static int _overlap_can_flush(vcfbuf_t *buf, int flush_all) while ( *ref && *alt && nt_to_upper(*ref)==nt_to_upper(*alt) ) { ref++; alt++; } if ( imin > ref - last->rec->d.allele[0] ) imin = ref - last->rec->d.allele[0]; } - - if ( beg_pos <= buf->overlap.end ) + if ( beg_pos <= mark->overlap_end ) { + // the new site overlaps with the previous beg_pos += imin; if ( beg_pos > end_pos ) end_pos = beg_pos; } - if ( buf->rbuf.n==1 ) { - buf->overlap.rid = last->rec->rid; - buf->overlap.end = end_pos; - return 0; + mark->overlap_rid = last->rec->rid; + mark->overlap_end = end_pos; + return flush; + } + if ( beg_pos <= mark->overlap_end ) + { + if ( mark->overlap_end < end_pos ) mark->overlap_end = end_pos; + int k1 = rbuf_kth(&mark->rbuf, -1); + int k2 = rbuf_kth(&mark->rbuf, -2); + mark->mark[k1] = 1; + mark->mark[k2] = 1; + } + else + { + if ( mark->overlap_end < end_pos ) mark->overlap_end = end_pos; + flush = 1; + } + return flush; +} + + +static int mark_overlap_can_flush_(vcfbuf_t *buf, int flush_all) 
+{ + int flush = flush_all; + if ( buf->status==dirty ) flush = mark_overlap_helper_(buf,flush_all); + else if ( buf->rbuf.n > 1 ) flush = 1; + if ( !flush ) return 0; + + mark_t *mark = &buf->mark; + int i = rbuf_shift(&mark->rbuf); + mark->last = mark->mark[i]; + return 1; +} + + +static int records_overlap(bcf1_t *a, bcf1_t *b) +{ + if ( a->rid != b->rid ) return 0; + if ( a->pos + a->rlen - 1 < b->pos ) return 0; + return 1; +} + +static int cmp_overlap_ptr_asc(const void *aptr, const void *bptr) +{ + overlap_t *a = *((overlap_t**)aptr); + overlap_t *b = *((overlap_t**)bptr); + if ( a->value < b->value ) return -1; + if ( a->value > b->value ) return 1; + return 0; +} +static void mark_expr_missing_reset_(vcfbuf_t *buf) +{ + buf->mark.max_qual = 0; + buf->mark.max_qual_dp = 0; +} +static void mark_expr_missing_prep_(vcfbuf_t *buf, overlap_t *olap) +{ + int nval = bcf_get_info_int32(buf->hdr,olap->rec,"DP",&buf->mark.tmpi,&buf->mark.ntmpi); + if ( nval!=1 ) return; + + olap->dp = buf->mark.tmpi[0]; + if ( bcf_float_is_missing(olap->rec->qual) ) return; + if ( buf->mark.max_qual < olap->rec->qual ) + { + buf->mark.max_qual = olap->rec->qual; + buf->mark.max_qual_dp = olap->dp; } - if ( beg_pos <= buf->overlap.end ) +} +static void mark_expr_missing_set_(vcfbuf_t *buf, overlap_t *olap) +{ + if ( !bcf_float_is_missing(olap->rec->qual) ) return; + if ( !buf->mark.max_qual_dp ) return; + + // scale QUAL of the most confident variant in the overlap proportionally to the coverage + // and use that to prioritize the records + olap->value = buf->mark.max_qual * olap->dp / buf->mark.max_qual_dp; +} +static int mark_expr_can_flush_(vcfbuf_t *buf, int flush_all) +{ + mark_t *mark = &buf->mark; + if ( strcasecmp("min(QUAL)",mark->expr) ) error("Todo; at this time only min(QUAL) is supported\n"); + + int flush = flush_all; + if ( buf->status==dirty ) { - if ( buf->overlap.end < end_pos ) buf->overlap.end = end_pos; - return 0; + flush = mark_overlap_helper_(buf,flush_all); + 
if ( !flush ) return 0; + + if ( mark->missing_expr==MARK_MISSING_MAX_DP ) mark_expr_missing_reset_(buf); + + // init overlaps, each overlap_t structure keeps a list of overlapping records, symmetrical + size_t nori = mark->nbuf; + hts_resize(overlap_t, buf->rbuf.n, &mark->nbuf, &mark->buf, HTS_RESIZE_CLEAR); + hts_resize(overlap_t*, buf->rbuf.n, &nori, &mark->buf_ptr, HTS_RESIZE_CLEAR); + int i; + for (i=0; irbuf.n; i++) + { + overlap_t *oi = &mark->buf[i]; + int j = rbuf_kth(&buf->rbuf, i); + assert(j>=0); + bcf1_t *rec = buf->vcf[j].rec; + assert(rec); + oi->rec = rec; + + // todo: other than QUAL values + oi->value = bcf_float_is_missing(rec->qual) ? mark->missing_value : rec->qual; + if ( mark->missing_expr==MARK_MISSING_MAX_DP ) mark_expr_missing_prep_(buf,oi); + if ( oi->bset ) + { + kbs_resize(&oi->bset,buf->rbuf.n); + kbs_clear(oi->bset); + } + else + oi->bset = kbs_init(buf->rbuf.n); + oi->idx = i; + mark->buf_ptr[i] = oi; + mark->mark[oi->idx] = 0; + } + int nolap = 0; + for (i=0; irbuf.n; i++) + { + overlap_t *oi = &mark->buf[i]; + if ( mark->missing_expr==MARK_MISSING_MAX_DP ) mark_expr_missing_set_(buf,oi); + int j; + for (j=i+1; jrbuf.n; j++) + { + overlap_t *oj = &mark->buf[j]; + if ( !records_overlap(oi->rec,oj->rec) ) continue; + kbs_insert(oi->bset,j); + kbs_insert(oj->bset,i); + nolap++; + } + } + + // sort according to the requested criteria, currently only min(QUAL) + qsort(mark->buf_ptr,buf->rbuf.n,sizeof(*mark->buf_ptr),cmp_overlap_ptr_asc); // todo: other than min() + + // go through the list sorted by overlap_t.value, eg QUAL + for (i=0; nolap && irbuf.n; i++) + { + kbitset_iter_t itr; + overlap_t *oi = mark->buf_ptr[i]; + kbs_start(&itr); + int j; + while ((j=kbs_next(oi->bset, &itr)) >= 0) + { + kbs_delete(oi->bset,j); + assert(nolap); + assert(kbs_exists(mark->buf[j].bset,oi->idx)); + kbs_delete(mark->buf[j].bset,oi->idx); + nolap--; + } + j = rbuf_kth(&mark->rbuf,oi->idx); + mark->mark[j] = 1; + } } + else if ( buf->rbuf.n > 1 ) flush 
= 1; + if ( !flush ) return 0; + + int i = rbuf_shift(&mark->rbuf); + mark->last = mark->mark[i]; return 1; } @@ -331,32 +642,56 @@ bcf1_t *vcfbuf_flush(vcfbuf_t *buf, int flush_all) { int i,j; + // nothing to do, no lines in the buffer if ( buf->rbuf.n==0 ) return NULL; - if ( flush_all || buf->dummy ) goto ret; - - i = rbuf_kth(&buf->rbuf, 0); // first - j = rbuf_last(&buf->rbuf); // last - if ( buf->vcf[i].rec->rid != buf->vcf[j].rec->rid ) goto ret; - if ( buf->overlap.active && _overlap_can_flush(buf, flush_all) ) goto ret; - if ( buf->rmdup.active && _rmdup_can_flush(buf, flush_all) ) goto ret; + // dummy mode, always flushing + if ( buf->dummy ) goto ret; - if ( buf->win > 0 ) + // pruning mode + if ( buf->win ) { - if ( buf->rbuf.n <= buf->win ) return NULL; + int can_flush = flush_all; + i = rbuf_kth(&buf->rbuf, 0); // first + j = rbuf_last(&buf->rbuf); // last + if ( buf->vcf[i].rec->rid != buf->vcf[j].rec->rid ) can_flush = 1; + else if ( buf->win > 0 ) + { + if ( buf->rbuf.n > buf->win ) can_flush = 1; + } + else if ( buf->win < 0 ) + { + if ( !(buf->vcf[i].rec->pos - buf->vcf[j].rec->pos > buf->win) ) can_flush = 1; + } + buf->status = clean; + if ( !can_flush ) return NULL; + if ( buf->prune.max_sites && buf->prune.max_sites < buf->rbuf.n ) _prune_sites(buf, flush_all); goto ret; } - else if ( buf->win < 0 ) + + // overlaps and duplicates + if ( buf->mark.mode ) { - if ( buf->vcf[i].rec->pos - buf->vcf[j].rec->pos > buf->win ) return NULL; + int can_flush = 0; + if ( buf->mark.mode==MARK_OVERLAP ) + { + if ( mark_overlap_can_flush_(buf,flush_all) ) can_flush = 1; + } + else if ( buf->mark.mode==MARK_DUP ) + { + if ( mark_dup_can_flush_(buf,flush_all) ) can_flush = 1; + } + if ( buf->mark.mode==MARK_EXPR ) + { + if ( mark_expr_can_flush_(buf,flush_all) ) can_flush = 1; + } + buf->status = clean; + if ( !can_flush ) return NULL; goto ret; } - else - return NULL; ret: - if ( buf->prune.max_sites && buf->prune.max_sites < buf->rbuf.n ) _prune_sites(buf, 
flush_all); - + buf->status = clean; i = rbuf_shift(&buf->rbuf); return buf->vcf[i].rec; } diff --git a/vcfbuf.h b/vcfbuf.h index 878fd1044..96d7115c2 100644 --- a/vcfbuf.h +++ b/vcfbuf.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2017-2022 Genome Research Ltd. + Copyright (c) 2017-2024 Genome Research Ltd. Author: Petr Danecek @@ -38,13 +38,27 @@ typedef struct _vcfbuf_t vcfbuf_t; // Modes of operation typedef enum { - VCFBUF_DUMMY, // the caller maintains the buffer via push/peek/flush, nothing is removed by vcfbuf - - VCFBUF_OVERLAP_WIN, // keep only overlapping variants in the window - VCFBUF_RMDUP, // remove duplicate sites (completely) - VCFBUF_NSITES, // leave at max this many sites in the window - VCFBUF_NSITES_MODE, // one of: maxAF (keep sites with max AF), 1st (sites that come first), rand (pick randomly) - VCFBUF_AF_TAG, // use this INFO tag with VCFBUF_NSITES + VCFBUF_DUMMY, // int {0,1}, the caller maintains the buffer via push/peek/flush, nothing is removed by vcfbuf + + // pruning + PRUNE_NSITES, // int, leave max this many sites in the window + PRUNE_NSITES_MODE, // char *, maxAF (keep sites with max AF), 1st (sites that come first), rand (pick randomly) + PRUNE_AF_TAG, // char *, use this INFO/AF tag with VCFBUF_NSITES + + // duplicates and overlaps + MARK, // w: char *, resolve overlaps by preferentially removing sites according to EXPR: + // min(QUAL) .. remove sites with lowest QUAL until overlaps are resolved + // overlap .. select all overlapping sites + // dup .. select duplicate sites + // r: use as + // while ( (rec=vcfbuf_flush(buf,flush_all)) ) + // { + // int is_marked = vcfbuf_get_val(buf,int,MARK); + // if ( is_marked ) do_something(rec); + // } + MARK_MISSING_EXPR, // char *, what to do when missing value are encountered with min(QUAL) + // 0 .. set to 0 (the default) + // DP .. 
scale max quality in the window proportionally to INFO/DP // LD related options LD_RAND_MISSING, // randomize rather than ignore missing genotypes @@ -55,8 +69,23 @@ typedef enum } vcfbuf_opt_t; -#define vcfbuf_set_opt(buf,type,key,value) { type tmp = value; vcfbuf_set(buf, key, (void*)&tmp); } -void vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, void *value); + +/** + * vcfbuf_set() - set various options, see the vcfbuf_opt_t keys for the complete list + * + * Returns 0 if the call succeeded, or negative number on error. + */ +int vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, ...); // returns 0 on success + +/** + * vcfbuf_get() - get various options, see the vcfbuf_opt_t keys + * vcfbuf_get_val() - wrapper for `vcfbuf_get()` to return typed value + * + * The former returns pointer to the memory area populated by the requested setting, + * its type can be inferred from the vcfbuf_opt_t documentation. + */ +void *vcfbuf_get(vcfbuf_t *buf, vcfbuf_opt_t key, ...); +#define vcfbuf_get_val(buf,type,key) (*(type*)vcfbuf_get(buf, key)) /* @@ -67,7 +96,9 @@ vcfbuf_t *vcfbuf_init(bcf_hdr_t *hdr, int win); void vcfbuf_destroy(vcfbuf_t *buf); /* - * vcfbuf_push() - push a new site for analysis + * vcfbuf_push() - push a new site for analysis. + * + * Note that vcfbuf_flush() or vcfbuf_peek() must be called before next site is pushed. */ bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec); @@ -86,6 +117,8 @@ bcf1_t *vcfbuf_remove(vcfbuf_t *buf, int idx); /* * vcfbuf_flush() - returns the next record or NULL, depending on the mode of operation and * the content of the buffer + * + * @flush_all: 1 if no more vcfbuf_push() calls will follow, 0 otherwise */ bcf1_t *vcfbuf_flush(vcfbuf_t *buf, int flush_all); diff --git a/vcfcall.c b/vcfcall.c index bc958ed9a..13e516f83 100644 --- a/vcfcall.c +++ b/vcfcall.c @@ -1,6 +1,6 @@ /* vcfcall.c -- SNP/indel variant calling from VCF/BCF. - Copyright (C) 2013-2023 Genome Research Ltd. + Copyright (C) 2013-2024 Genome Research Ltd. 
Author: Petr Danecek @@ -696,7 +696,10 @@ static void init_data(args_t *args) } if ( args->aux.flag & CALL_CONSTR_ALLELES ) + { args->vcfbuf = vcfbuf_init(args->aux.hdr, 0); + vcfbuf_set(args->vcfbuf,VCFBUF_DUMMY,1); + } char wmode[8]; set_wmode(wmode,args->output_type,args->output_fname,args->clevel); @@ -941,7 +944,7 @@ static void usage(args_t *args) // todo (and more) // fprintf(stderr, "\nContrast calling and association test options:\n"); // fprintf(stderr, " -1 INT number of group-1 samples [0]\n"); - // fprintf(stderr, " -C FLOAT posterior constrast for LRTaux.min_lrt); + // fprintf(stderr, " -C FLOAT posterior contrast for LRTaux.min_lrt); // fprintf(stderr, " -U INT number of permutations for association testing (effective with -1) [0]\n"); // fprintf(stderr, " -X FLOAT only perform permutations for P(chi^2)aux.min_perm_p); fprintf(stderr, "\n"); diff --git a/vcfcnv.c b/vcfcnv.c index 2a9721a87..e970b043b 100644 --- a/vcfcnv.c +++ b/vcfcnv.c @@ -41,6 +41,7 @@ #include #include #include +#include #include "bcftools.h" #include "HMM.h" #include "rbuf.h" @@ -105,7 +106,7 @@ typedef struct _args_t } args_t; -FILE *open_file(char **fname, const char *mode, const char *fmt, ...); +FILE *open_file(char **fname, const char *mode, const char *fmt, ...) HTS_FORMAT(HTS_PRINTF_FMT, 3, 4); static inline void hmm2cn_state(int nstates, int i, int *a, int *b) { @@ -879,7 +880,7 @@ static int update_sample_args(args_t *args, sample_t *smpl, int ismpl) /* A noisy CN2 band is hard to distinguish from two CN3 bands which are - close to each other. Set a treshold on the minimum separation based + close to each other. 
Set a threshold on the minimum separation based on the BAF deviation at p=0.95 */ baf_dev2 /= norm_cn3; diff --git a/vcfconcat.c b/vcfconcat.c index 56e152c2f..232b3ae39 100644 --- a/vcfconcat.c +++ b/vcfconcat.c @@ -785,7 +785,7 @@ static void _check_hrecs(const bcf_hdr_t *hdr0, const bcf_hdr_t *hdr, char *fnam for (j=0; jnhrec; j++) { bcf_hrec_t *hrec0 = hdr0->hrec[j]; - if ( hrec0->type!=BCF_HL_FLT && hrec0->type!=BCF_HL_INFO && hrec0->type!=BCF_HL_FMT && hrec0->type!=BCF_HL_CTG ) continue; // skip fiels w/o IDX + if ( hrec0->type!=BCF_HL_FLT && hrec0->type!=BCF_HL_INFO && hrec0->type!=BCF_HL_FMT && hrec0->type!=BCF_HL_CTG ) continue; // skip fields w/o IDX int itag = bcf_hrec_find_key(hrec0, "ID"); bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, hrec0->type, "ID", hrec0->vals[itag], NULL); diff --git a/vcfgtcheck.c b/vcfgtcheck.c index f21f7cf05..be886db34 100644 --- a/vcfgtcheck.c +++ b/vcfgtcheck.c @@ -1,6 +1,6 @@ /* vcfgtcheck.c -- Check sample identity. - Copyright (C) 2013-2023 Genome Research Ltd. + Copyright (C) 2013-2024 Genome Research Ltd. 
Author: Petr Danecek @@ -74,7 +74,7 @@ typedef struct pair_t *pairs; double *hwe_prob, dsg2prob[8][3], pl2prob[256]; double min_inter_err, max_intra_err; - int all_sites, hom_only, ntop, cross_check, calc_hwe_prob, sort_by_hwe, dry_run, use_PLs; + int all_sites, hom_only, ntop, cross_check, calc_hwe_prob, sort_by_hwe, dry_run, gt_err; BGZF *out_fh; unsigned int nskip_no_match, nskip_not_ba, nskip_mono, nskip_no_data, nskip_dip_GT, nskip_dip_PL, nskip_filter; kstring_t kstr; @@ -141,7 +141,7 @@ static int cmp_pair(const void *_a, const void *_b) typedef struct { - uint32_t ndiff,rid,pos,rand; // rand is to shuffle sites with the same ndiff from across all chromosoms + uint32_t ndiff,rid,pos,rand; // rand is to shuffle sites with the same ndiff from across all chromosomes unsigned long kbs_dat[1]; } diff_sites_t; @@ -389,7 +389,7 @@ static void init_data(args_t *args) args->qry_dsg = (uint8_t*) malloc(args->nqry_smpl); args->gt_dsg = args->cross_check ? args->qry_dsg : (uint8_t*) malloc(args->ngt_smpl); } - if ( args->use_PLs ) + if ( args->gt_err ) { args->pdiff = (double*) calloc(args->npairs,sizeof(*args->pdiff)); // log probability of pair samples being the same args->qry_prob = (double*) malloc(3*args->nqry_smpl*sizeof(*args->qry_prob)); @@ -412,7 +412,7 @@ static void init_data(args_t *args) for (i=0; i<8; i++) for (j=0; j<3; j++) args->dsg2prob[i][j] = HUGE_VAL; - double eprob = pow(10,-0.1*args->use_PLs); // convert from phred score to probability + double eprob = pow(10,-0.1*args->gt_err); // convert from phred score to probability args->dsg2prob[1][0] = 0; // P(00|0) = 1 args->dsg2prob[1][1] = -log(eprob); // P(01|0) = e args->dsg2prob[1][2] = -2*log(eprob); // P(11|0) = e^2 @@ -658,7 +658,7 @@ static void process_line(args_t *args) // The sample pairs were given explicitly via -p/-P options if ( args->pairs ) { - if ( !args->use_PLs ) + if ( !args->gt_err ) { int ndiff = 0; if ( args->kbs_diff ) diff_sites_reset(args); @@ -693,7 +693,7 @@ static void 
process_line(args_t *args) if ( ndiff ) diff_sites_push(args, ndiff, qry_rec->rid, qry_rec->pos); } - else // use_PLs set + else // gt_err set { for (i=0; inpairs; i++) { @@ -721,7 +721,7 @@ static void process_line(args_t *args) { int match = qry_dsg & gt_dsg; args->hwe_prob[i] += hwe_dsg[match]; - args->nmatch[i]++; + if ( match ) args->nmatch[i]++; } args->ncnt[i]++; } @@ -730,7 +730,7 @@ static void process_line(args_t *args) } int idx=0; - if ( !args->use_PLs ) + if ( !args->gt_err ) { for (i=0; inqry_smpl; i++) { @@ -767,7 +767,7 @@ static void process_line(args_t *args) } } } - else // use_PLs set + else // gt_err set { for (i=0; inqry_smpl; i++) { @@ -802,7 +802,7 @@ static void process_line(args_t *args) { int match = args->qry_dsg[i] & args->gt_dsg[j]; args->hwe_prob[idx] += hwe_dsg[match]; - args->nmatch[idx]++; + if ( match ) args->nmatch[idx]++; } args->ncnt[idx]++; idx++; @@ -895,7 +895,7 @@ static void report(args_t *args) "# match, the observed concordance was less likely to occur by chance)\n"); ksprintf(&args->kstr,"# - Number of sites compared for this pair of samples (bigger = more informative)\n"); ksprintf(&args->kstr,"# - Number of matching genotypes\n"); - ksprintf(&args->kstr,"#DCv2\t[2]Query Sample\t[3]Genotyped Sample\t[4]Discordance\t[5]Average -log P(HWE)\t[6]Number of sites compared\t[6]Number of matching genotypes\n"); + ksprintf(&args->kstr,"#DCv2\t[2]Query Sample\t[3]Genotyped Sample\t[4]Discordance\t[5]Average -log P(HWE)\t[6]Number of sites compared\t[7]Number of matching genotypes\n"); if ( bgzf_write(args->out_fh, args->kstr.s, args->kstr.l)!=args->kstr.l ) error("Failed to write to %s\n", args->output_fname); int trim = args->ntop; @@ -955,7 +955,7 @@ static void report(args_t *args) args->ndiff[idx], (args->calc_hwe_prob && args->nmatch[idx]) ? args->hwe_prob[idx]/args->nmatch[idx] : 0, args->ncnt[idx], - args->nmatch[idx]); + args->calc_hwe_prob ? 
args->nmatch[idx] : 0); } else { @@ -965,7 +965,7 @@ static void report(args_t *args) args->pdiff[idx], (args->calc_hwe_prob && args->nmatch[idx]) ? args->hwe_prob[idx]/args->nmatch[idx] : 0, args->ncnt[idx], - args->nmatch[idx]); + args->calc_hwe_prob ? args->nmatch[idx] : 0); } if ( bgzf_write(args->out_fh, args->kstr.s, args->kstr.l)!=args->kstr.l ) error("Failed to write to %s\n", args->output_fname); idx++; @@ -1006,7 +1006,7 @@ static void report(args_t *args) args->ndiff[idx], (args->calc_hwe_prob && args->nmatch[idx]) ? args->hwe_prob[idx]/args->nmatch[idx] : 0, args->ncnt[idx], - args->nmatch[idx]); + args->calc_hwe_prob ? args->nmatch[idx] : 0); } else { @@ -1016,7 +1016,7 @@ static void report(args_t *args) args->pdiff[idx], (args->calc_hwe_prob && args->nmatch[idx]) ? args->hwe_prob[idx]/args->nmatch[idx] : 0, args->ncnt[idx], - args->nmatch[idx]); + args->calc_hwe_prob ? args->nmatch[idx] : 0); } if ( bgzf_write(args->out_fh, args->kstr.s, args->kstr.l)!=args->kstr.l ) error("Failed to write to %s\n", args->output_fname); } @@ -1073,7 +1073,7 @@ static void report(args_t *args) args->ndiff[idx], (args->calc_hwe_prob && args->nmatch[idx]) ? args->hwe_prob[idx]/args->nmatch[idx] : 0, args->ncnt[idx], - args->nmatch[idx]); + args->calc_hwe_prob ? args->nmatch[idx] : 0); } else { @@ -1083,7 +1083,7 @@ static void report(args_t *args) args->pdiff[idx], (args->calc_hwe_prob && args->nmatch[idx]) ? args->hwe_prob[idx]/args->nmatch[idx] : 0, args->ncnt[idx], - args->nmatch[idx]); + args->calc_hwe_prob ? 
args->nmatch[idx] : 0); } if ( bgzf_write(args->out_fh, args->kstr.s, args->kstr.l)!=args->kstr.l ) error("Failed to write to %s\n", args->output_fname); } @@ -1197,7 +1197,7 @@ int main_vcfgtcheck(int argc, char *argv[]) args->qry_use_GT = -1; args->gt_use_GT = -1; args->calc_hwe_prob = 1; - args->use_PLs = 40; + args->gt_err = 40; args->regions_overlap = 1; args->targets_overlap = 0; args->output_fname = "-"; @@ -1285,7 +1285,7 @@ int main_vcfgtcheck(int argc, char *argv[]) else { // this could be the old -e, --error-probability option - args->use_PLs = strtol(optarg,&tmp,10); + args->gt_err = strtol(optarg,&tmp,10); if ( !tmp || *tmp ) { // it is not @@ -1322,7 +1322,7 @@ int main_vcfgtcheck(int argc, char *argv[]) } break; case 'E': - args->use_PLs = strtol(optarg,&tmp,10); + args->gt_err = strtol(optarg,&tmp,10); if ( !tmp || *tmp ) error("Could not parse: --error-probability %s\n", optarg); break; case 'u': @@ -1367,7 +1367,7 @@ int main_vcfgtcheck(int argc, char *argv[]) while ( *tmp && *tmp!=',' ) tmp++; if ( *tmp ) { *tmp = 0; args->es_tmp_prefix = tmp+1; } } - args->use_PLs = 0; + args->gt_err = 0; break; case 'c': error("The -c option is to be implemented, please open an issue on github\n"); @@ -1427,7 +1427,7 @@ int main_vcfgtcheck(int argc, char *argv[]) } if ( args->distinctive_sites && !args->pair_samples ) error("The experimental option --distinctive-sites requires -p/-P\n"); if ( args->hom_only && !args->gt_fname ) error("The option --homs-only requires --genotypes\n"); - if ( args->distinctive_sites && args->use_PLs ) error("The option --distinctive-sites cannot be combined with --error-probability\n"); + if ( args->distinctive_sites && args->gt_err ) error("The option --distinctive-sites cannot be combined with --error-probability\n"); init_data(args); diff --git a/vcfindex.c b/vcfindex.c index 1dd960ea7..17eac5f32 100644 --- a/vcfindex.c +++ b/vcfindex.c @@ -1,6 +1,6 @@ /* vcfindex.c -- Index bgzip compressed VCF/BCF files for random access. 
- Copyright (C) 2014-2021 Genome Research Ltd. + Copyright (C) 2014-2024 Genome Research Ltd. Author: Shane McCarthy @@ -264,6 +264,7 @@ int main_vcfindex(int argc, char *argv[]) default: usage(); } } + if (!min_shift) tbi = 1; if (stats > total) { fprintf(stderr, "[E::%s] expected only one of --stats or --nrecords options\n", __func__); diff --git a/vcfisec.c b/vcfisec.c index 98ed25b62..24a45685b 100644 --- a/vcfisec.c +++ b/vcfisec.c @@ -34,6 +34,7 @@ THE SOFTWARE. */ #include #include #include +#include #include "bcftools.h" #include "filter.h" @@ -69,19 +70,21 @@ args_t; * mkdir_p() - create new directory for a file $fname * @fname: the file name to create the directory for, the part after last "/" is ignored */ -void mkdir_p(const char *fmt, ...) +void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) +mkdir_p(const char *fmt, ...) { va_list ap; va_start(ap, fmt); int n = vsnprintf(NULL, 0, fmt, ap) + 2; va_end(ap); - char *path = (char*)malloc(n); + char *tmp = (char*)malloc(n); + if (!tmp) error("Couldn't allocate space for path: %s\n", strerror(errno)); va_start(ap, fmt); - vsnprintf(path, n, fmt, ap); + vsnprintf(tmp, n, fmt, ap); va_end(ap); - char *tmp = strdup(path), *p = tmp+1; + char *p = tmp+1; while (*p) { while (*p && *p!='/') p++; @@ -89,12 +92,11 @@ void mkdir_p(const char *fmt, ...) char ctmp = *p; *p = 0; int ret = mkdir(tmp,S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); - if ( ret!=0 && errno!=EEXIST ) error("Error creating directory %s: %s\n", path,strerror(errno)); + if ( ret!=0 && errno!=EEXIST ) error("Error creating directory %s: %s\n", tmp,strerror(errno)); *p = ctmp; while ( *p && *p=='/' ) p++; } free(tmp); - free(path); } /** @@ -105,7 +107,8 @@ void mkdir_p(const char *fmt, ...) * * Returns open file descriptor or NULL if mode is NULL. */ -FILE *open_file(char **fname, const char *mode, const char *fmt, ...) +FILE * HTS_FORMAT(HTS_PRINTF_FMT, 3, 4) +open_file(char **fname, const char *mode, const char *fmt, ...) 
{ va_list ap; va_start(ap, fmt); @@ -117,7 +120,7 @@ FILE *open_file(char **fname, const char *mode, const char *fmt, ...) vsnprintf(str, n, fmt, ap); va_end(ap); - mkdir_p(str); + mkdir_p("%s", str); if ( !mode ) { if ( !fname ) error("Uh: expected fname or mode\n"); diff --git a/vcfmerge.c b/vcfmerge.c index 64796a342..3ca5f287a 100644 --- a/vcfmerge.c +++ b/vcfmerge.c @@ -2257,9 +2257,8 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule { \ tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize; \ src = fmt_ori->p + sizeof(src_type_t) * j * fmt_ori->n; \ - int tag_missing = src_is_missing && fmt_ori->n==1; \ - if (!tag_missing) \ - { \ + int tag_missing = src_is_missing && fmt_ori->n==1; \ + if ( src_is_missing && fmt_ori->n>1 ) { \ src += sizeof(src_type_t); \ tag_missing = src_is_vector_end ; \ } \ @@ -2345,9 +2344,9 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule for (j=0; jtmp_arr + (ismpl+j)*nsize; \ - src = fmt_ori->p + sizeof(src_type_t) * j * fmt_ori->size; \ + src = fmt_ori->p + j*fmt_ori->size; \ int tag_missing = src_is_missing && fmt_ori->n==1; \ - if (!tag_missing) { \ + if ( src_is_missing && fmt_ori->n>1 ) { \ src += sizeof(src_type_t); \ tag_missing = src_is_vector_end ; \ } \ @@ -2358,7 +2357,7 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule for (l=1; lp + sizeof(src_type_t) * j *fmt_ori->size; \ + src = fmt_ori->p + j*fmt_ori->size; \ if ( ma->buf[i].unkn_allele ) /* Use value from the unknown allele when available */ \ { \ int iunkn = ma->buf[i].unkn_allele; \ diff --git a/vcfnorm.c b/vcfnorm.c index a7605d461..f47253385 100644 --- a/vcfnorm.c +++ b/vcfnorm.c @@ -42,6 +42,7 @@ THE SOFTWARE. */ #include "abuf.h" #include "gff.h" #include "regidx.h" +#include "filter.h" #define CHECK_REF_EXIT 1 #define CHECK_REF_WARN 2 @@ -51,6 +52,10 @@ THE SOFTWARE. 
*/ #define MROWS_SPLIT 1 #define MROWS_MERGE 2 +// Logic of the filters: include or exclude sites which match the filters? +#define FLT_INCLUDE 1 +#define FLT_EXCLUDE 2 + // for -m+, mapping from allele indexes of a single input record // to allele indexes of output record typedef struct @@ -64,7 +69,7 @@ typedef struct { int n; // number of alleles char *ref, *alt; - void *hash; + void *hash; // str2int hash } cmpals1_t; @@ -88,8 +93,8 @@ typedef struct int32_t *int32_arr; int ntmp_arr1, ntmp_arr2, nint32_arr; kstring_t *tmp_str; - kstring_t *tmp_als, *tmp_del, tmp_kstr; - int ntmp_als, ntmp_del; + kstring_t *tmp_als, *tmp_sym, tmp_kstr; + int ntmp_als, ntmp_sym; rbuf_t rbuf; int buf_win; // maximum distance between two records to consider int aln_win; // the realignment window size (maximum repeat size) @@ -100,7 +105,7 @@ typedef struct struct { int tot, set, swap; } nref; char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets; int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels, clevel; - int nchanged, nskipped, nsplit, njoined, ntotal, mrows_op, mrows_collapse, parsimonious; + int nchanged, nskipped, nsplit, njoined, ntotal, nfilter, mrows_op, mrows_collapse, parsimonious; int record_cmd_line, force, force_warned, keep_sum_ad; abuf_t *abuf; abuf_opt_t atomize; @@ -108,12 +113,17 @@ typedef struct char *old_rec_tag; htsFile *out; char *index_fn; - int write_index; + int write_index, gff_verbosity; int right_align; char *gff_fname; gff_t *gff; regidx_t *idx_tscript; regitr_t *itr_tscript; + int (*cmp_func)(const void *aptr, const void *bptr); + char *filter_str; + int filter_logic; // include or exclude sites which match the filters? 
One of FLT_INCLUDE/FLT_EXCLUDE + int filter_pass; + filter_t *filter; } args_t; @@ -555,32 +565,54 @@ static int realign(args_t *args, bcf1_t *line) if ( bcf_get_variant_types(line)==VCF_BND ) return ERR_SYMBOLIC; // breakend, not an error // make a copy of each allele for trimming - hts_expand0(kstring_t,line->n_allele,args->ntmp_als,args->tmp_als); - hts_expand0(kstring_t,line->n_allele,args->ntmp_del,args->tmp_del); + hts_expand0(kstring_t,line->n_allele,args->ntmp_als,args->tmp_als); // the actual sequence to realign + hts_expand0(kstring_t,line->n_allele,args->ntmp_sym,args->tmp_sym); // the original symbolic allele strings to output kstring_t *als = args->tmp_als; - kstring_t *del = args->tmp_del; + kstring_t *sym = args->tmp_sym; int symbolic_alts = 1; for (i=0; in_allele; i++) { - del[i].l = 0; + sym[i].l = 0; if ( line->d.allele[i][0]=='<' ) { - // symbolic allele, only will be realigned - if ( strncmp("d.allele[i],4) ) return ERR_SYMBOLIC; - if ( nref < line->rlen ) + // symbolic allele, only and will be realigned + // TODO: there should be check for symbolic allele length. 
If too big, perhaps should not attempt realignment + int32_t sv_len = 0; + if ( !strncmp("d.allele[i],4) ) sv_len = -line->rlen; + else if ( !strncmp("d.allele[i],4) ) + { + if ( bcf_get_info_int32(args->hdr,line,"SVLEN",&args->int32_arr,&args->nint32_arr)==1 ) sv_len = args->int32_arr[0]; + } + if ( !sv_len ) return ERR_SYMBOLIC; + + als[i].l = 0; + if ( sv_len<0 ) { + // del, expand REF and replace ALT, for example, replace "REF=C ALT=" with "REF=CAT ALT=C" + if ( nref < line->rlen ) + { + free(ref); + reflen = line->rlen; + ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref); + if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1); + seq_to_upper(ref,0); + replace_iupac_codes(ref,nref); // any non-ACGT character in fasta ref is replaced with N + als[0].l = 0; + kputs(ref, &als[0]); + } + kputsn(als[0].s,1,&als[i]); + } + else // sv_len>0 + { + // dup, replace "REF=C ALT=" with "REF=C ALT=CAT" free(ref); - reflen = line->rlen; - ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref); + ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+sv_len, &nref); if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1); seq_to_upper(ref,0); replace_iupac_codes(ref,nref); // any non-ACGT character in fasta ref is replaced with N - als[0].l = 0; - kputs(ref, &als[0]); + kputs(ref,&als[i]); } - als[i].l = 0; - kputsn(als[0].s,1,&als[i]); - kputs(line->d.allele[i],&del[i]); + kputs(line->d.allele[i],&sym[i]); // preserve the symbolic allele string continue; } if ( i>0 ) symbolic_alts = 0; @@ -630,7 +662,7 @@ static int realign(args_t *args, bcf1_t *line) for (i=0; in_allele; i++) { if (i>0) kputc(',',&args->tmp_kstr); - if ( del[i].l ) 
kputs(del[i].s,&args->tmp_kstr); + if ( sym[i].l ) kputs(sym[i].s,&args->tmp_kstr); else kputsn(als[i].s,als[i].l,&args->tmp_kstr); } args->tmp_kstr.s[ args->tmp_kstr.l ] = 0; @@ -872,7 +904,7 @@ static void split_format_genotype(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int { if ( gt[j]==bcf_int32_vector_end ) break; if ( bcf_gt_is_missing(gt[j]) ) continue; // missing allele: leave as is - if ( (ialt==0 || args->ma_use_ref_allele) && bcf_gt_allele(gt[j])==0 ) continue; // ref && `--multi-overlaps 0`: leave as is + if ( bcf_gt_allele(gt[j])==0 ) continue; // ref && `--multi-overlaps 0`: leave as is if ( bcf_gt_allele(gt[j])==ialt+1 ) gt[j] = bcf_gt_unphased(1) | bcf_gt_is_phased(gt[j]); // set to first ALT else if ( args->ma_use_ref_allele ) @@ -1440,6 +1472,23 @@ static void merge_info_string(args_t *args, bcf1_t **lines, int nlines, bcf_info bcf_update_info_string(args->out_hdr,dst,tag,args->tmp_arr1); } } +static int gt_array_grow_ploidy(args_t *args, uint8_t **tmp_arr, int *ntmp_arr, int ngt_ori, int ngt_new, int nsmpl) +{ + *ntmp_arr = 4*ngt_new*nsmpl; + int32_t *ptr = (int32_t*)realloc(*tmp_arr,*ntmp_arr); + if ( !ptr ) error("Error: failed to allocate %d bytes\n",*ntmp_arr); + *tmp_arr = (uint8_t*) ptr; + + int i,j; + for (i=nsmpl-1; i>=0; i--) + { + int32_t *src = ptr + i*ngt_ori; + int32_t *dst = ptr + i*ngt_new; + for (j=ngt_new; j>ngt_ori; j--) dst[j-1] = bcf_int32_vector_end; + for (j=ngt_ori; j>0; j--) dst[j-1] = src[j-1]; + } + return ngt_new; +} static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_fmt_t *fmt, bcf1_t *dst) { // reusing int8_t arrays as int32_t arrays @@ -1458,7 +1507,9 @@ static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_ int ngts2 = bcf_get_genotypes(args->hdr,lines[i],&args->tmp_arr2,&ntmp2); args->ntmp_arr2 = ntmp2 * 4; ngts2 /= nsmpl; - if ( ngts!=ngts2 ) error("Error at %s:%"PRId64": cannot combine diploid with haploid genotype\n", 
bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); + int ploidy_changed = ngts - ngts2; + if ( ngts < ngts2 ) ngts = gt_array_grow_ploidy(args,&args->tmp_arr1,&args->ntmp_arr1,ngts,ngts2,nsmpl); + if ( ngts > ngts2 ) ngts2 = gt_array_grow_ploidy(args,&args->tmp_arr2,&args->ntmp_arr2,ngts2,ngts,nsmpl); int32_t *gt = (int32_t*) args->tmp_arr1; // the first, destination line int32_t *gt2 = (int32_t*) args->tmp_arr2; // one of the subsequent lines, i.e. the source line @@ -1468,16 +1519,22 @@ static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_ // never overwrite with ref allele for (k2=0; k2=args->maps[i].nals ) error("Error at %s:%"PRId64": incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1,ial2); // The destination allele int ial = args->maps[i].map[ial2]; if ( gt[k2]==bcf_int32_vector_end || bcf_gt_is_missing(gt[k2]) || !bcf_gt_allele(gt[k2]) ) - gt[k2] = bcf_gt_is_phased(gt[k2]) ? bcf_gt_phased(ial) : bcf_gt_unphased(ial); + gt[k2] = (gt[k2]!=bcf_int32_vector_end && bcf_gt_is_phased(gt[k2])) ? 
bcf_gt_phased(ial) : bcf_gt_unphased(ial); else { // conflict, the first line has non-zero allele, use the old way, possibly disrupt the phasing @@ -1967,7 +2024,32 @@ static bcf1_t *mrows_flush(args_t *args) args->mrows_first++; return args->mrows[ibeg]; } -static void cmpals_add(cmpals_t *ca, bcf1_t *rec) +static char *strdup_alt_svlen(args_t *args, bcf1_t *rec, int ial) +{ + if ( rec->d.allele[ial][0]!='<' ) return strdup(rec->d.allele[ial]); + + int ntmp = args->ntmp_arr1 / sizeof(int32_t); + int n = bcf_get_info_int32(args->hdr, rec, "SVLEN", &args->tmp_arr1, &ntmp); + args->ntmp_arr1 = ntmp * sizeof(int32_t); + int32_t *svlen = (int32_t *) args->tmp_arr1; + if ( n<=0 ) return strdup(rec->d.allele[ial]); + + if ( n+1 != rec->n_allele ) + { + // there should be as many SVLEN numbers as there are ALT alleles + static int warned = 0; + if ( !warned ) + { + fprintf(stderr,"TODO: different number of ALT alleles and SVLEN fields %s:%"PRIhts_pos"\n",bcf_seqname(args->hdr,rec),rec->pos+1); + warned = 1; + } + } + + kstring_t str = {0,0,0}; + ksprintf(&str,"%s.%d",rec->d.allele[ial],svlen[ial-1]); + return str.s; +} +static void cmpals_add(args_t *args, cmpals_t *ca, bcf1_t *rec) { ca->ncmpals++; hts_expand0(cmpals1_t, ca->ncmpals, ca->mcmpals, ca->cmpals); @@ -1975,10 +2057,11 @@ static void cmpals_add(cmpals_t *ca, bcf1_t *rec) free(cmpals->ref); cmpals->ref = strdup(rec->d.allele[0]); cmpals->n = rec->n_allele; + if ( rec->n_allele==2 ) { free(cmpals->alt); - cmpals->alt = strdup(rec->d.allele[1]); + cmpals->alt = strdup_alt_svlen(args,rec,1); } else { @@ -1989,9 +2072,10 @@ static void cmpals_add(cmpals_t *ca, bcf1_t *rec) khash_str2int_inc(cmpals->hash, strdup(rec->d.allele[i])); } } -static int cmpals_match(cmpals_t *ca, bcf1_t *rec) +static int cmpals_match(args_t *args, cmpals_t *ca, bcf1_t *rec) { int i, j; + char *alt_svlen = rec->n_allele==2 ? 
strdup_alt_svlen(args,rec,1) : NULL; for (i=0; incmpals; i++) { cmpals1_t *cmpals = ca->cmpals + i; @@ -2003,7 +2087,8 @@ static int cmpals_match(cmpals_t *ca, bcf1_t *rec) // the most frequent case if ( rec->n_allele==2 ) { - if ( strcasecmp(rec->d.allele[1], cmpals->alt) ) continue; + if ( strcasecmp(alt_svlen, cmpals->alt) ) continue; + free(alt_svlen); return 1; } @@ -2013,6 +2098,7 @@ static int cmpals_match(cmpals_t *ca, bcf1_t *rec) if ( jn_allele ) continue; return 1; } + free(alt_svlen); return 0; } static void cmpals_reset(cmpals_t *ca) { ca->ncmpals = 0; } @@ -2055,7 +2141,7 @@ static void flush_buffer(args_t *args, htsFile *file, int n) if ( args->rmdup & BCF_SR_PAIR_ANY ) continue; // rmdup by position only if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue; if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue; - if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(&args->cmpals_out, args->lines[k]) ) continue; + if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, &args->cmpals_out, args->lines[k]) ) continue; } else { @@ -2065,7 +2151,7 @@ static void flush_buffer(args_t *args, htsFile *file, int n) if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(&args->cmpals_out); } prev_type |= line_type; - if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(&args->cmpals_out, args->lines[k]); + if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(args,&args->cmpals_out, args->lines[k]); } if ( bcf_write1(file, args->out_hdr, args->lines[k])!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); } @@ -2118,17 +2204,22 @@ static void init_data(args_t *args) if ( args->gff_fname ) { args->gff = gff_init(args->gff_fname); - gff_set(args->gff,verbosity,1); + gff_set(args->gff,verbosity,args->gff_verbosity); gff_set(args->gff,strip_chr_names,1); gff_parse(args->gff); args->idx_tscript = gff_get(args->gff,idx_tscript); args->itr_tscript = 
regitr_init(NULL); } + if ( args->filter_str ) + args->filter = filter_init(args->hdr, args->filter_str); + args->filter_pass = 1; + args->out_hdr = bcf_hdr_dup(args->out_hdr); } static void destroy_data(args_t *args) { + if ( args->filter ) filter_destroy(args->filter); if ( args->gff ) { gff_destroy(args->gff); @@ -2150,10 +2241,10 @@ static void destroy_data(args_t *args) free(args->maps[i].map); for (i=0; intmp_als; i++) free(args->tmp_als[i].s); - for (i=0; intmp_del; i++) - free(args->tmp_del[i].s); + for (i=0; intmp_sym; i++) + free(args->tmp_sym[i].s); free(args->tmp_als); - free(args->tmp_del); + free(args->tmp_sym); free(args->tmp_kstr.s); if ( args->tmp_str ) { @@ -2178,10 +2269,10 @@ static void normalize_line(args_t *args, bcf1_t *line) { if ( args->fai ) { - if ( args->check_ref & CHECK_REF_FIX ) fix_ref(args, line); + if ( args->filter_pass && (args->check_ref & CHECK_REF_FIX) ) fix_ref(args, line); if ( args->do_indels ) { - int ret = realign(args, line); + int ret = args->filter_pass ? 
realign(args, line) : ERR_OK; // exclude broken VCF lines if ( ret==ERR_REF_MISMATCH && args->check_ref & CHECK_REF_SKIP ) @@ -2201,10 +2292,10 @@ static void normalize_line(args_t *args, bcf1_t *line) } } - if ( args->atomize==SPLIT ) abuf_push(args->abuf,line); + if ( args->filter_pass && args->atomize==SPLIT ) abuf_push(args->abuf,line); while (1) { - if ( args->atomize==SPLIT ) + if ( args->filter_pass && args->atomize==SPLIT ) { line = abuf_flush(args->abuf, 0); if ( !line ) break; @@ -2218,10 +2309,15 @@ static void normalize_line(args_t *args, bcf1_t *line) args->lines[i] = bcf_dup(line); while ( rbuf_prev(&args->rbuf,&i) ) { - if ( args->lines[i]->rid==args->lines[j]->rid && args->lines[i]->pos > args->lines[j]->pos ) SWAP(bcf1_t*, args->lines[i], args->lines[j]); + if ( args->lines[i]->rid==args->lines[j]->rid ) + { + bcf_unpack(args->lines[i], BCF_UN_STR); + bcf_unpack(args->lines[j], BCF_UN_STR); + if ( args->cmp_func(&args->lines[i], &args->lines[j]) > 0) SWAP(bcf1_t*, args->lines[i], args->lines[j]); + } j = i; } - if ( args->atomize!=SPLIT ) break; + if ( !args->filter_pass || args->atomize!=SPLIT ) break; } } @@ -2233,7 +2329,14 @@ static int split_and_normalize(args_t *args) bcf1_t *line = bcf_sr_get_line(args->files,0); args->ntotal++; - if ( args->mrows_op!=MROWS_SPLIT || line->n_allele<=2 ) + if ( args->filter ) + { + args->filter_pass = filter_test(args->filter,line,NULL); + if ( args->filter_logic==FLT_EXCLUDE ) args->filter_pass = args->filter_pass ? 0 : 1; + if ( !args->filter_pass ) args->nfilter++; + } + + if ( args->mrows_op!=MROWS_SPLIT || line->n_allele<=2 || !args->filter_pass ) { // normal operation, no splitting normalize_line(args, line); @@ -2322,7 +2425,8 @@ static void normalize_vcf(args_t *args) } if ( hts_close(args->out)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->output_fname); - fprintf(stderr,"Lines total/split/joined/realigned/skipped:\t%d/%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->njoined,args->nchanged,args->nskipped); + fprintf(stderr,"Lines total/split/joined/realigned/removed/skipped:\t%d/%d/%d/%d/%d/%d\n", + args->ntotal,args->nsplit,args->njoined,args->nchanged,args->nskipped,args->nfilter); if ( args->check_ref & CHECK_REF_FIX ) fprintf(stderr,"REF/ALT total/modified/added: \t%d/%d/%d\n", args->nref.tot,args->nref.swap,args->nref.set); } @@ -2341,9 +2445,11 @@ static void usage(void) fprintf(stderr, " -c, --check-ref e|w|x|s Check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n"); fprintf(stderr, " -D, --remove-duplicates Remove duplicate lines of the same type.\n"); fprintf(stderr, " -d, --rm-dup TYPE Remove duplicate snps|indels|both|all|exact\n"); + fprintf(stderr, " -e, --exclude EXPR Do not normalize records for which the expression is true (see man page for details)\n"); fprintf(stderr, " -f, --fasta-ref FILE Reference sequence\n"); fprintf(stderr, " --force Try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n"); fprintf(stderr, " -g, --gff-annot FILE Follow HGVS 3'rule and right-align variants in transcripts on the forward strand\n"); + fprintf(stderr, " -i, --include EXPR Normalize only records for which the expression is true (see man page for details)\n"); fprintf(stderr, " --keep-sum TAG,.. Keep vector sum constant when splitting multiallelics (see github issue #360)\n"); fprintf(stderr, " -m, --multiallelics -|+TYPE Split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n"); fprintf(stderr, " --multi-overlaps 0|. Fill in the reference (0) or missing (.) 
allele when splitting multiallelics [0]\n"); @@ -2356,10 +2462,12 @@ static void usage(void) fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(stderr, " -s, --strict-filter When merging (-m+), merged site is PASS only if all sites being merged PASS\n"); + fprintf(stderr, " -S, --sort METHOD Sort order: chr_pos,lex [chr_pos]\n"); fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(stderr, " --threads INT Use multithreading with INT worker threads [0]\n"); + fprintf(stderr, " -v, --verbose INT Verbosity level (0-2) of GFF parsing [1]\n"); fprintf(stderr, " -w, --site-win INT Buffer for sorting lines which changed position during realignment [1000]\n"); fprintf(stderr, " -W, --write-index[=FMT] Automatically index the output files [off]\n"); fprintf(stderr, "\n"); @@ -2392,8 +2500,10 @@ int main_vcfnorm(int argc, char *argv[]) int region_is_file = 0; int targets_is_file = 0; args->use_star_allele = 1; + args->gff_verbosity = 1; int regions_overlap = 1; int targets_overlap = 0; + args->cmp_func = cmp_bcf_pos; static struct option loptions[] = { @@ -2401,9 +2511,12 @@ int main_vcfnorm(int argc, char *argv[]) {"force",no_argument,NULL,7}, {"atomize",no_argument,NULL,'a'}, {"atom-overlaps",required_argument,NULL,11}, + {"include",required_argument,NULL,'i'}, + {"exclude",required_argument,NULL,'e'}, {"old-rec-tag",required_argument,NULL,12}, {"keep-sum",required_argument,NULL,10}, {"fasta-ref",required_argument,NULL,'f'}, + {"sort",required_argument,NULL,'S'}, {"gff-annot",required_argument,NULL,'g'}, {"right-align",no_argument,NULL,15}, 
// undocumented, only for debugging {"do-not-normalize",no_argument,NULL,'N'}, @@ -2425,10 +2538,11 @@ int main_vcfnorm(int argc, char *argv[]) {"strict-filter",no_argument,NULL,'s'}, {"no-version",no_argument,NULL,8}, {"write-index",optional_argument,NULL,'W'}, + {"verbose",required_argument,NULL,'v'}, {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNag:W::",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNag:W::v:S:i:e:",loptions,NULL)) >= 0) { switch (c) { case 10: // possibly generalize this also to INFO/AD and other tags @@ -2437,7 +2551,22 @@ int main_vcfnorm(int argc, char *argv[]) args->keep_sum_ad = 1; // this will be set to the header id or -1 in init_data break; case 'g': args->gff_fname = optarg; break; + case 'v': + args->gff_verbosity = atoi(optarg); + if ( args->gff_verbosity<0 || args->gff_verbosity>2 ) error("Error: expected integer 0-2 with -v, --verbose\n"); + break; case 'a': args->atomize = SPLIT; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'S': + if ( !strcasecmp(optarg,"pos") ) args->cmp_func = cmp_bcf_pos; + else if ( !strcasecmp(optarg,"lex") ) args->cmp_func = cmp_bcf_pos_ref_alt; + else error("Error: the sort order --sort %s is not recognised\n",optarg); + break; case 11 : if ( optarg[0]=='*' ) args->use_star_allele = 1; else if ( optarg[0]=='.' 
) args->use_star_allele = 0; diff --git a/vcfquery.c b/vcfquery.c index b3a6b8299..7b1dd4391 100644 --- a/vcfquery.c +++ b/vcfquery.c @@ -98,6 +98,7 @@ static void init_data(args_t *args) if ( args->force_newline ) convert_set_option(args->convert, force_newline, 1); convert_set_option(args->convert, subset_samples, &args->smpl_pass); if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1); + if ( args->print_header>1 ) convert_set_option(args->convert, no_hdr_indices, 1); free(samples); int max_unpack = convert_max_unpack(args->convert); @@ -238,7 +239,7 @@ static void usage(void) fprintf(stderr, " --force-samples Only warn about unknown subset samples\n"); fprintf(stderr, " -F, --print-filtered STR Output STR for samples failing the -i/-e filtering expression\n"); fprintf(stderr, " -f, --format STRING See man page for details\n"); - fprintf(stderr, " -H, --print-header Print header\n"); + fprintf(stderr, " -H, --print-header Print header, -HH to omit column indices\n"); fprintf(stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n"); fprintf(stderr, " -l, --list-samples Print the list of samples and exit\n"); fprintf(stderr, " -N, --disable-automatic-newline Disable automatic addition of newline character when not present\n"); @@ -304,7 +305,7 @@ int main_vcfquery(int argc, char *argv[]) case 'F': args->print_filtered = optarg; break; case 'f': args->format_str = strdup(optarg); break; case 'N': args->force_newline = 0; break; - case 'H': args->print_header = 1; break; + case 'H': args->print_header++; break; case 'v': args->vcf_list = optarg; break; case 'c': error("The --collapse option is obsolete, pipe through `bcftools norm -c` instead.\n"); diff --git a/vcfroh.c b/vcfroh.c index a0802db7e..f1d1c86e9 100644 --- a/vcfroh.c +++ b/vcfroh.c @@ -254,7 +254,7 @@ static void init_data(args_t *args) { if ( *end!=',') error("Could not parse: --buffer-size %s\n", args->buffer_size); 
args->nbuf_olap = strtol(end+1,&end,10); - if ( *end || args->nbuf_olap<0 ) error("Could not parse: --bufer-size %s\n", args->buffer_size); + if ( *end || args->nbuf_olap<0 ) error("Could not parse: --buffer-size %s\n", args->buffer_size); } if ( tmp<0 ) args->nbuf_max = fabs(tmp)*1e6/(4+8*2)/args->roh_smpl->n; diff --git a/vcfsom.c b/vcfsom.c index db01d24fd..f7a5dbe6e 100644 --- a/vcfsom.c +++ b/vcfsom.c @@ -37,6 +37,7 @@ THE SOFTWARE. */ #include #include #include +#include #include #include "bcftools.h" @@ -83,10 +84,9 @@ typedef struct args_t; static void usage(void); -FILE *open_file(char **fname, const char *mode, const char *fmt, ...); -void mkdir_p(const char *fmt, ...); +FILE *open_file(char **fname, const char *mode, const char *fmt, ...) HTS_FORMAT(HTS_PRINTF_FMT, 3, 4); -char *msprintf(const char *fmt, ...) +char * HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) msprintf(const char *fmt, ...) { va_list ap; va_start(ap, fmt); diff --git a/vcfsort.c b/vcfsort.c index b355319a2..6e21f85be 100644 --- a/vcfsort.c +++ b/vcfsort.c @@ -1,6 +1,6 @@ /* vcfsort.c -- sort subcommand - Copyright (C) 2017-2023 Genome Research Ltd. + Copyright (C) 2017-2024 Genome Research Ltd. 
Author: Petr Danecek @@ -41,28 +41,48 @@ #include #include #include +#include +#include #include "kheap.h" #include "bcftools.h" +#define MAX_TMP_FILES_PER_LAYER 32 +#define MERGE_LAYERS 12 +#define MAX_TMP_FILES (MAX_TMP_FILES_PER_LAYER * MERGE_LAYERS) + typedef struct { char *fname; htsFile *fh; + BGZF *bgz; + size_t idx; bcf1_t *rec; + int is_merged; } blk_t; +typedef struct +{ + size_t len; + hts_pos_t pos; + int rid; + float qual; + uint8_t data[]; +} +packed_bcf_t; + typedef struct _args_t { bcf_hdr_t *hdr; char **argv, *fname, *output_fname, *tmp_dir; int argc, output_type, clevel; size_t max_mem, mem; - bcf1_t **buf; + packed_bcf_t **buf; uint8_t *mem_block; - size_t nbuf, mbuf, nblk; - blk_t *blk; - char *index_fn; + + size_t nbuf, mbuf, nblk, tmp_count; + blk_t blk[MAX_TMP_FILES]; + uint32_t tmp_layers[MERGE_LAYERS]; int write_index; } args_t; @@ -71,9 +91,9 @@ void clean_files(args_t *args) { int i; fprintf(stderr,"Cleaning\n"); - for (i=0; inblk; i++) + for (i=0; iblk + i; + blk_t *blk = &args->blk[i]; if ( blk->fname ) { unlink(blk->fname); @@ -84,7 +104,8 @@ void clean_files(args_t *args) } rmdir(args->tmp_dir); } -void clean_files_and_throw(args_t *args, const char *format, ...) +void HTS_FORMAT(HTS_PRINTF_FMT, 2, 3) HTS_NORETURN +clean_files_and_throw(args_t *args, const char *format, ...) { va_list ap; va_start(ap, format); @@ -95,6 +116,16 @@ void clean_files_and_throw(args_t *args, const char *format, ...) 
} int cmp_bcf_pos(const void *aptr, const void *bptr) +{ + bcf1_t *a = *((bcf1_t**)aptr); + bcf1_t *b = *((bcf1_t**)bptr); + if ( a->rid < b->rid ) return -1; + if ( a->rid > b->rid ) return 1; + if ( a->pos < b->pos ) return -1; + if ( a->pos > b->pos ) return 1; + return 0; +} +int cmp_bcf_pos_ref_alt(const void *aptr, const void *bptr) { bcf1_t *a = *((bcf1_t**)aptr); bcf1_t *b = *((bcf1_t**)bptr); @@ -118,33 +149,279 @@ int cmp_bcf_pos(const void *aptr, const void *bptr) return 0; } -void buf_flush(args_t *args) +static int cmp_packed_bcf_pos_ref_alt(const void *aptr, const void *bptr) { - if ( !args->nbuf ) return; + packed_bcf_t *a = *(packed_bcf_t **) aptr; + packed_bcf_t *b = *(packed_bcf_t **) bptr; - qsort(args->buf, args->nbuf, sizeof(*args->buf), cmp_bcf_pos); + if ( a->rid < b->rid ) return -1; + if ( a->rid > b->rid ) return 1; + if ( a->pos < b->pos ) return -1; + if ( a->pos > b->pos ) return 1; + + // Sort lexicographically by ref,alt. These are stored tab-separated + // as the first item in packed_bcf_t::data + return strcmp((char *) a->data, (char *) b->data); +} - args->nblk++; - args->blk = (blk_t*) realloc(args->blk, sizeof(blk_t)*args->nblk); - if ( !args->blk ) error("Error: could not allocate %zu bytes of memory, try reducing --max-mem\n",sizeof(blk_t)*args->nblk); - blk_t *blk = args->blk + args->nblk - 1; +static int cmp_packed_bcf_pos_ref_alt_stable(const void *aptr, const void *bptr) +{ + // cmp_bcf_pos_ref_alt() with tie-breaker to make qsort stable + int res = cmp_packed_bcf_pos_ref_alt(aptr, bptr); + if (res != 0) return res; + + // Got a tie - use the position in the original input to break it. + // As everything is read into a big memory buffer, for most records + // we can just compare the pointers directly. The exception is + // any record that didn't quite fit in the memory buffer, causing it to be + // flushed. 
Those are flagged by setting packed_bcf_t::len = SIZE_MAX, and + // as they were the last record in the segment, they should always sort + // after unflagged records. + + packed_bcf_t *a = *(packed_bcf_t **) aptr; + packed_bcf_t *b = *(packed_bcf_t **) bptr; + + if (a->len == SIZE_MAX) return 1; + if (b->len == SIZE_MAX) return -1; + + return a < b ? -1 : 1; +} + +static uint8_t *pack_unsigned(uint8_t *data, uint64_t val) +{ + do { + *data++ = (val & 0x7f) | ((val > 0x7f) ? 0x80 : 0); + val >>= 7; + } while (val > 0); + return data; +} + +static uint8_t *pack_hts_pos(uint8_t *data, hts_pos_t val) +{ + uint64_t sign = val < 0; + uint64_t v = val < 0 ? -(val + 1) : val; + v = v << 1 | sign; + return pack_unsigned(data, v); +} + +static uint8_t *pack_bcf_data(packed_bcf_t *dest, const bcf1_t *src, + int outside_buffer) +{ + uint32_t i; + uint8_t *data = dest->data; + uint8_t *start = dest->data; + dest->pos = src->pos; + dest->rid = src->rid; + dest->qual = src->qual; + + // Copy in alleles, for the comparison function + for (i = 0; i < src->n_allele; i++) + { + size_t l = strlen(src->d.allele[i]); + if (i > 0) *data++ = '\t'; + memcpy(data, src->d.allele[i], l); + data += l; + } + *data++ = '\0'; + + if (outside_buffer) + { + dest->len = SIZE_MAX; + memcpy(data, &src, sizeof(src)); + data += sizeof(src); + return data; + } + + data = pack_hts_pos(data, src->rlen); + data = pack_unsigned(data, src->n_info); + data = pack_unsigned(data, src->n_allele); + data = pack_unsigned(data, src->n_fmt); + data = pack_unsigned(data, src->n_sample); + data = pack_unsigned(data, src->shared.l); + data = pack_unsigned(data, src->indiv.l); + if (src->shared.l) + memcpy(data, src->shared.s, src->shared.l); + data += src->shared.l; + if (src->indiv.l) + memcpy(data, src->indiv.s, src->indiv.l); + data += src->indiv.l; + dest->len = data - start; + return data; +} + +static int write_packed_bcf(BGZF *fp, packed_bcf_t *src) +{ + // Write pos, rid, qual + size_t len = src->data - (uint8_t 
*) &src->pos; + if (bgzf_write_small(fp, &src->pos, len) < len) + return -1; + + // Skip the copy of the alleles + size_t skip = strlen((char *) src->data) + 1; + + // Write everything else + if (src->len < SIZE_MAX) + { + // In main memory block + len = src->len - skip; + if (bgzf_write_small(fp, src->data + skip, len) < len) + return -1; + } + else + { + // Record didn't fit in the main block. To minimize the + // overflow, its packed_bcf_t data will be incomplete. A pointer to + // its bcf1_t struct will have been placed after the allele data + // so we can finish the packing job and write it in the same format as + // the rest of the data + bcf1_t *rec; + uint8_t tmp[100], *data = tmp; + memcpy(&rec, src->data + skip, sizeof(rec)); + + data = pack_hts_pos(data, rec->rlen); + data = pack_unsigned(data, rec->n_info); + data = pack_unsigned(data, rec->n_allele); + data = pack_unsigned(data, rec->n_fmt); + data = pack_unsigned(data, rec->n_sample); + data = pack_unsigned(data, rec->shared.l); + data = pack_unsigned(data, rec->indiv.l); + if (bgzf_write_small(fp, tmp, data - tmp) < data - tmp) + return -1; + if (rec->shared.l > 0 && + bgzf_write_small(fp, rec->shared.s, rec->shared.l) < rec->shared.l) + return -1; + if (rec->indiv.l > 0 && + bgzf_write_small(fp, rec->indiv.s, rec->indiv.l) < rec->indiv.l) + return -1; + } + + return 0; +} + +static uint64_t unpack_unsigned(BGZF *fp, int *err) +{ + uint8_t data; + uint64_t val = 0; + uint32_t i = 0; + + if (bgzf_read_small(fp, &data, sizeof(data)) <= 0) + goto short_read; + + while (data & 0x80) + { + val |= (uint64_t)(data & 0x7f) << i; + i += 7; + if (bgzf_read_small(fp, &data, sizeof(data)) <= 0) + goto short_read; + } + val |= (uint64_t)data << i; + return val; + + short_read: + *err = 1; + return 0; +} + +static hts_pos_t unpack_hts_pos(BGZF *fp, int *err) +{ + uint64_t v = unpack_unsigned(fp, err); + + if ((v & 1) == 0) + return (hts_pos_t)(v >> 1); + else + return -(hts_pos_t)(v >> 1) - 1; +} + +static int 
read_packed_bcf(BGZF *fp, bcf1_t *dest) +{ + int err = 0; + packed_bcf_t tmp; + size_t len = tmp.data - (uint8_t *) &tmp.pos; + + bcf_clear(dest); + ssize_t got = bgzf_read_small(fp, &tmp.pos, len); + if (got == 0) + return -1; // EOF + if (got < len) + return -2; // Error or short read + dest->pos = tmp.pos; + dest->rid = tmp.rid; + dest->qual = tmp.qual; + dest->rlen = unpack_hts_pos(fp, &err); + dest->n_info = unpack_unsigned(fp, &err); + dest->n_allele = unpack_unsigned(fp, &err); + dest->n_fmt = unpack_unsigned(fp, &err); + dest->n_sample = unpack_unsigned(fp, &err); + len = unpack_unsigned(fp, &err); + if (ks_resize(&dest->shared, len) != 0) + return -2; + dest->shared.l = len; + len = unpack_unsigned(fp, &err); + if (ks_resize(&dest->indiv, len) != 0) + return -2; + dest->indiv.l = len; + err |= bgzf_read_small(fp, dest->shared.s, dest->shared.l) < dest->shared.l; + err |= bgzf_read_small(fp, dest->indiv.s, dest->indiv.l) < dest->indiv.l; + return err == 0 ? 0 : -2; +} +void open_tmp_file(args_t *args, blk_t *blk, int is_merged) +{ kstring_t str = {0,0,0}; - ksprintf(&str, "%s/%05d.bcf", args->tmp_dir, (int)args->nblk); - blk->fname = str.s; - blk->rec = NULL; - blk->fh = NULL; + int tries = 1000; - htsFile *fh = hts_open(blk->fname, "wbu"); - if ( fh == NULL ) clean_files_and_throw(args, "Cannot write %s: %s\n", blk->fname, strerror(errno)); - if ( bcf_hdr_write(fh, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,blk->fname); + blk->fh = NULL; + blk->bgz = NULL; + + do { + if (ksprintf(ks_clear(&str), "%s/%05zd%s", + args->tmp_dir, args->tmp_count++, + is_merged ? 
".bcf" : "") < 0) { + clean_files_and_throw(args, "%s", strerror(errno)); + } + + if (is_merged) + blk->fh = hts_open(str.s, "wbx1"); + else + blk->bgz = bgzf_open(str.s, "wx1"); + if ( blk->fh == NULL && blk->bgz == NULL && (errno != EEXIST || --tries <= 0)) { + clean_files_and_throw(args, "Cannot write %s: %s\n", + str.s, strerror(errno)); + } + } while (blk->fh == NULL && blk->bgz == NULL); + + blk->fname = ks_release(&str); + blk->idx = args->tmp_count - 1; +} + +void do_partial_merge(args_t *args); + +void buf_flush(args_t *args, bcf1_t *last_rec) +{ + if ( !args->nbuf ) return; + + qsort(args->buf, args->nbuf, sizeof(*args->buf), cmp_packed_bcf_pos_ref_alt_stable); + + if (args->tmp_layers[0] >= MAX_TMP_FILES_PER_LAYER) + do_partial_merge(args); + + assert(args->nblk < MAX_TMP_FILES); + blk_t *blk = &args->blk[args->nblk]; + blk->is_merged = 0; + args->nblk++; + args->tmp_layers[0]++; + assert(blk->fname == NULL && blk->fh == NULL && blk->bgz == NULL); + + open_tmp_file(args, blk, 0); int i; for (i=0; inbuf; i++) { - if ( bcf_write(fh, args->hdr, args->buf[i])!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,blk->fname); + if ( write_packed_bcf(blk->bgz, args->buf[i])!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,blk->fname); } - if ( hts_close(fh)!=0 ) clean_files_and_throw(args, "[%s] Error: close failed .. %s\n", __func__,blk->fname); + + if ( bgzf_close(blk->bgz)!=0 ) clean_files_and_throw(args, "[%s] Error: close failed .. 
%s\n", __func__,blk->fname); + blk->bgz = NULL; args->nbuf = 0; args->mem = 0; @@ -156,19 +433,38 @@ static inline uint8_t *_align_up(uint8_t *ptr) return (uint8_t*)(((size_t)ptr + 8 - 1) & ~((size_t)(8 - 1))); } +#define varint_size(X) ((sizeof(X) * 8 + 7) / 7) // worst case + void buf_push(args_t *args, bcf1_t *rec) { - size_t delta = sizeof(bcf1_t) + rec->shared.l + rec->indiv.l + rec->unpack_size[0] + rec->unpack_size[1] - + sizeof(*rec->d.allele)*rec->d.m_allele - + sizeof(bcf1_t*) // args->buf + size_t delta = sizeof(rec->pos) + + sizeof(rec->rid) + + sizeof(rec->qual) + + varint_size(rec->rlen) + + varint_size(2) // n_info + + varint_size(2) // n_allele + + varint_size(1) // n_fmt + + varint_size(3) // n_sample + + varint_size(rec->shared.l) + + varint_size(rec->indiv.l) + + rec->shared.l + rec->indiv.l + + rec->unpack_size[1] // Alleles + 8; // the number of _align_up() calls if ( delta > args->max_mem - args->mem ) { + packed_bcf_t *tmp = malloc(sizeof(*tmp) + rec->unpack_size[1] * sizeof(bcf1_t *)); + if (!tmp) + clean_files_and_throw(args, "[%s] Out of memory\n", __func__); + pack_bcf_data(tmp, rec, 1); + args->nbuf++; hts_expand(bcf1_t*, args->nbuf, args->mbuf, args->buf); - args->buf[args->nbuf-1] = rec; - buf_flush(args); + args->buf[args->nbuf-1] = tmp; + + buf_flush(args, rec); + + free(tmp); bcf_destroy(rec); return; } @@ -178,48 +474,13 @@ void buf_push(args_t *args, bcf1_t *rec) uint8_t *ptr_beg = args->mem_block + args->mem; uint8_t *ptr = _align_up(ptr_beg); - bcf1_t *new_rec = (bcf1_t*)ptr; - memcpy(new_rec,rec,sizeof(*rec)); - ptr += sizeof(*rec); - - // The array of allele pointers does not need alignment as bcf1_t is already padded to the biggest - // data type in the structure - char **allele = (char**)ptr; - ptr += rec->n_allele*sizeof(*allele); - - // This is just to prevent valgrind from complaining about memcpy, unpack_size is a high-water mark - // and the end may be uninitialized - delta = rec->d.allele[rec->n_allele-1] - 
rec->d.allele[0]; - while ( delta < rec->unpack_size[1] ) if ( !rec->d.als[delta++] ) break; - memcpy(ptr,rec->d.als,delta); - new_rec->d.als = (char*)ptr; - ptr = ptr + delta; + packed_bcf_t *packed_rec = (packed_bcf_t *) ptr; - int i; - for (i=0; in_allele; i++) allele[i] = new_rec->d.als + (ptrdiff_t)(rec->d.allele[i] - rec->d.allele[0]); - new_rec->d.allele = allele; - - memcpy(ptr,rec->shared.s,rec->shared.l); - new_rec->shared.s = (char*)ptr; - new_rec->shared.m = rec->shared.l; - ptr += rec->shared.l; - - memcpy(ptr,rec->indiv.s,rec->indiv.l); - new_rec->indiv.s = (char*)ptr; - new_rec->indiv.m = rec->indiv.l; - ptr += rec->indiv.l; - - // This is just to prevent valgrind from complaining about memcpy, unpack_size is a high-water mark - // and the end may be uninitialized - i = 0; - while ( i < rec->unpack_size[0] ) if ( !rec->d.id[i++] ) break; - memcpy(ptr,rec->d.id,i); - new_rec->d.id = (char*)ptr; - ptr += i; + ptr = pack_bcf_data(packed_rec, rec, 0); args->nbuf++; hts_expand(bcf1_t*, args->nbuf, args->mbuf, args->buf); - args->buf[args->nbuf-1] = new_rec; + args->buf[args->nbuf-1] = packed_rec; delta = ptr - ptr_beg; args->mem += delta; @@ -246,11 +507,11 @@ void sort_blocks(args_t *args) bcf_destroy(rec); break; } - if ( rec->errcode ) clean_files_and_throw(args,"Error encountered while parsing the input at %s:%d\n",bcf_seqname(args->hdr,rec),rec->pos+1); + if ( rec->errcode ) clean_files_and_throw(args,"Error encountered while parsing the input at %s:%"PRIhts_pos"\n",bcf_seqname(args->hdr,rec),rec->pos+1); bcf_unpack(rec, BCF_UN_STR); buf_push(args, rec); } - buf_flush(args); + buf_flush(args, NULL); free(args->buf); if ( hts_close(in)!=0 ) clean_files_and_throw(args,"Close failed: %s\n", args->fname); @@ -260,51 +521,80 @@ static inline int blk_is_smaller(blk_t **aptr, blk_t **bptr) { blk_t *a = *aptr; blk_t *b = *bptr; - int ret = cmp_bcf_pos(&a->rec, &b->rec); + int ret = cmp_bcf_pos_ref_alt(&a->rec, &b->rec); if ( ret < 0 ) return 1; + if (ret == 
0 && a->idx < b->idx) return 1; return 0; } KHEAP_INIT(blk, blk_t*, blk_is_smaller) void blk_read(args_t *args, khp_blk_t *bhp, bcf_hdr_t *hdr, blk_t *blk) { - if ( !blk->fh ) return; - int ret = bcf_read(blk->fh, hdr, blk->rec); + int ret; + if (blk->is_merged) + { + if ( !blk->fh ) return; + ret = bcf_read(blk->fh, hdr, blk->rec); + } + else + { + if ( !blk->bgz ) return; + ret = read_packed_bcf(blk->bgz, blk->rec); + } if ( ret < -1 ) clean_files_and_throw(args, "Error reading %s\n", blk->fname); if ( ret == -1 ) { - if ( hts_close(blk->fh)!=0 ) clean_files_and_throw(args, "Close failed: %s\n", blk->fname); - blk->fh = 0; - return; + if (blk->is_merged) + { + if ( hts_close(blk->fh)!=0 ) clean_files_and_throw(args, "Close failed: %s\n", blk->fname); + blk->fh = NULL; + return; + } + else + { + if ( bgzf_close(blk->bgz) != 0) + clean_files_and_throw(args, "Close failed: %s\n", blk->fname); + blk->bgz = NULL; + return; + } } bcf_unpack(blk->rec, BCF_UN_STR); khp_insert(blk, bhp, &blk); } -void merge_blocks(args_t *args) +void merge_blocks(args_t *args, htsFile *out, const char *output_fname, + int idx_fmt, size_t from) { - fprintf(stderr,"Merging %d temporary files\n", (int)args->nblk); khp_blk_t *bhp = khp_init(blk); + char *index_fn = NULL; + size_t i; - int i; - for (i=0; inblk; i++) + for (i=from; inblk; i++) { - blk_t *blk = args->blk + i; - blk->fh = hts_open(blk->fname, "r"); - if ( !blk->fh ) clean_files_and_throw(args, "Could not read %s: %s\n", blk->fname, strerror(errno)); - bcf_hdr_t *hdr = bcf_hdr_read(blk->fh); - bcf_hdr_destroy(hdr); - blk->rec = bcf_init(); + blk_t *blk = &args->blk[i]; + if (blk->is_merged) + { + blk->fh = hts_open(blk->fname, "r"); + if ( !blk->fh ) clean_files_and_throw(args, "Could not read %s: %s\n", blk->fname, strerror(errno)); + bcf_hdr_t *hdr = bcf_hdr_read(blk->fh); + bcf_hdr_destroy(hdr); + } + else + { + blk->bgz = bgzf_open(blk->fname, "r"); + if (!blk->bgz) + clean_files_and_throw(args, "Could not read %s: %s\n", 
blk->fname, strerror(errno)); + } blk_read(args, bhp, args->hdr, blk); } - char wmode[8]; - set_wmode(wmode,args->output_type,args->output_fname,args->clevel); - htsFile *out = hts_open(args->output_fname ? args->output_fname : "-", wmode); - if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname); - if ( init_index2(out,args->hdr,args->output_fname,&args->index_fn, - args->write_index)<0 ) - error("Error: failed to initialise index for %s\n",args->output_fname); + if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__, output_fname); + + if (idx_fmt) { + if ( init_index2(out,args->hdr,output_fname,&index_fn,idx_fmt)<0 ) + error("Error: failed to initialise index for %s\n",output_fname); + } + while ( bhp->ndat ) { blk_t *blk = bhp->dat[0]; @@ -312,22 +602,93 @@ void merge_blocks(args_t *args) khp_delete(blk, bhp); blk_read(args, bhp, args->hdr, blk); } - if ( args->write_index ) + if ( idx_fmt ) { if ( bcf_idx_save(out)<0 ) { - if ( hts_close(out)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); - error("Error: cannot write to index %s\n", args->index_fn); + if ( hts_close(out)!=0 ) error("Error: close failed .. %s\n", output_fname); + error("Error: cannot write to index %s\n", index_fn); } - free(args->index_fn); + free(index_fn); } - if ( hts_close(out)!=0 ) clean_files_and_throw(args, "Close failed: %s\n", args->output_fname); - clean_files(args); + for (i = from; i < args->nblk; i++) + { + blk_t *blk = &args->blk[i]; + if (unlink(blk->fname) != 0) + clean_files_and_throw(args, "Couldn't remove temporary file %s\n", blk->fname); + free(blk->fname); + blk->fname = NULL; + } - free(args->blk); khp_destroy(blk, bhp); +} + +void do_partial_merge(args_t *args) +{ + uint32_t to_layer = 0; + size_t to_merge = 0; + + // Temp. files are arranged in layers of at most MAX_TMP_FILES_PER_LAYER. 
+ // When a layer is full, it is merged into the next layer up. Each + // layer will therefore contain files with exponentially more records + // than the previous one, but will be merged exponentially less frequently. + // The result is that the overall complexity will remain O(n*log(n)) + // even if we need to do lots of partial merges. + + while (to_layer < MERGE_LAYERS + && args->tmp_layers[to_layer] >= MAX_TMP_FILES_PER_LAYER) + { + to_merge += args->tmp_layers[to_layer]; + args->tmp_layers[to_layer] = 0; + to_layer++; + } + + assert(to_merge > 0 && to_merge <= args->nblk); + + if (to_layer == MERGE_LAYERS) { + // Edge case - if we've got here, we've completely used the + // temp file allocation, so merge absolutely everything and + // leave one file at the highest level. Strictly this breaks + // the O(n*log(n)) complexity, but unless MERGE_LAYERS and + // MAX_TMP_FILES_PER_LAYER are too small it would take so long + // to get here it should never actually happen... + assert(to_merge == MAX_TMP_FILES_PER_LAYER * MERGE_LAYERS); + to_layer = MERGE_LAYERS - 1; + } + + blk_t tmp = { NULL }; + open_tmp_file(args, &tmp, 1); + merge_blocks(args, tmp.fh, tmp.fname, 0, args->nblk - to_merge); + if (hts_close(tmp.fh) != 0) + clean_files_and_throw(args, "Close failed: %s\n", tmp.fname); + + args->nblk -= to_merge; + assert(args->blk[args->nblk].fh == NULL); + assert(args->blk[args->nblk].fname == NULL); + args->blk[args->nblk].is_merged = 1; + args->blk[args->nblk].idx = tmp.idx; + args->blk[args->nblk++].fname = tmp.fname; + args->tmp_layers[to_layer]++; +} + +void merge_to_output(args_t *args) +{ + char wmode[8] = { 0 }; + set_wmode(wmode,args->output_type,args->output_fname,args->clevel); + const char *output_fname = args->output_fname ? 
args->output_fname : "-"; + + htsFile *out = hts_open(output_fname, wmode); + if (!out) clean_files_and_throw(args, "[%s] Error: cannot open %s\n", __func__, output_fname); + + fprintf(stderr,"Merging %zd temporary files\n", args->nblk); + merge_blocks(args, out, output_fname, args->write_index, 0); fprintf(stderr,"Done\n"); + + if ( hts_close(out)!=0 ) + clean_files_and_throw(args, "Close failed: %s\n", output_fname); + + clean_files(args); } static void usage(args_t *args) @@ -362,14 +723,23 @@ size_t parse_mem_string(const char *str) return mem; } -void mkdir_p(const char *fmt, ...); static void init(args_t *args) { + size_t i; args->max_mem *= 0.9; args->mem_block = malloc(args->max_mem); if ( !args->mem_block ) error("Error: could not allocate %zu bytes of memory, try reducing --max-mem\n",args->max_mem); args->mem = 0; + for (i = 0; i < MAX_TMP_FILES; i++) + { + args->blk[i].fname = NULL; + args->blk[i].rec = bcf_init(); + if (!args->blk[i].rec) + clean_files_and_throw(args,"Couldn't allocate bcf record\n"); + } + + args->tmp_dir = init_tmp_prefix(args->tmp_dir); #ifdef _WIN32 @@ -457,7 +827,7 @@ int main_sort(int argc, char *argv[]) init(args); sort_blocks(args); - merge_blocks(args); + merge_to_output(args); destroy(args); return 0; diff --git a/vcfstats.c b/vcfstats.c index e31ca2b30..38b4caf51 100644 --- a/vcfstats.c +++ b/vcfstats.c @@ -1,6 +1,6 @@ /* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats. - Copyright (C) 2012-2023 Genome Research Ltd. + Copyright (C) 2012-2024 Genome Research Ltd. 
Author: Petr Danecek @@ -408,7 +408,8 @@ static void init_user_stats(args_t *args, bcf_hdr_t *hdr, stats_t *stats) { stats->nusr = args->nusr; stats->usr = (user_stats_t*)malloc(sizeof(user_stats_t)*args->nusr); - memcpy(stats->usr,args->usr,args->nusr*sizeof(user_stats_t)); + if (args->nusr) + memcpy(stats->usr,args->usr,args->nusr*sizeof(user_stats_t)); int i; for (i=0; inusr; i++) { @@ -1376,7 +1377,10 @@ static void print_stats(args_t *args) printf("SN\t%d\tnumber of multiallelic sites:\t%"PRIu64"\n", id, stats->n_mals); printf("SN\t%d\tnumber of multiallelic SNP sites:\t%"PRIu64"\n", id, stats->n_snp_mals); } - printf("# TSTV, transitions/transversions:\n# TSTV\t[2]id\t[3]ts\t[4]tv\t[5]ts/tv\t[6]ts (1st ALT)\t[7]tv (1st ALT)\t[8]ts/tv (1st ALT)\n"); + printf("# TSTV, transitions/transversions\n" + "# - transitions, see https://en.wikipedia.org/wiki/Transition_(genetics)\n" + "# - transversions, see https://en.wikipedia.org/wiki/Transversion\n"); + printf("# TSTV\t[2]id\t[3]ts\t[4]tv\t[5]ts/tv\t[6]ts (1st ALT)\t[7]tv (1st ALT)\t[8]ts/tv (1st ALT)\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; @@ -1396,7 +1400,9 @@ static void print_stats(args_t *args) } if ( args->indel_ctx ) { - printf("# ICS, Indel context summary:\n# ICS\t[2]id\t[3]repeat-consistent\t[4]repeat-inconsistent\t[5]not applicable\t[6]c/(c+i) ratio\n"); + printf("# ICS, Indel context:\n" + "# - repeat-consistent, inconsistent and n/a: experimental and useless stats [DEPRECATED]\n"); + printf("# ICS\t[2]id\t[3]repeat-consistent\t[4]repeat-inconsistent\t[5]not applicable\t[6]c/(c+i) ratio\n"); for (id=0; idnstats; id++) { int nc = 0, ni = 0, na = args->stats[id].n_repeat_na; @@ -1407,7 +1413,9 @@ static void print_stats(args_t *args) } printf("ICS\t%d\t%d\t%d\t%d\t%.4f\n", id, nc,ni,na,nc+ni ? 
(float)nc/(nc+ni) : 0.0); } - printf("# ICL, Indel context by length:\n# ICL\t[2]id\t[3]length of repeat element\t[4]repeat-consistent deletions)\t[5]repeat-inconsistent deletions\t[6]consistent insertions\t[7]inconsistent insertions\t[8]c/(c+i) ratio\n"); + printf("# ICL, Indel context by length:\n" + "# - repeat-consistent, inconsistent and n/a: experimental and useless stats [DEPRECATED]\n"); + printf("# ICL\t[2]id\t[3]length of repeat element\t[4]repeat-consistent deletions)\t[5]repeat-inconsistent deletions\t[6]consistent insertions\t[7]inconsistent insertions\t[8]c/(c+i) ratio\n"); for (id=0; idnstats; id++) { for (i=1; instats; id++) { stats_t *stats = &args->stats[id]; @@ -1516,7 +1529,7 @@ static void print_stats(args_t *args) { if ( usr->vals_ts[j]+usr->vals_tv[j] == 0 ) continue; // skip empty bins float val = usr->min + (usr->max - usr->min)*j/(usr->nbins-1); - const char *fmt = usr->type==BCF_HT_REAL ? "USR:%s/%d\t%d\t%e\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n" : "USR:%s/%d\t%d\t%.0f\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n"; + const char * const fmt = usr->type==BCF_HT_REAL ? 
"USR:%s/%d\t%d\t%e\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n" : "USR:%s/%d\t%d\t%.0f\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n"; printf(fmt,usr->tag,usr->idx,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]); } } @@ -1734,6 +1747,13 @@ static void print_stats(args_t *args) } } + printf("# DP, depth:\n" + "# - set id, see above\n" + "# - the depth bin, corresponds to the depth (unless --depth was given)\n" + "# - number of genotypes with this depth (zero unless -s/-S was given)\n" + "# - fraction of genotypes with this depth (zero unless -s/-S was given)\n" + "# - number of sites with this depth\n" + "# - fraction of sites with this depth\n"); printf("# DP, Depth distribution\n# DP\t[2]id\t[3]bin\t[4]number of genotypes\t[5]fraction of genotypes (%%)\t[6]number of sites\t[7]fraction of sites (%%)\n"); for (id=0; idnstats; id++) { diff --git a/version.c b/version.c index cf2c6d512..2defb4fbb 100644 --- a/version.c +++ b/version.c @@ -88,7 +88,8 @@ void set_wmode(char dst[8], int file_type, const char *fname, int clevel) const char *end = fname ? strstr(fname, HTS_IDX_DELIM) : NULL; if ( !end ) end = fname ? fname + strlen(fname) : fname; int len = end - fname; - if ( len >= 4 && !strncasecmp(".bcf",fname+len-4,4) ) ret = hts_bcf_wmode(FT_BCF|FT_GZ); + if ( len >= 4 && !strncasecmp(".bcf",fname+len-4,4) ) + ret = hts_bcf_wmode(file_type & FT_BCF ? file_type : FT_BCF|FT_GZ); else if ( len >= 4 && !strncasecmp(".vcf",fname+len-4,4) ) ret = hts_bcf_wmode(FT_VCF); else if ( len >= 7 && !strncasecmp(".vcf.gz",fname+len-7,7) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ); else if ( len >= 8 && !strncasecmp(".vcf.bgz",fname+len-8,8) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ); diff --git a/version.sh b/version.sh index 87816bc28..007c916a2 100755 --- a/version.sh +++ b/version.sh @@ -24,7 +24,7 @@ # DEALINGS IN THE SOFTWARE. 
# Master version, for use in tarballs or non-git source copies -VERSION=1.20 +VERSION=1.21 # If we have a git clone, then check against the current tag if [ -e .git ]