diff --git a/INSTALL b/INSTALL
index bcdd2f4e3..379c5aa7e 100644
--- a/INSTALL
+++ b/INSTALL
@@ -232,8 +232,10 @@ Alpine Linux
 Note: To install gsl-dev, it may be necessary to enable the "community"
 repository in /etc/apk/repositories.
 
+Note: some older Alpine versions use libressl-dev rather than openssl-dev.
+
 doas apk update  # Ensure the package list is up to date
-doas apk add autoconf automake make gcc musl-dev perl bash zlib-dev bzip2-dev xz-dev curl-dev libressl-dev gsl-dev perl-dev
+doas apk add autoconf automake make gcc musl-dev perl bash zlib-dev bzip2-dev xz-dev curl-dev openssl-dev gsl-dev perl-dev
 
 OpenSUSE
 --------
diff --git a/LICENSE b/LICENSE
index 6d40ae2d1..46dc0e0e3 100644
--- a/LICENSE
+++ b/LICENSE
@@ -723,11 +723,12 @@ Public License instead of this License.  But first, please read
 
 -----------------------------------------------------------------------------
 
-LICENSE FOR VariantKey (https://github.com/Genomicsplc/variantkey)
+LICENSE FOR VariantKey (https://github.com/tecnickcom/variantkey)
 
 The MIT License
 
 Copyright (c) 2017-2018 GENOMICS plc
+Copyright (c) 2018-2023 Nicola Asuni - Tecnick.com
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/Makefile b/Makefile
index b0cd99ead..7013cd594 100644
--- a/Makefile
+++ b/Makefile
@@ -42,7 +42,7 @@ OBJS     = main.o vcfindex.o tabix.o \
            regidx.o smpl_ilist.o csq.o vcfbuf.o \
            mpileup.o bam2bcf.o bam2bcf_indel.o bam2bcf_iaux.o read_consensus.o bam_sample.o \
            vcfsort.o cols.o extsort.o dist.o abuf.o \
-           ccall.o em.o prob1.o kmin.o str_finder.o
+           ccall.o em.o prob1.o kmin.o str_finder.o gff.o
 PLUGIN_OBJS = vcfplugin.o
 
 prefix      = /usr/local
@@ -104,7 +104,7 @@ endif
 
 include config.mk
 
-PACKAGE_VERSION = 1.17
+PACKAGE_VERSION = 1.18
 
 # If building from a Git repository, replace $(PACKAGE_VERSION) with the Git
 # description of the working tree: either a release tag with the same value
@@ -246,7 +246,7 @@ vcfgtcheck.o: vcfgtcheck.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htsli
 vcfindex.o: vcfindex.c $(htslib_vcf_h) $(htslib_tbx_h) $(htslib_kstring_h) $(htslib_bgzf_h) $(bcftools_h)
 vcfisec.o: vcfisec.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcftools_h) $(filter_h)
 vcfmerge.o: vcfmerge.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) regidx.h $(bcftools_h) vcmp.h $(htslib_khash_h)
-vcfnorm.o: vcfnorm.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_faidx_h) $(htslib_khash_str2int_h) $(bcftools_h) rbuf.h abuf.h
+vcfnorm.o: vcfnorm.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_faidx_h) $(htslib_khash_str2int_h) $(bcftools_h) rbuf.h abuf.h gff.h
 vcfquery.o: vcfquery.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_str2int_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(convert_h) $(smpl_ilist_h)
 vcfroh.o: vcfroh.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(bcftools_h) HMM.h $(smpl_ilist_h) $(filter_h)
 vcfcnv.o: vcfcnv.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(htslib_khash_str2int_h) $(bcftools_h) HMM.h rbuf.h
@@ -289,6 +289,7 @@ vcfbuf.o: vcfbuf.c $(htslib_vcf_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcf
 abuf.o: abuf.c $(htslib_vcf_h) $(bcftools_h) rbuf.h abuf.h
 extsort.o: extsort.c $(bcftools_h) extsort.h kheap.h
 smpl_ilist.o: smpl_ilist.c $(bcftools_h) $(smpl_ilist_h)
+gff.o: gff.c gff.h regidx.h
 csq.o: csq.c $(htslib_hts_h) $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_h) $(htslib_khash_str2int_h) $(htslib_kseq_h) $(htslib_faidx_h) $(bcftools_h) $(filter_h) regidx.h kheap.h $(smpl_ilist_h) rbuf.h
 
 # test programs
diff --git a/NEWS b/NEWS
index 06c0593ca..62c4699ac 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,121 @@
+## Release 1.18 (25th July 2023)
+
+
+Changes affecting the whole of bcftools, or multiple commands:
+
+* Support auto indexing during writing BCF and VCF.gz via new `--write-index` option
+
+
+Changes affecting specific commands:
+
+* bcftools annotate
+
+    - The `-m, --mark-sites` option can now be used to mark all sites without the
+      need to provide the `-a` file (#1861)
+
+    - Fix a bug where the `-m` function did not respect the `--min-overlap` option (#1869)
+
+    - Fix a bug where updating INFO/END resulted in an assertion error (#1957)
+
+* bcftools concat
+
+    - New option `--drop-genotypes`
+
+* bcftools consensus
+
+    - Support higher-ploidy genotypes with `-H, --haplotype` (#1892)
+
+    - Allow `--mark-ins` and `--mark-snv` with a character, similarly to `--mark-del`
+
+* bcftools convert
+
+    - Support for conversion from tab-delimited files (CHROM,POS,REF,ALT) to sites-only VCFs
+
+* bcftools csq
+
+    - New `--unify-chr-names` option to automatically unify different chromosome
+      naming conventions in the input GFF, fasta and VCF files (e.g. "chrX" vs "X")
+
+    - More versatility in parsing various flavors of GFF
+
+    - A new `--dump-gff` option to help with debugging and investigating the internals
+      of GFF parsing
+
+    - When printing consequences in nonsense mediated decay transcripts, include 'NMD_transcript'
+      in the consequence part of the annotation. This is to make filtering easier and analogous to
+      VEP annotations. For example the consequence annotation
+            3_prime_utr|PCGF3|ENST00000430644|NMD
+      is newly printed as
+            3_prime_utr&NMD_transcript|PCGF3|ENST00000430644|NMD
+
+* bcftools gtcheck
+
+    - Add stats for the number of sites matched in the GT-vs-GT, GT-vs-PL, etc modes. This
+      information is important for interpretation of the discordance score, as only the
+      GT-vs-GT matching can be interpreted as the number of mismatching genotypes.
+
+* bcftools +mendelian2
+
+    - Fix in command line argument parsing, the `-p` and `-P` options were not
+      functioning (#1906)
+
+* bcftools merge
+
+    - New `-M, --missing-rules` option to control the behavior of merging of vector tags
+      to prevent mixtures of known and missing values in tags when desired
+
+    - Use values pertaining to the unknown allele (<*> or <NON_REF>) when available
+      to prevent mixtures of known and missing values (#1888)
+
+    - Revamped line matching code to fix problems in gVCF merging where split gVCF blocks
+      would not update genotypes (#1891, #1164).
+
+* bcftools mpileup
+
+    - Fix a bug in --indels-v2.0 which caused an endless loop when CIGAR operator 'H' or 'P'
+      was encountered
+
+* bcftools norm
+
+    - The `-m, --multiallelics +` mode now preserves phasing (#1893)
+
+    - Symbolic <DEL.*> alleles are now normalized too (#1919)
+
+    - New `-g, --gff-annot` option to right-align indels in forward transcripts to follow
+      the HGVS 3' rule (#1929)
+
+* bcftools query
+
+    - Force newline character in formatting expression when not given explicitly
+
+    - Fix `-H` header output in formatting expressions containing newlines
+
+* bcftools reheader
+
+    - Make `-f, --fai` aware of long contigs not representable by 32-bit integer (#1959)
+
+* bcftools +split-vep
+
+    - Prevent a segfault when `-i/-e` use a VEP subfield not included in `-f` or `-c` (#1877)
+
+    - New `-X, --keep-sites` option complementing the existing `-x, --drop-sites` option
+
+    - Force newline character in formatting expression when not given explicitly
+
+    - Fix a subtle ambiguity: identical rows must be returned when `-s` is applied regardless
+      of `-f` containing the `-a` VEP tag itself or not.
+
+* bcftools stats
+
+    - Collect new VAF (variant allele frequency) statistics from FORMAT/AD field
+
+    - When counting transitions/transversions, consider also alternate het genotypes
+
+* plot-vcfstats
+
+    - Add three new VAF plots
+
+
 ## Release 1.17 (21st February 2023)
 
 
diff --git a/bcftools.h b/bcftools.h
index c3f7ded16..bba71e3b6 100644
--- a/bcftools.h
+++ b/bcftools.h
@@ -1,6 +1,6 @@
 /*  bcftools.h -- utility function declarations.
 
-    Copyright (C) 2013-2022 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -49,6 +49,9 @@ void error(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2
 //  newline will be added by the function.
 void error_errno(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2);
 
+// For on the fly index creation with --write-index
+int init_index(htsFile *fh, bcf_hdr_t *hdr, char *fname, char **idx_fname);
+
 void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd);
 const char *hts_bcf_wmode(int file_type);
 const char *hts_bcf_wmode2(int file_type, const char *fname);
diff --git a/cigar_state.h b/cigar_state.h
index a12a70995..dacac14ac 100644
--- a/cigar_state.h
+++ b/cigar_state.h
@@ -107,6 +107,12 @@ static inline int cstate_seek_fwd(cigar_state_t *cs, hts_pos_t *pos_ptr, int tri
             cs->icig++;
             continue;
         }
+        if ( op==BAM_CHARD_CLIP || op==BAM_CPAD )
+        {
+            cs->icig++;
+            continue;
+        }
+        error("FIXME: not ready for CIGAR operator %d\n",op);
     }
     // the read starts after pos
     if ( trim_left )
@@ -175,6 +181,12 @@ static inline int cstate_seek_op_fwd(cigar_state_t *cs, hts_pos_t pos, int seek_
             cs->icig++;
             continue;
         }
+        if ( op==BAM_CHARD_CLIP || op==BAM_CPAD )
+        {
+            cs->icig++;
+            continue;
+        }
+        error("FIXME: not ready for CIGAR operator %d\n",op);
     }
     return cs->icig < cs->ncig ? -1 : -2;
 }
diff --git a/consensus.c b/consensus.c
index 397d45f98..2b58670c7 100644
--- a/consensus.c
+++ b/consensus.c
@@ -54,8 +54,8 @@
 #define PICK_SHORT 8
 #define PICK_IUPAC 16
 
-#define TO_UPPER 0
-#define TO_LOWER 1
+#define TO_UPPER 1
+#define TO_LOWER 2
 
 typedef struct
 {
@@ -324,7 +324,7 @@ static void init_region(args_t *args, char *line)
 {
     char *ss, *se = line;
     while ( *se && !isspace(*se) && *se!=':' ) se++;
-    int from = 0, to = 0;
+    hts_pos_t from = 0, to = 0;
     char tmp = 0, *tmp_ptr = NULL;
     if ( *se )
     {
@@ -356,7 +356,14 @@ static void init_region(args_t *args, char *line)
     args->fa_frz_mod = -1;
     args->fa_case    = -1;
     args->vcf_rbuf.n = 0;
-    bcf_sr_seek(args->files,line,args->fa_ori_pos);
+
+    kstring_t str = {0,0,0};
+    if ( from==0 ) from = 1;                      // still 0 at this point: fall back to 1
+    if ( to==0 ) to = HTS_POS_MAX;                // still 0 at this point: fall back to the maximum position
+    ksprintf(&str,"%s:%"PRIhts_pos"-%"PRIhts_pos,line,from,to);   // formats "line:from-to" into str
+    bcf_sr_set_regions(args->files,line,0);       // NOTE(review): str is built above and freed below but never used; line is passed instead - confirm whether str.s was intended
+    free(str.s);
+
     if ( tmp_ptr ) *tmp_ptr = tmp;
     fprintf(args->fp_out,">%s%s\n",args->chr_prefix?args->chr_prefix:"",line);
     if ( args->chain_fname )
@@ -466,25 +473,37 @@ static char *mark_del(char *ref, int rlen, char *alt, int mark)
 static void mark_ins(char *ref, char *alt, char mark)
 {
     int i, nref = strlen(ref), nalt = strlen(alt);
-    if ( mark=='l' )
+    if ( mark==TO_LOWER )                          // lowercase the characters of alt that lie beyond the length of ref
         for (i=nref; i<nalt; i++) alt[i] = tolower(alt[i]);
-    else
+    else if ( mark==TO_UPPER )                     // uppercase the same characters instead
         for (i=nref; i<nalt; i++) alt[i] = toupper(alt[i]);
+    else if ( mark )                               // any other non-zero mark is written verbatim over those characters
+        for (i=nref; i<nalt; i++) alt[i] = mark;   // mark==0 falls through all branches and leaves alt untouched
 }
 static void mark_snv(char *ref, char *alt, char mark)
 {
     int i, nref = strlen(ref), nalt = strlen(alt);
     int n = nref < nalt ? nref : nalt;
-    if ( mark=='l' )
+    if ( mark==TO_LOWER )                          // lowercase every alt character that differs (case-insensitively) from ref
     {
         for (i=0; i<n; i++)
             if ( tolower(ref[i])!=tolower(alt[i]) ) alt[i] = tolower(alt[i]);
     }
-    else
+    else if ( mark==TO_UPPER)                      // uppercase the differing characters instead
     {
         for (i=0; i<n; i++)
             if ( tolower(ref[i])!=tolower(alt[i]) ) alt[i] = toupper(alt[i]);
     }
+    else if ( mark==TO_UPPER)                      // NOTE(review): same condition as the branch above, so this branch can never be taken
+    {
+        for (i=0; i<n; i++)
+            if ( tolower(ref[i])!=tolower(alt[i]) ) alt[i] = toupper(alt[i]);
+    }
+    else if ( mark )                               // any other non-zero mark replaces the differing characters verbatim
+    {
+        for (i=0; i<n; i++)                        // only the first min(nref,nalt) characters are compared
+            if ( tolower(ref[i])!=tolower(alt[i]) ) alt[i] = mark;
+    }
 }
 static void iupac_init(args_t *args, bcf1_t *rec)
 {
@@ -1099,19 +1118,18 @@ static void usage(args_t *args)
     fprintf(stderr, "    -f, --fasta-ref FILE           Reference sequence in fasta format\n");
     fprintf(stderr, "    -H, --haplotype WHICH          Choose which allele to use from the FORMAT/GT field, note\n");
     fprintf(stderr, "                                   the codes are case-insensitive:\n");
-    fprintf(stderr, "                                       1: first allele from GT, regardless of phasing\n");
-    fprintf(stderr, "                                       2: second allele from GT, regardless of phasing\n");
+    fprintf(stderr, "                                       N: N={1,2,3,..} is the index of the allele from GT, regardless of phasing (e.g. \"2\")\n");
     fprintf(stderr, "                                       R: REF allele in het genotypes\n");
     fprintf(stderr, "                                       A: ALT allele\n");
     fprintf(stderr, "                                       I: IUPAC code for all genotypes\n");
     fprintf(stderr, "                                       LR,LA: longer allele and REF/ALT if equal length\n");
     fprintf(stderr, "                                       SR,SA: shorter allele and REF/ALT if equal length\n");
-    fprintf(stderr, "                                       1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n");
+    fprintf(stderr, "                                       NpIu: index of the allele for phased and IUPAC code for unphased GTs (e.g. \"2pIu\")\n");
     fprintf(stderr, "    -i, --include EXPR             Select sites for which the expression is true (see man page for details)\n");
     fprintf(stderr, "    -I, --iupac-codes              Output IUPAC codes based on FORMAT/GT, use -s/-S to subset samples\n");
-    fprintf(stderr, "        --mark-del CHAR            Instead of removing sequence, insert CHAR for deletions\n");
-    fprintf(stderr, "        --mark-ins uc|lc           Highlight insertions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
-    fprintf(stderr, "        --mark-snv uc|lc           Highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
+    fprintf(stderr, "        --mark-del CHAR            Instead of removing sequence, insert character CHAR for deletions\n");
+    fprintf(stderr, "        --mark-ins uc|lc|CHAR      Highlight insertions in uppercase (uc), lowercase (lc), or use CHAR, leaving the rest as is\n");
+    fprintf(stderr, "        --mark-snv uc|lc|CHAR      Highlight substitutions in uppercase (uc), lowercase (lc), or use CHAR, leaving the rest as is\n");
     fprintf(stderr, "    -m, --mask FILE                Replace regions according to the next --mask-with option. The default is --mask-with N\n");
     fprintf(stderr, "        --mask-with CHAR|uc|lc     Replace with CHAR (skips overlapping variants); change to uppercase (uc) or lowercase (lc)\n");
     fprintf(stderr, "    -M, --missing CHAR             Output CHAR instead of skipping a missing genotype \"./.\"\n");
@@ -1163,13 +1181,15 @@ int main_consensus(int argc, char *argv[])
         {
             case  1 : args->mark_del = optarg[0]; break;
             case  2 :
-                if ( !strcasecmp(optarg,"uc") ) args->mark_ins = 'u';
-                else if ( !strcasecmp(optarg,"lc") ) args->mark_ins = 'l';
+                if ( !strcasecmp(optarg,"uc") ) args->mark_ins = TO_UPPER;
+                else if ( !strcasecmp(optarg,"lc") ) args->mark_ins = TO_LOWER;
+                else if ( !optarg[1] && optarg[0]>32 && optarg[0]<127 ) args->mark_ins = optarg[0];
                 else error("The argument is not recognised: --mark-ins %s\n",optarg);
                 break;
             case  3 :
-                if ( !strcasecmp(optarg,"uc") ) args->mark_snv = 'u';
-                else if ( !strcasecmp(optarg,"lc") ) args->mark_snv = 'l';
+                if ( !strcasecmp(optarg,"uc") ) args->mark_snv = TO_UPPER;
+                else if ( !strcasecmp(optarg,"lc") ) args->mark_snv = TO_LOWER;
+                else if ( !optarg[1] && optarg[0]>32 && optarg[0]<127 ) args->mark_snv = optarg[0];
                 else error("The argument is not recognised: --mark-snv %s\n",optarg);
                 break;
             case 'p': args->chr_prefix = optarg; break;
@@ -1211,7 +1231,8 @@ int main_consensus(int argc, char *argv[])
                 {
                     char *tmp;
                     args->haplotype = strtol(optarg, &tmp, 10);
-                    if ( tmp==optarg || *tmp ) error("Error: Could not parse --haplotype %s, expected numeric argument\n", optarg);
+                    if ( tmp==optarg || (*tmp && strcasecmp(tmp,"pIu")) ) error("Error: Could not parse \"--haplotype %s\", expected a number or a number followed by \"pIu\"\n", optarg);
+                    if ( *tmp ) args->allele |= PICK_IUPAC;    // a "pIu" suffix followed the number: additionally set PICK_IUPAC
                     if ( args->haplotype <=0 ) error("Error: Expected positive integer with --haplotype\n");
                 }
                 break;
diff --git a/convert.c b/convert.c
index 80e54747d..07ff01862 100644
--- a/convert.c
+++ b/convert.c
@@ -106,6 +106,7 @@ struct _convert_t
     char **used_tags_list;
     int nused_tags;
     int allow_undef_tags;
+    int force_newline;
     uint8_t **subset_samples;
 };
 
@@ -648,6 +649,7 @@ static void process_type(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp
 static void process_line(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
 {
     vcf_format1(convert->header, line, str);
+    if ( str->s[str->l-1]=='\n' ) str->l--;    // drop a trailing newline from the formatted record; NOTE(review): reads s[l-1] without checking str->l > 0
 }
 static void process_chrom_pos_id(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
 {
@@ -1560,7 +1562,6 @@ void convert_destroy(convert_t *convert)
 int convert_header(convert_t *convert, kstring_t *str)
 {
     int i, icol = 0, l_ori = str->l;
-    bcf_hdr_t *hdr = convert->header;
 
     // Supress the header output if LINE is present
     for (i=0; i<convert->nfmt; i++)
@@ -1568,6 +1569,12 @@ int convert_header(convert_t *convert, kstring_t *str)
     if ( i!=convert->nfmt )
         return str->l - l_ori;
 
+    // Header formatting becomes problematic when the formatting expression contains a newline.
+    // Simple cases like
+    //      -f'[%CHROM %POS %SAMPLE\n]'
+    // can be handled quite easily with has_fmt_newline. Note this will not work if multiple newlines
+    // are present.
+    int has_fmt_newline = 0;
     kputc('#', str);
     for (i=0; i<convert->nfmt; i++)
     {
@@ -1578,18 +1585,25 @@ int convert_header(convert_t *convert, kstring_t *str)
             while ( convert->fmt[j].is_gt_field ) j++;
             for (js=0; js<convert->nsamples; js++)
             {
-                int ks = convert->samples[js];
                 for (k=i; k<j; k++)
                 {
                     if ( convert->fmt[k].type == T_SEP )
                     {
-                        if ( convert->fmt[k].key ) kputs(convert->fmt[k].key, str);
+                        if ( convert->fmt[k].key )
+                        {
+                            char *tmp = convert->fmt[k].key;
+                            while ( *tmp )
+                            {
+                                if ( *tmp=='\n' ) has_fmt_newline = 1;
+                                else kputc(*tmp,str);
+                                tmp++;
+                            }
+                        }
                     }
-                    else if ( convert->fmt[k].type == T_SAMPLE )
-                        ksprintf(str, "[%d]%s", ++icol, convert->fmt[k].key);
                     else
-                        ksprintf(str, "[%d]%s:%s", ++icol, hdr->samples[ks], convert->fmt[k].key);
+                        ksprintf(str, "[%d]%s", ++icol, convert->fmt[k].key);
                 }
+                if ( has_fmt_newline ) break;
             }
             i = j-1;
             continue;
@@ -1602,6 +1616,7 @@ int convert_header(convert_t *convert, kstring_t *str)
         }
         ksprintf(str, "[%d]%s", ++icol, convert->fmt[i].key);
     }
+    if ( has_fmt_newline ) kputc('\n',str);
     return str->l - l_ori;
 }
 
@@ -1678,6 +1693,47 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
     return str->l - l_ori;
 }
 
+static void force_newline_(convert_t *convert)    // registers a "\n" separator when no key of the parsed format contains a newline
+{
+    int i, has_newline = 0;
+    for (i=0; i<convert->nfmt; i++)                // first pass: look for a newline in any existing key
+    {
+        if ( !convert->fmt[i].key ) continue;
+        char *tmp = convert->fmt[i].key;
+        while (*tmp)
+        {
+            if ( *tmp=='\n' ) { has_newline = 1; break; }
+            tmp++;
+        }
+        if ( has_newline ) break;
+    }
+    if ( has_newline ) return;                     // the expression already carries a newline, leave it alone
+
+    // A newline is not present, force it. But where to add it?
+    // Consider
+    //      -f'%CHROM[ %SAMPLE]\n'
+    // vs
+    //      -f'[%CHROM %SAMPLE\n]'
+    for (i=0; i<convert->nfmt; i++)                // stop at the first keyed entry that is not flagged is_gt_field
+        if ( !convert->fmt[i].is_gt_field && convert->fmt[i].key ) break;
+
+    if ( i < convert->nfmt )
+        register_tag(convert, "\n", 0, T_SEP);  // the first case
+    else
+    {
+        // the second case
+        i = convert->nfmt - 1;                     // NOTE(review): with nfmt==0 this yields i==-1 and fmt[-1] is read below - confirm callers never pass an empty format
+        if ( !convert->fmt[i].key )
+        {
+            convert->fmt[i].key = strdup("\n");    // the key-less last entry is turned into the newline itself
+            convert->fmt[i].is_gt_field = 1;
+            register_tag(convert, NULL, 0, T_SEP); // then another key-less separator is registered after it
+        }
+        else
+            register_tag(convert, "\n", 1, T_SEP); // third argument is 1 here but 0 in the first case - presumably the is_gt_field flag, TODO confirm against register_tag
+    }
+}
+
 int convert_set_option(convert_t *convert, enum convert_option opt, ...)
 {
     int ret = 0;
@@ -1692,6 +1748,10 @@ int convert_set_option(convert_t *convert, enum convert_option opt, ...)
         case subset_samples:
             convert->subset_samples = va_arg(args, uint8_t**);
             break;
+        case force_newline:
+            convert->force_newline = va_arg(args, int);
+            if ( convert->force_newline ) force_newline_(convert);
+            break;
         default:
             ret = -1;
     }
diff --git a/convert.h b/convert.h
index 5bbbc2cde..062607093 100644
--- a/convert.h
+++ b/convert.h
@@ -1,6 +1,6 @@
 /*  convert.h -- functions for converting between VCF/BCF and related formats.
 
-    Copyright (C) 2014-2021 Genome Research Ltd.
+    Copyright (C) 2014-2023 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -32,6 +32,7 @@ enum convert_option
 {
     allow_undef_tags,
     subset_samples,
+    force_newline,
 };
 
 convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char *str);
diff --git a/csq.c b/csq.c
index 49812d4de..f619e061a 100644
--- a/csq.c
+++ b/csq.c
@@ -35,7 +35,7 @@
     Read about transcript types here
         http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html
         http://www.ensembl.org/info/genome/variation/predicted_data.html
-        http://www.gencodegenes.org/gencode_biotypes.html
+        https://www.gencodegenes.org/pages/biotypes.html
 
     List of supported biotypes
         antisense
@@ -45,6 +45,7 @@
         IG_LV_gene
         IG_V_gene
         lincRNA
+        lncRNA      .. generic term for 3prime_overlapping_ncRNA, antisense, bidirectional_promoter_lncRNA, lincRNA, macro_lncRNA, non_coding, processed_transcript, sense_intronic, sense_overlapping
         macro_lncRNA
         miRNA
         misc_RNA
@@ -52,7 +53,7 @@
         Mt_tRNA
         polymorphic_pseudogene
         processed_transcript
-        protein_coding
+        protein_coding, mRNA
         ribozyme
         rRNA
         sRNA
@@ -144,6 +145,7 @@
 #include <htslib/khash_str2int.h>
 #include <htslib/kseq.h>
 #include <htslib/faidx.h>
+#include <htslib/bgzf.h>
 #include <errno.h>
 #include <unistd.h>
 #include <ctype.h>
@@ -153,6 +155,7 @@
 #include "kheap.h"
 #include "smpl_ilist.h"
 #include "rbuf.h"
+#include "gff.h"
 
 #ifndef __FUNCTION__
 #  define __FUNCTION__ __func__
@@ -162,20 +165,8 @@
 #define FLT_INCLUDE 1
 #define FLT_EXCLUDE 2
 
-// Definition of splice_region, splice_acceptor and splice_donor
-#define N_SPLICE_DONOR         2
-#define N_SPLICE_REGION_EXON   3
-#define N_SPLICE_REGION_INTRON 8
-
 #define N_REF_PAD 10    // number of bases to avoid boundary effects
 
-#define STRAND_REV 0
-#define STRAND_FWD 1
-
-#define TRIM_NONE   0
-#define TRIM_5PRIME 1
-#define TRIM_3PRIME 2
-
 // How to treat phased/unphased genotypes
 #define PHASE_REQUIRE 0     // --phase r
 #define PHASE_MERGE   1     // --phase m
@@ -223,6 +214,7 @@
 
 #define CSQ_PRN_STRAND(csq)     ((csq)&CSQ_COMPOUND && !((csq)&(CSQ_SPLICE_ACCEPTOR|CSQ_SPLICE_DONOR|CSQ_SPLICE_REGION)))
 #define CSQ_PRN_TSCRIPT         (~(CSQ_INTRON|CSQ_NON_CODING))
+#define CSQ_PRN_NMD             (~(CSQ_INTRON|CSQ_NON_CODING))
 #define CSQ_PRN_BIOTYPE         CSQ_NON_CODING
 
 // see kput_vcsq()
@@ -254,119 +246,6 @@ const char *csq_strings[] =
     "start_retained"
 };
 
-
-// GFF line types
-#define GFF_UNKN_LINE    0
-#define GFF_TSCRIPT_LINE 1
-#define GFF_GENE_LINE    2
-
-
-/*
-    Genomic features, for fast lookup by position to overlapping features
-*/
-#define GF_coding_bit 6
-#define GF_is_coding(x) ((x) & (1<<GF_coding_bit))
-#define GF_MT_rRNA                       1                      // non-coding: 1, 2, ...
-#define GF_MT_tRNA                       2
-#define GF_lincRNA                       3
-#define GF_miRNA                         4
-#define GF_MISC_RNA                      5
-#define GF_rRNA                          6
-#define GF_snRNA                         7
-#define GF_snoRNA                        8
-#define GF_PROCESSED_TRANSCRIPT          9
-#define GF_ANTISENSE                    10
-#define GF_macro_lncRNA                 11
-#define GF_ribozyme                     12
-#define GF_sRNA                         13
-#define GF_scRNA                        14
-#define GF_scaRNA                       15
-#define GF_SENSE_INTRONIC               16
-#define GF_SENSE_OVERLAPPING            17
-#define GF_PSEUDOGENE                   18
-#define GF_PROCESSED_PSEUDOGENE         19
-#define GF_ARTIFACT                     20
-#define GF_IG_PSEUDOGENE                21
-#define GF_IG_C_PSEUDOGENE              22
-#define GF_IG_J_PSEUDOGENE              23
-#define GF_IG_V_PSEUDOGENE              24
-#define GF_TR_V_PSEUDOGENE              25
-#define GF_TR_J_PSEUDOGENE              26
-#define GF_MT_tRNA_PSEUDOGENE           27
-#define GF_misc_RNA_PSEUDOGENE          28
-#define GF_miRNA_PSEUDOGENE             29
-#define GF_RIBOZYME                     30
-#define GF_RETAINED_INTRON              31
-#define GF_RETROTRANSPOSED              32
-#define GF_tRNA_PSEUDOGENE              33
-#define GF_TRANSCRIBED_PROCESSED_PSEUDOGENE     34
-#define GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE   35
-#define GF_TRANSCRIBED_UNITARY_PSEUDOGENE       36
-#define GF_TRANSLATED_UNPROCESSED_PSEUDOGENE    37
-#define GF_TRANSLATED_PROCESSED_PSEUDOGENE      38
-#define GF_KNOWN_NCRNA                          39
-#define GF_UNITARY_PSEUDOGENE                   40
-#define GF_UNPROCESSED_PSEUDOGENE               41
-#define GF_LRG_GENE                             42
-#define GF_3PRIME_OVERLAPPING_ncRNA             43
-#define GF_DISRUPTED_DOMAIN                     44
-#define GF_vaultRNA                             45
-#define GF_BIDIRECTIONAL_PROMOTER_lncRNA        46
-#define GF_AMBIGUOUS_ORF                        47
-#define GF_PROTEIN_CODING               (1|(1<<GF_coding_bit))  // coding: 65, 66, ...
-#define GF_POLYMORPHIC_PSEUDOGENE       (2|(1<<GF_coding_bit))
-#define GF_IG_C                         (3|(1<<GF_coding_bit))
-#define GF_IG_D                         (4|(1<<GF_coding_bit))
-#define GF_IG_J                         (5|(1<<GF_coding_bit))
-#define GF_IG_LV                        (6|(1<<GF_coding_bit))
-#define GF_IG_V                         (7|(1<<GF_coding_bit))
-#define GF_TR_C                         (8|(1<<GF_coding_bit))
-#define GF_TR_D                         (9|(1<<GF_coding_bit))
-#define GF_TR_J                        (10|(1<<GF_coding_bit))
-#define GF_TR_V                        (11|(1<<GF_coding_bit))
-#define GF_NMD                         (12|(1<<GF_coding_bit))
-#define GF_NON_STOP_DECAY              (13|(1<<GF_coding_bit))
-#define GF_CDS      ((1<<(GF_coding_bit+1))+1)                  // special types: 129, 130, ...
-#define GF_EXON     ((1<<(GF_coding_bit+1))+2)
-#define GF_UTR3     ((1<<(GF_coding_bit+1))+3)
-#define GF_UTR5     ((1<<(GF_coding_bit+1))+4)
-// GF_MAX = (1<<30)-1, see hap_node_t
-
-#define CDS_PHASE_UNKN 3
-typedef struct _tscript_t tscript_t;
-typedef struct
-{
-    tscript_t *tr;      // transcript
-    uint32_t beg;       // the start coordinate of the CDS (on the reference strand, 0-based)
-    uint32_t pos;       // 0-based index of the first exon base within the transcript (only to
-                        //  update hap_node_t.sbeg in hap_init, could be calculated on the fly)
-    uint32_t len;       // exon length
-    uint32_t icds:30,   // exon index within the transcript
-             phase:2;   // offset of the CDS: 0,1,2 or 3 for unknown
-}
-gf_cds_t;
-typedef struct
-{
-    char *name;           // human readable name, e.g. ORF45
-    uint32_t iseq;
-}
-gf_gene_t;
-typedef struct
-{
-    uint32_t beg,end;
-    tscript_t *tr;
-}
-gf_exon_t;
-typedef enum { prime3, prime5 } utr_t;
-typedef struct
-{
-    utr_t which;
-    uint32_t beg,end;
-    tscript_t *tr;
-}
-gf_utr_t;
-
-
 /*
     Structures related to VCF output:
 
@@ -459,28 +338,21 @@ struct _hap_node_t
     csq_t *csq_list;            // list of haplotype's consequences, broken by position (each corresponds to a VCF record)
     int ncsq_list, mcsq_list;
 };
-struct _tscript_t
+#define TSCRIPT_AUX(x) ((tscript_t*)(x)->aux)
+typedef struct
 {
-    uint32_t id;        // transcript id
-    uint32_t beg,end;   // transcript's beg and end coordinate (ref strand, 0-based, inclusive)
-    uint32_t strand:1,  // STRAND_REV or STRAND_FWD
-             ncds:31,   // number of exons
-             mcds;
-    gf_cds_t **cds;     // ordered list of exons
     char *ref;          // reference sequence, padded with N_REF_PAD bases on both ends
     char *sref;         // spliced reference sequence, padded with N_REF_PAD bases on both ends
     hap_node_t *root;   // root of the haplotype tree
     hap_node_t **hap;   // pointer to haplotype leaves, two for each sample
     int nhap, nsref;    // number of haplotypes and length of sref, including 2*N_REF_PAD
-    uint32_t trim:2,    // complete, 5' or 3' trimmed, see TRIM_* types
-             type:30;   // one of GF_* types
-    gf_gene_t *gene;
-};
-static inline int cmp_tscript(tscript_t **a, tscript_t **b)
+}
+tscript_t;
+static inline int cmp_tscript(gf_tscript_t **a, gf_tscript_t **b)
 {
     return ( (*a)->end  < (*b)->end ) ? 1 : 0;
 }
-KHEAP_INIT(trhp, tscript_t*, cmp_tscript)
+KHEAP_INIT(trhp, gf_tscript_t*, cmp_tscript)
 typedef khp_trhp_t tr_heap_t;
 typedef struct
 {
@@ -494,7 +366,7 @@ typedef struct
 {
     int mstack;
     hstack_t *stack;
-    tscript_t *tr;      // tr->ref: spliced transcript on ref strand
+    gf_tscript_t *tr;   // tr->ref: spliced transcript on ref strand
     kstring_t sseq;     // spliced haplotype sequence on ref strand
     kstring_t tseq;     // the variable part of translated haplotype transcript, coding strand
     kstring_t tref;     // the variable part of translated reference transcript, coding strand
@@ -503,77 +375,20 @@ typedef struct
 }
 hap_t;
 
-
-/*
-    Helper structures, only for initialization
-
-    ftr_t
-        temporary list of all exons, CDS, UTRs
-*/
-KHASH_MAP_INIT_INT(int2tscript, tscript_t*)
-KHASH_MAP_INIT_INT(int2gene, gf_gene_t*)
-typedef struct
-{
-    int type;       // GF_CDS, GF_EXON, GF_5UTR, GF_3UTR
-    uint32_t beg;
-    uint32_t end;
-    uint32_t trid;
-    uint32_t strand:1;   // STRAND_REV,STRAND_FWD
-    uint32_t phase:2;    // 0, 1, 2, or 3 for unknown
-    uint32_t iseq:29;
-}
-ftr_t;
-/*
-    Mapping from GFF ID string (such as ENST00000450305 or Zm00001d027230_P001)
-    to integer id.  To keep the memory requirements low, the original version
-    relied on IDs in the form of a string prefix and a numerical id.  However,
-    it turns out that this assumption is not valid for some ensembl GFFs, see
-    for example Zea_mays.AGPv4.36.gff3.gz
- */
-typedef struct
-{
-    void *str2id;       // khash_str2int
-    int nstr, mstr;
-    char **str;         // numeric id to string
-}
-id_tbl_t;
-typedef struct
-{
-    // all exons, CDS, UTRs
-    ftr_t *ftr;
-    int nftr, mftr;
-
-    // mapping from gene id to gf_gene_t
-    kh_int2gene_t *gid2gene;
-
-    // mapping from transcript id to tscript, for quick CDS anchoring
-    kh_int2tscript_t *id2tr;
-
-    // sequences
-    void *seq2int;  // str2int hash
-    char **seq;
-    int nseq, mseq;
-
-    // ignored biotypes
-    void *ignored_biotypes;
-
-    id_tbl_t gene_ids;   // temporary table for mapping between gene id (eg. Zm00001d027245) and a numeric idx
-}
-aux_t;
-
 typedef struct _args_t
 {
     // the main regidx lookups, from chr:beg-end to overlapping features and
     // index iterator
+    gff_t *gff;
     regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript;
     regitr_t *itr;
 
-    // temporary structures, deleted after initializtion
-    aux_t init;
-
     // text tab-delimited output (out) or vcf/bcf output (out_fh)
     FILE *out;
     htsFile *out_fh;
+    char *index_fn;
+    int write_index;
+    char *dump_gff;
 
     // vcf
     bcf_srs_t *sr;
@@ -597,6 +412,13 @@ typedef struct _args_t
     int ncsq2_max, nfmt_bcsq;   // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values)
     int ncsq2_small_warned;
     int brief_predictions;
+    int unify_chr_names;
+    char *chr_name;
+    int mchr_name;
+    struct {
+        int unknown_chr,unknown_tscript_biotype,unknown_strand,unknown_phase,duplicate_id;
+        int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds;
+    } warned;
 
     int rid;                    // current chromosome
     tr_heap_t *active_tr;       // heap of active transcripts for quick flushing
@@ -604,11 +426,10 @@ typedef struct _args_t
     vbuf_t **vcf_buf;           // buffered VCF lines to annotate with CSQ and flush
     rbuf_t vcf_rbuf;            // round buffer indexes to vcf_buf
     kh_pos2vbuf_t *pos2vbuf;    // fast lookup of buffered lines by position
-    tscript_t **rm_tr;          // buffer of transcripts to clean
+    gf_tscript_t **rm_tr;       // buffer of transcripts to clean
     int nrm_tr, mrm_tr;
     csq_t *csq_buf;             // pool of csq not managed by hap_node_t, i.e. non-CDS csqs
     int ncsq_buf, mcsq_buf;
-    id_tbl_t tscript_ids;       // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx
     int force;                  // force run under various conditions. Currently only to skip out-of-phase transcripts
     int n_threads;              // extra compression/decompression threads
 
@@ -645,818 +466,6 @@ const uint8_t cnt4[] =
 #define dna2aa(x)  gencode[  nt4[(uint8_t)(x)[0]]<<4 |  nt4[(uint8_t)(x)[1]]<<2 |  nt4[(uint8_t)(x)[2]] ]
 #define cdna2aa(x) gencode[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ]
 
-static const char *gf_strings_noncoding[] =
-{
-    "MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript",
-    "antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping",
-    "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene",
-    "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene",
-    "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene",
-    "transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene",    "translated_unprocessed_pseudogene",
-    "translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene",
-    "LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", "vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf"
-};
-static const char *gf_strings_coding[] = { "protein_coding", "polymorphic_pseudogene", "IG_C", "IG_D", "IG_J", "IG_LV", "IG_V", "TR_C", "TR_D", "TR_J", "TR_V", "NMD", "non_stop_decay"};
-static const char *gf_strings_special[] = { "CDS", "exon", "3_prime_UTR", "5_prime_UTR" };
-
-const char *gf_type2gff_string(int type)
-{
-    if ( !GF_is_coding(type) )
-    {
-        if ( type < (1<<GF_coding_bit) ) return gf_strings_noncoding[type-1];
-        type &= (1<<(GF_coding_bit+1)) - 1;
-        return gf_strings_special[type - 1];
-    }
-    type &= (1<<GF_coding_bit) - 1;
-    return gf_strings_coding[type - 1];
-}
-
-/*
-    gff parsing functions
-*/
-static inline int feature_set_seq(args_t *args, char *chr_beg, char *chr_end)
-{
-    aux_t *aux = &args->init;
-    char c = chr_end[1];
-    chr_end[1] = 0;
-    int iseq;
-    if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 )
-    {
-        // check for possible mismatch in chromosome naming convention such as chrX vs X
-        char *new_chr = NULL;
-        if ( faidx_has_seq(args->fai,chr_beg) )
-            new_chr = strdup(chr_beg);                  // valid chr name, the same in gff and faidx
-        else
-        {
-            int len = strlen(chr_beg);
-            if ( !strncmp("chr",chr_beg,3) && len>3 )
-                new_chr = strdup(chr_beg+3);            // gff has the prefix, faidx does not
-            else
-            {
-                new_chr = malloc(len+4);                // gff does not have the prefix, faidx has
-                memcpy(new_chr,"chr",3);
-                memcpy(new_chr+3,chr_beg,len);
-                new_chr[len+3] = 0;
-            }
-            if ( !faidx_has_seq(args->fai,new_chr) )    // modification did not help, this sequence is not in fai
-            {
-                static int unkwn_chr_warned = 0;
-                if ( !unkwn_chr_warned && args->verbosity>0 )
-                    fprintf(stderr,"Warning: GFF chromosome \"%s\" not part of the reference genome\n",chr_beg);
-                unkwn_chr_warned = 1;
-                free(new_chr);
-                new_chr = strdup(chr_beg);              // use the original sequence name
-            }
-        }
-        if ( khash_str2int_get(aux->seq2int, new_chr, &iseq)!=0 )
-        {
-            hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
-            aux->seq[aux->nseq] = new_chr;
-            iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]);
-            aux->nseq++;
-            assert( aux->nseq < 1<<29 );  // see gf_gene_t.iseq and ftr_t.iseq
-        }
-        else
-            free(new_chr);
-    }
-    chr_end[1] = c;
-    return iseq;
-}
-static inline char *gff_skip(const char *line, char *ss)
-{
-    while ( *ss && *ss!='\t' ) ss++;
-    if ( !*ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
-    return ss+1;
-}
-static inline void gff_parse_chr(const char *line, char **chr_beg, char **chr_end)
-{
-    char *se = (char*) line;
-    while ( *se && *se!='\t' ) se++;
-    if ( !*se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
-    *chr_beg = (char*) line;
-    *chr_end = se-1;
-}
-static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg, uint32_t *end)
-{
-    char *se = ss;
-    *beg = strtol(ss, &se, 10) - 1;
-    if ( ss==se ) error("[%s:%d %s] Could not parse the line:\n\t%s\n\t%s\n",__FILE__,__LINE__,__FUNCTION__,line,ss);
-    ss = se+1;
-    *end = strtol(ss, &se, 10) - 1;
-    if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
-    return se+1;
-}
-static void gff_id_init(id_tbl_t *tbl)
-{
-    memset(tbl, 0, sizeof(*tbl));
-    tbl->str2id = khash_str2int_init();
-}
-static void gff_id_destroy(id_tbl_t *tbl)
-{
-    khash_str2int_destroy_free(tbl->str2id);
-    free(tbl->str);
-}
-// returns 0 on success, -1 on failure
-static inline int gff_id_parse(id_tbl_t *tbl, const char *needle, char *ss, uint32_t *id_ptr)
-{
-    ss = strstr(ss,needle);     // e.g. "ID=transcript:"
-    if ( !ss ) return -1;
-    ss += strlen(needle);
-
-    char *se = ss;
-    while ( *se && *se!=';' && !isspace(*se) ) se++;
-    char tmp = *se;
-    *se = 0;
-
-    int id;
-    if ( khash_str2int_get(tbl->str2id, ss, &id) < 0 )
-    {
-        id = tbl->nstr++;
-        hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str);
-        tbl->str[id] = strdup(ss);
-        khash_str2int_set(tbl->str2id, tbl->str[id], id);
-    }
-    *se = tmp;
-    *id_ptr = id;
-    return 0;
-}
-static inline int gff_parse_type(char *line)
-{
-    line = strstr(line,"ID=");
-    if ( !line ) return -1;
-    line += 3;
-    if ( !strncmp(line,"transcript:",11) ) return GFF_TSCRIPT_LINE;
-    else if ( !strncmp(line,"gene:",5) ) return GFF_GENE_LINE;
-    return -1;
-}
-static inline int gff_parse_biotype(char *_line)
-{
-    char *line = strstr(_line,"biotype=");
-    if ( !line ) return -1;
-
-    line += 8;
-    switch (*line)
-    {
-        case 'p':
-            if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING;
-            else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE;
-            else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT;
-            else if ( !strncmp(line,"processed_pseudogene",20) ) return GF_PROCESSED_PSEUDOGENE;
-            else if ( !strncmp(line,"polymorphic_pseudogene",22) ) return GF_POLYMORPHIC_PSEUDOGENE;
-            break;
-        case 'a':
-            if ( !strncmp(line,"artifact",8) ) return GF_ARTIFACT;
-            else if ( !strncmp(line,"antisense",9) ) return GF_ANTISENSE;
-            else if ( !strncmp(line,"ambiguous_orf",13) ) return GF_AMBIGUOUS_ORF;
-            break;
-        case 'I':
-            if ( !strncmp(line,"IG_C_gene",9) ) return GF_IG_C;
-            else if ( !strncmp(line,"IG_D_gene",9) ) return GF_IG_D;
-            else if ( !strncmp(line,"IG_J_gene",9) ) return GF_IG_J;
-            else if ( !strncmp(line,"IG_LV_gene",10) ) return GF_IG_LV;
-            else if ( !strncmp(line,"IG_V_gene",9) ) return GF_IG_V;
-            else if ( !strncmp(line,"IG_pseudogene",13) ) return GF_IG_PSEUDOGENE;
-            else if ( !strncmp(line,"IG_C_pseudogene",15) ) return GF_IG_C_PSEUDOGENE;
-            else if ( !strncmp(line,"IG_J_pseudogene",15) ) return GF_IG_J_PSEUDOGENE;
-            else if ( !strncmp(line,"IG_V_pseudogene",15) ) return GF_IG_V_PSEUDOGENE;
-            break;
-        case 'T':
-            if ( !strncmp(line,"TR_C_gene",9) ) return GF_TR_C;
-            else if ( !strncmp(line,"TR_D_gene",9) ) return GF_TR_D;
-            else if ( !strncmp(line,"TR_J_gene",9) ) return GF_TR_J;
-            else if ( !strncmp(line,"TR_V_gene",9) ) return GF_TR_V;
-            else if ( !strncmp(line,"TR_V_pseudogene",15) ) return GF_TR_V_PSEUDOGENE;
-            else if ( !strncmp(line,"TR_J_pseudogene",15) ) return GF_TR_J_PSEUDOGENE;
-            break;
-        case 'M':
-            if ( !strncmp(line,"Mt_tRNA_pseudogene",18) ) return GF_MT_tRNA_PSEUDOGENE;
-            else if ( !strncmp(line,"Mt_tRNA",7) ) return GF_MT_tRNA;
-            else if ( !strncmp(line,"Mt_rRNA",7) ) return GF_MT_tRNA;
-            break;
-        case 'l':
-            if ( !strncmp(line,"lincRNA",7) ) return GF_lincRNA;
-            break;
-        case 'm':
-            if ( !strncmp(line,"macro_lncRNA",12) ) return GF_macro_lncRNA;
-            else if ( !strncmp(line,"misc_RNA_pseudogene",19) ) return GF_misc_RNA_PSEUDOGENE;
-            else if ( !strncmp(line,"miRNA_pseudogene",16) ) return GF_miRNA_PSEUDOGENE;
-            else if ( !strncmp(line,"miRNA",5) ) return GF_miRNA;
-            else if ( !strncmp(line,"misc_RNA",8) ) return GF_MISC_RNA;
-            break;
-        case 'r':
-            if ( !strncmp(line,"rRNA",4) ) return GF_rRNA;
-            else if ( !strncmp(line,"ribozyme",8) ) return GF_RIBOZYME;
-            else if ( !strncmp(line,"retained_intron",15) ) return GF_RETAINED_INTRON;
-            else if ( !strncmp(line,"retrotransposed",15) ) return GF_RETROTRANSPOSED;
-            break;
-        case 's':
-            if ( !strncmp(line,"snRNA",5) ) return GF_snRNA;
-            else if ( !strncmp(line,"sRNA",4) ) return GF_sRNA;
-            else if ( !strncmp(line,"scRNA",5) ) return GF_scRNA;
-            else if ( !strncmp(line,"scaRNA",6) ) return GF_scaRNA;
-            else if ( !strncmp(line,"snoRNA",6) ) return GF_snoRNA;
-            else if ( !strncmp(line,"sense_intronic",14) ) return GF_SENSE_INTRONIC;
-            else if ( !strncmp(line,"sense_overlapping",17) ) return GF_SENSE_OVERLAPPING;
-            break;
-        case 't':
-            if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE;
-            else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE;
-            else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE;
-            else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE;
-            else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE;
-            else if ( !strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE;
-            break;
-        case 'n':
-            if ( !strncmp(line,"nonsense_mediated_decay",23) ) return GF_NMD;
-            else if ( !strncmp(line,"non_stop_decay",14) ) return GF_NON_STOP_DECAY;
-            break;
-        case 'k':
-            if ( !strncmp(line,"known_ncrna",11) ) return GF_KNOWN_NCRNA;
-            break;
-        case 'u':
-            if ( !strncmp(line,"unitary_pseudogene",18) ) return GF_UNITARY_PSEUDOGENE;
-            else if ( !strncmp(line,"unprocessed_pseudogene",22) ) return GF_UNPROCESSED_PSEUDOGENE;
-            break;
-        case 'L':
-            if ( !strncmp(line,"LRG_gene",8) ) return GF_LRG_GENE;
-            break;
-        case '3':
-            if ( !strncmp(line,"3prime_overlapping_ncRNA",24) ) return GF_3PRIME_OVERLAPPING_ncRNA;
-            break;
-        case 'd':
-            if ( !strncmp(line,"disrupted_domain",16) ) return GF_DISRUPTED_DOMAIN;
-            break;
-        case 'v':
-            if ( !strncmp(line,"vaultRNA",8) ) return GF_vaultRNA;
-            break;
-        case 'b':
-            if ( !strncmp(line,"bidirectional_promoter_lncRNA",29) ) return GF_BIDIRECTIONAL_PROMOTER_lncRNA;
-            break;
-    }
-    return 0;
-}
-static inline int gff_ignored_biotype(args_t *args, char *ss)
-{
-    ss = strstr(ss,"biotype=");
-    if ( !ss ) return 0;
-
-    ss += 8;
-    char *se = ss, tmp;
-    while ( *se && *se!=';' ) se++;
-    tmp = *se;
-    *se = 0;
-
-    char *key = ss;
-    int n = 0;
-    if ( khash_str2int_get(args->init.ignored_biotypes, ss, &n)!=0 ) key = strdup(ss);
-    khash_str2int_set(args->init.ignored_biotypes, key, n+1);
-
-    *se = tmp;
-    return 1;
-}
-gf_gene_t *gene_init(aux_t *aux, uint32_t gene_id)
-{
-    khint_t k = kh_get(int2gene, aux->gid2gene, (int)gene_id);
-    gf_gene_t *gene = (k == kh_end(aux->gid2gene)) ? NULL : kh_val(aux->gid2gene, k);
-    if ( !gene )
-    {
-        gene = (gf_gene_t*) calloc(1,sizeof(gf_gene_t));
-        int ret;
-        k = kh_put(int2gene, aux->gid2gene, (int)gene_id, &ret);
-        kh_val(aux->gid2gene,k) = gene;
-    }
-    return gene;
-}
-void gff_parse_transcript(args_t *args, const char *line, char *ss, ftr_t *ftr)
-{
-    aux_t *aux = &args->init;
-    int biotype = gff_parse_biotype(ss);
-    if ( biotype <= 0 )
-    {
-        if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(stderr,"ignored transcript, unknown biotype: %s\n",line);
-        return;
-    }
-
-    // create a mapping from transcript_id to gene_id
-    uint32_t trid, gene_id;
-    if ( gff_id_parse(&args->tscript_ids, "ID=transcript:", ss, &trid) )
-    {
-        if ( gff_id_parse(&args->tscript_ids, "ID=", ss, &trid) )
-            error("[%s:%d %s] Could not parse the line, neither \"ID=transcript:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
-        static int warned = 0;
-        if ( !warned && args->verbosity > 0 )
-        {
-            fprintf(stderr,"Warning: non-standard transcript ID notation in the GFF, expected \"ID=transcript:XXX\", found %s\n",line);
-            warned = 1;
-        }
-    }
-    if ( gff_id_parse(&args->init.gene_ids, "Parent=gene:", ss, &gene_id) )
-    {
-        if ( gff_id_parse(&args->init.gene_ids, "Parent=", ss, &gene_id) )
-            error("[%s:%d %s] Could not parse the line, neither \"Parent=gene:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
-        static int warned = 0;
-        if ( !warned && args->verbosity > 0 )
-        {
-            fprintf(stderr,"Warning: non-standard transcript Parent notation in the GFF, expected \"Parent=gene:XXX\", found %s\n",line);
-            warned = 1;
-        }
-    }
-
-    tscript_t *tr = (tscript_t*) calloc(1,sizeof(tscript_t));
-    tr->id     = trid;
-    tr->strand = ftr->strand;
-    tr->gene   = gene_init(aux, gene_id);
-    tr->type   = biotype;
-    tr->beg    = ftr->beg;
-    tr->end    = ftr->end;
-
-    khint_t k;
-    int ret;
-    k = kh_put(int2tscript, aux->id2tr, (int)trid, &ret);
-    kh_val(aux->id2tr,k) = tr;
-}
-void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, char *chr_end, ftr_t *ftr)
-{
-    int biotype = gff_parse_biotype(ss);
-    if ( biotype <= 0 )
-    {
-        if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(stderr,"ignored gene, unknown biotype: %s\n",line);
-        return;
-    }
-
-    aux_t *aux = &args->init;
-
-    // substring search for "ID=gene:ENSG00000437963"
-    uint32_t gene_id;
-    if ( gff_id_parse(&aux->gene_ids, "ID=gene:", ss, &gene_id) )
-    {
-        if ( gff_id_parse(&aux->gene_ids, "ID=", ss, &gene_id) )
-            error("[%s:%d %s] Could not parse the line, neither \"ID=gene:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
-        static int warned = 0;
-        if ( !warned && args->verbosity > 0 )
-        {
-            fprintf(stderr,"Warning: non-standard gene ID notation in the GFF, expected \"ID=gene:XXX\", found %s\n",line);
-            warned = 1;
-        }
-    }
-
-    gf_gene_t *gene = gene_init(aux, gene_id);
-    assert( !gene->name );      // the gene_id should be unique
-
-    gene->iseq = feature_set_seq(args, chr_beg,chr_end);
-
-    // substring search for "Name=OR4F5"
-    ss = strstr(chr_end+2,"Name=");
-    if ( ss )
-    {
-        ss += 5;
-        char *se = ss;
-        while ( *se && *se!=';' && !isspace(*se) ) se++;
-        gene->name = (char*) malloc(se-ss+1);
-        memcpy(gene->name,ss,se-ss);
-        gene->name[se-ss] = 0;
-    }
-    else
-        gene->name = strdup(aux->gene_ids.str[gene_id]); // Name=<GeneName> field is not present, use the gene ID instead
-}
-int gff_parse(args_t *args, char *line, ftr_t *ftr)
-{
-    // - skip empty lines and commented lines
-    // - columns
-    //      1.      chr
-    //      2.      <skip>
-    //      3.      CDS, transcript, gene, ...
-    //      4-5.    beg,end
-    //      6.      <skip>
-    //      7.      strand
-    //      8.      phase
-    //      9.      Parent=transcript:ENST(\d+);ID=... etc
-
-    char *ss = line;
-    if ( !*ss ) return -1;      // skip blank lines
-    if ( *ss=='#' ) return -1;  // skip comments
-
-    char *chr_beg, *chr_end;
-    gff_parse_chr(line, &chr_beg, &chr_end);
-    ss = gff_skip(line, chr_end + 2);
-
-    // 3. column: is this a CDS, transcript, gene, etc.
-    if ( !strncmp("exon\t",ss,5) ) { ftr->type = GF_EXON; ss += 5; }
-    else if ( !strncmp("CDS\t",ss,4) ) { ftr->type = GF_CDS; ss += 4; }
-    else if ( !strncmp("three_prime_UTR\t",ss,16) ) { ftr->type = GF_UTR3; ss += 16; }
-    else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; }
-    else
-    {
-        int type = GFF_UNKN_LINE;
-        if ( !strncmp("gene\t",ss,4) ) type = GFF_GENE_LINE;
-        else if ( !strncmp("transcript\t",ss,4) ) type = GFF_TSCRIPT_LINE;
-        ss = gff_skip(line, ss);
-        ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
-        ss = gff_skip(line, ss);
-        if ( type==GFF_UNKN_LINE ) type = gff_parse_type(ss);   // determine type from ID=transcript: or ID=gene:
-        if ( type!=GFF_TSCRIPT_LINE && type!=GFF_GENE_LINE )
-        {
-            // we ignore these, debug print to see new types:
-            ss = strstr(ss,"ID=");
-            if ( !ss ) return -1;   // no ID, ignore the line
-            if ( !strncmp("chromosome",ss+3,10) ) return -1;
-            if ( !strncmp("supercontig",ss+3,11) ) return -1;
-            if ( args->verbosity > 0 ) fprintf(stderr,"ignored: %s\n", line);
-            return -1;
-        }
-
-        // 7. column: strand
-        if ( *ss == '+' ) ftr->strand = STRAND_FWD;
-        else if ( *ss == '-' ) ftr->strand = STRAND_REV;
-        else error("Unknown strand: %c .. %s\n", *ss,ss);
-
-        if ( type==GFF_TSCRIPT_LINE )
-            gff_parse_transcript(args, line, ss, ftr);
-        else
-            gff_parse_gene(args, line, ss, chr_beg, chr_end, ftr);
-
-        return -1;
-    }
-    ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
-    ss = gff_skip(line, ss);
-
-    // 7. column: strand
-    if ( *ss == '+' ) ftr->strand = STRAND_FWD;
-    else if ( *ss == '-' ) ftr->strand = STRAND_REV;
-    else { if ( args->verbosity > 0 ) fprintf(stderr,"Skipping unknown strand: %c\n", *ss); return -1; }
-    ss += 2;
-
-    // 8. column: phase (codon offset)
-    if ( *ss == '0' ) ftr->phase = 0;
-    else if ( *ss == '1' ) ftr->phase = 1;
-    else if ( *ss == '2' ) ftr->phase = 2;
-    else if ( *ss == '.' ) ftr->phase = CDS_PHASE_UNKN;     // exons and even CDS in some GFFs do not have phase
-    else { if ( args->verbosity > 0 ) fprintf(stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; }
-    ss += 2;
-
-    // substring search for "Parent=transcript:ENST00000437963"
-    if ( gff_id_parse(&args->tscript_ids, "Parent=transcript:", ss, &ftr->trid) )
-    {
-        if ( gff_id_parse(&args->tscript_ids, "Parent=", ss, &ftr->trid) )
-            error("[%s:%d %s] Could not parse the line, neither \"Parent=transcript:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
-        static int warned = 0;
-        if ( !warned && args->verbosity > 0 )
-        {
-            fprintf(stderr,"Warning: non-standard gene Parent notation in the GFF, expected \"Parent=transcript:XXX\", found %s\n",line);
-            warned = 1;
-        }
-    }
-
-    ftr->iseq = feature_set_seq(args, chr_beg,chr_end);
-    return 0;
-}
-
-static int cmp_cds_ptr(const void *a, const void *b)
-{
-    // comparison function for qsort of transcripts's CDS
-    if ( (*((gf_cds_t**)a))->beg < (*((gf_cds_t**)b))->beg ) return -1;
-    if ( (*((gf_cds_t**)a))->beg > (*((gf_cds_t**)b))->beg ) return 1;
-    return 0;
-}
-
-static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char **chr_end)
-{
-    *chr_beg = *chr_end = aux->seq[iseq];
-    while ( (*chr_end)[1] ) (*chr_end)++;
-}
-tscript_t *tscript_init(aux_t *aux, uint32_t trid)
-{
-    khint_t k = kh_get(int2tscript, aux->id2tr, (int)trid);
-    tscript_t *tr = (k == kh_end(aux->id2tr)) ? NULL : kh_val(aux->id2tr, k);
-    assert( tr );
-    return tr;
-}
-void register_cds(args_t *args, ftr_t *ftr)
-{
-    // Make the CDS searchable via idx_cds. Note we do not malloc tr->cds just yet.
-    //  ftr is the result of parsing a gff CDS line
-    aux_t *aux = &args->init;
-
-    tscript_t *tr = tscript_init(aux, ftr->trid);
-    if ( tr->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. %d vs %d\n",ftr->trid,tr->strand,ftr->strand);
-
-    gf_cds_t *cds = (gf_cds_t*) malloc(sizeof(gf_cds_t));
-    cds->tr    = tr;
-    cds->beg   = ftr->beg;
-    cds->len   = ftr->end - ftr->beg + 1;
-    cds->icds  = 0;     // to keep valgrind on mac happy
-    cds->phase = ftr->phase;
-
-    hts_expand(gf_cds_t*,tr->ncds+1,tr->mcds,tr->cds);
-    tr->cds[tr->ncds++] = cds;
-}
-void register_utr(args_t *args, ftr_t *ftr)
-{
-    aux_t *aux = &args->init;
-    gf_utr_t *utr = (gf_utr_t*) malloc(sizeof(gf_utr_t));
-    utr->which = ftr->type==GF_UTR3 ? prime3 : prime5;
-    utr->beg   = ftr->beg;
-    utr->end   = ftr->end;
-    utr->tr    = tscript_init(aux, ftr->trid);
-
-    char *chr_beg, *chr_end;
-    chr_beg_end(&args->init, utr->tr->gene->iseq, &chr_beg, &chr_end);
-    regidx_push(args->idx_utr, chr_beg,chr_end, utr->beg,utr->end, &utr);
-}
-void register_exon(args_t *args, ftr_t *ftr)
-{
-    aux_t *aux = &args->init;
-    gf_exon_t *exon = (gf_exon_t*) malloc(sizeof(gf_exon_t));
-    exon->beg = ftr->beg;
-    exon->end = ftr->end;
-    exon->tr  = tscript_init(aux, ftr->trid);
-
-    char *chr_beg, *chr_end;
-    chr_beg_end(&args->init, exon->tr->gene->iseq, &chr_beg, &chr_end);
-    regidx_push(args->idx_exon, chr_beg,chr_end, exon->beg - N_SPLICE_REGION_INTRON, exon->end + N_SPLICE_REGION_INTRON, &exon);
-}
-
-void tscript_init_cds(args_t *args)
-{
-    aux_t *aux = &args->init;
-
-    // Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds)
-    khint_t k;
-    int warn_phase_unkn = 0;
-    for (k=0; k<kh_end(aux->id2tr); k++)
-    {
-        if ( !kh_exist(aux->id2tr, k) ) continue;
-        tscript_t *tr = (tscript_t*) kh_val(aux->id2tr, k);
-
-        // position-to-tscript lookup
-        char *chr_beg, *chr_end;
-        chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end);
-        regidx_push(args->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr);
-
-        if ( !tr->ncds ) continue;      // transcript with no CDS
-
-        // sort CDs
-        qsort(tr->cds, tr->ncds, sizeof(gf_cds_t*), cmp_cds_ptr);
-
-        // trim non-coding start
-        int i, len = 0;
-        if ( tr->strand==STRAND_FWD )
-        {
-            if ( tr->cds[0]->phase != CDS_PHASE_UNKN )
-            {
-                if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME;
-                tr->cds[0]->beg += tr->cds[0]->phase;
-                tr->cds[0]->len -= tr->cds[0]->phase;
-                tr->cds[0]->phase = 0;
-            }
-
-            // sanity check phase; the phase number in gff tells us how many bases to skip in this
-            // feature to reach the first base of the next codon
-            int tscript_ok = 1;
-            for (i=0; i<tr->ncds; i++)
-            {
-                if ( tr->cds[i]->phase == CDS_PHASE_UNKN )
-                {
-                    warn_phase_unkn = 1;
-                    len += tr->cds[i]->len;
-                    continue;
-                }
-                int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
-                if ( phase!=len%3 )
-                {
-                    if ( args->force )
-                    {
-                        if ( args->verbosity > 0 )
-                            fprintf(stderr,"Warning: the GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n",
-                                args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
-                        tscript_ok = 0;
-                        break;
-                    }
-                    error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
-                            args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
-                }
-                len += tr->cds[i]->len;
-            }
-            if ( !tscript_ok ) continue;    // skip this transcript
-        }
-        else
-        {
-            if ( tr->cds[tr->ncds-1]->phase != CDS_PHASE_UNKN )
-            {
-                // Check that the phase is not bigger than CDS length. Curiously, this can really happen,
-                // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141
-                // todo: the same for the fwd strand
-                i = tr->ncds - 1;
-                int phase = tr->cds[i]->phase;
-                if ( phase ) tr->trim |= TRIM_5PRIME;
-                while ( i>=0 && phase > tr->cds[i]->len )
-                {
-                    phase -= tr->cds[i]->len;
-                    tr->cds[i]->phase = 0;
-                    tr->cds[i]->len   = 0;
-                    i--;
-                }
-                tr->cds[i]->len  -= tr->cds[i]->phase;
-                tr->cds[i]->phase = 0;
-            }
-
-            // sanity check phase
-            int tscript_ok = 1;
-            for (i=tr->ncds-1; i>=0; i--)
-            {
-                if ( tr->cds[i]->phase == CDS_PHASE_UNKN )
-                {
-                    warn_phase_unkn = 1;
-                    len += tr->cds[i]->len;
-                    continue;
-                }
-                int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
-                if ( phase!=len%3)
-                {
-                    if ( args->force )
-                    {
-                        if ( args->verbosity > 0 )
-                            fprintf(stderr,"Warning: the GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n",
-                                args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
-                        tscript_ok = 0;
-                        break;
-                    }
-                    error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
-                        args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
-                }
-                len += tr->cds[i]->len;
-            }
-            if ( !tscript_ok ) continue;    // skip this transcript
-        }
-
-        // set len. At the same check that CDS within a transcript do not overlap
-        len = 0;
-        for (i=0; i<tr->ncds; i++)
-        {
-            tr->cds[i]->icds = i;
-            len += tr->cds[i]->len;
-            if ( !i ) continue;
-
-            gf_cds_t *a = tr->cds[i-1];
-            gf_cds_t *b = tr->cds[i];
-            if ( a->beg + a->len - 1 >= b->beg )
-            {
-                if ( args->force )
-                {
-                    fprintf(stderr,"Warning: GFF contains overlapping CDS %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32".\n",
-                        args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
-                }
-                else
-                    error("Error: CDS overlap in the transcript %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32", is this intended (e.g. ribosomal slippage)?\n"
-                          "       Use the --force option to override (at your own risk).\n",
-                            args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
-            }
-        }
-        if ( len%3 != 0 )
-        {
-            // There are 13k transcripts with incomplete 3' CDS. See for example ENST00000524289
-            //  http://sep2015.archive.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000155868;r=5:157138846-157159019;t=ENST00000524289
-            // Also, the incomplete CDS can be too short (1 or 2bp), so it is not enough to trim the last one.
-
-            tr->trim |= TRIM_3PRIME;
-            if ( tr->strand==STRAND_FWD )
-            {
-                i = tr->ncds - 1;
-                while ( i>=0 && len%3 )
-                {
-                    int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
-                    tr->cds[i]->len -= dlen;
-                    len -= dlen;
-                    i--;
-                }
-            }
-            else
-            {
-                i = 0;
-                while ( i<tr->ncds && len%3 )
-                {
-                    int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
-                    tr->cds[i]->len -= dlen;
-                    tr->cds[i]->beg += dlen;
-                    len -= dlen;
-                    i++;
-                }
-            }
-        }
-
-        // set CDS offsets and insert into regidx
-        len=0;
-        for (i=0; i<tr->ncds; i++)
-        {
-            tr->cds[i]->pos = len;
-            len += tr->cds[i]->len;
-            regidx_push(args->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]);
-        }
-    }
-    if ( warn_phase_unkn && args->verbosity > 0 )
-        fprintf(stderr,"Warning: encountered CDS with phase column unset, could not verify reading frame\n");
-}
-
-void regidx_free_gf(void *payload) { free(*((gf_cds_t**)payload)); }
-void regidx_free_tscript(void *payload) { tscript_t *tr = *((tscript_t**)payload); free(tr->cds); free(tr); }
-
-void init_gff(args_t *args)
-{
-    aux_t *aux = &args->init;
-    aux->seq2int   = khash_str2int_init();   // chrom's numeric id
-    aux->gid2gene  = kh_init(int2gene);      // gene id to gf_gene_t, for idx_gene
-    aux->id2tr     = kh_init(int2tscript);   // transcript id to tscript_t
-    args->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(tscript_t*), NULL);
-    aux->ignored_biotypes = khash_str2int_init();
-    gff_id_init(&aux->gene_ids);
-    gff_id_init(&args->tscript_ids);
-
-    // parse gff
-    kstring_t str = {0,0,0};
-    htsFile *fp = hts_open(args->gff_fname,"r");
-    if ( !fp ) error("Failed to read %s\n", args->gff_fname);
-    while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
-    {
-        hts_expand(ftr_t, aux->nftr+1, aux->mftr, aux->ftr);
-        int ret = gff_parse(args, str.s, aux->ftr + aux->nftr);
-        if ( !ret ) aux->nftr++;
-    }
-    free(str.s);
-    if ( hts_close(fp)!=0 ) error("Close failed: %s\n", args->gff_fname);
-
-
-    // process gff information: connect CDS and exons to transcripts
-    args->idx_cds  = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_cds_t*), NULL);
-    args->idx_utr  = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_utr_t*), NULL);
-    args->idx_exon = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_exon_t*), NULL);
-    args->itr      = regitr_init(NULL);
-
-    int i;
-    for (i=0; i<aux->nftr; i++)
-    {
-        ftr_t *ftr = &aux->ftr[i];
-
-        // check whether to keep this feature: is there a mapping trid -> gene_id -> gene?
-        khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid);
-        if ( k==kh_end(aux->id2tr) ) continue;       // no such transcript
-
-        tscript_t *tr = kh_val(aux->id2tr,k);
-        if ( !tr->gene->name )
-        {
-            // not a supported biotype (e.g. gene:pseudogene, transcript:processed_transcript)
-            regidx_free_tscript(&tr);
-            kh_del(int2tscript, aux->id2tr,k);
-            continue;
-        }
-
-        // populate regidx by category:
-        //      ftr->type   .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5
-        //      gene->type  .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ...
-        if ( ftr->type==GF_CDS ) register_cds(args, ftr);
-        else if ( ftr->type==GF_EXON ) register_exon(args, ftr);
-        else if ( ftr->type==GF_UTR5 ) register_utr(args, ftr);
-        else if ( ftr->type==GF_UTR3 ) register_utr(args, ftr);
-        else
-            error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,args->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type));
-    }
-    tscript_init_cds(args);
-
-    if ( args->verbosity > 0 )
-    {
-        fprintf(stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n",
-                regidx_nregs(args->idx_tscript),
-                regidx_nregs(args->idx_exon),
-                regidx_nregs(args->idx_cds),
-                regidx_nregs(args->idx_utr));
-    }
-    if ( !regidx_nregs(args->idx_tscript) )
-        fprintf(stderr,
-            "Warning: No usable transcripts found, likely a failure to parse a non-standard GFF file. Please check if the misc/gff2gff\n"
-            "         or misc/gff2gff.py script can fix the problem (both do different things). See also the man page for the description\n"
-            "         of the expected format http://samtools.github.io/bcftools/bcftools-man.html#csq\n");
-
-    free(aux->ftr);
-    khash_str2int_destroy_free(aux->seq2int);
-    // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene);
-    kh_destroy(int2tscript,aux->id2tr);
-    free(aux->seq);
-    gff_id_destroy(&aux->gene_ids);
-
-    if ( args->verbosity > 0 && khash_str2int_size(aux->ignored_biotypes) )
-    {
-        khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes;
-        fprintf(stderr,"Ignored the following biotypes:\n");
-        for (i = kh_begin(ign); i < kh_end(ign); i++)
-        {
-            if ( !kh_exist(ign,i)) continue;
-            const char *biotype = kh_key(ign,i);
-            if ( !strcmp(biotype,"TCE") ) biotype = "TCE (\"To be Experimentally Confirmed\")";
-            fprintf(stderr,"\t%dx\t.. %s\n", kh_value(ign,i), biotype);
-        }
-    }
-    khash_str2int_destroy_free(aux->ignored_biotypes);
-}
-
 static inline int ncsq2_to_nfmt(int ncsq2)
 {
     return 1 + (ncsq2 - 1) / 30;
@@ -1474,8 +483,17 @@ void init_data(args_t *args)
     args->fai = fai_load(args->fa_fname);
     if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname);
 
-    if ( args->verbosity > 0 ) fprintf(stderr,"Parsing %s ...\n", args->gff_fname);
-    init_gff(args);
+    args->gff = gff_init(args->gff_fname);                      // GFF handling goes through the gff object; it is released by gff_destroy() in destroy_data()
+    gff_set(args->gff,verbosity,args->verbosity);               // forward the csq command-line settings to the gff object before gff_parse() is called
+    gff_set(args->gff,strip_chr_names,args->unify_chr_names);   // NOTE(review): presumably the GFF-side counterpart of drop_chr_prefix() - confirm in gff.c
+    gff_set(args->gff,force_out_of_phase,args->force);
+    gff_set(args->gff,dump_fname,args->dump_gff);
+    gff_parse(args->gff);
+    args->idx_cds  = gff_get(args->gff,idx_cds);                // keep local handles to the region indexes obtained from the gff object;
+    args->idx_utr  = gff_get(args->gff,idx_utr);                // destroy_data() does not call regidx_destroy() on any of them
+    args->idx_exon = gff_get(args->gff,idx_exon);
+    args->idx_tscript = gff_get(args->gff,idx_tscript);
+    args->itr = regitr_init(NULL);                              // the iterator is created here and freed with regitr_destroy() in destroy_data()
 
     args->rid = -1;
 
@@ -1536,6 +554,7 @@ void init_data(args_t *args)
         if ( args->hdr_nsmpl )
             bcf_hdr_printf(args->hdr,"##FORMAT=<ID=%s,Number=.,Type=Integer,Description=\"Bitmask of indexes to INFO/BCSQ, with interleaved first/second haplotype. Use \\\"bcftools query -f'[%%CHROM\\t%%POS\\t%%SAMPLE\\t%%TBCSQ\\n]'\\\" to translate.\">",args->bcsq_tag);
         if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
+        if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
     }
     if ( args->verbosity > 0 ) fprintf(stderr,"Calling...\n");
 }
@@ -1547,21 +566,8 @@ void destroy_data(args_t *args)
             "Note: Some samples had too many consequences to be represented in %d bytes. If you need to record them all,\n"
             "      the limit can be increased by running with `--ncsq %d`.\n",ncsq2_to_nfmt(args->ncsq2_max)/8,1+args->ncsq2_small_warned/2);
 
-    regidx_destroy(args->idx_cds);
-    regidx_destroy(args->idx_utr);
-    regidx_destroy(args->idx_exon);
-    regidx_destroy(args->idx_tscript);
     regitr_destroy(args->itr);
-
-    khint_t k,i,j;
-    for (k=0; k<kh_end(args->init.gid2gene); k++)
-    {
-        if ( !kh_exist(args->init.gid2gene, k) ) continue;
-        gf_gene_t *gene = (gf_gene_t*) kh_val(args->init.gid2gene, k);
-        free(gene->name);
-        free(gene);
-    }
-    kh_destroy(int2gene,args->init.gid2gene);
+    gff_destroy(args->gff);
 
     if ( args->filter )
         filter_destroy(args->filter);
@@ -1569,9 +575,20 @@ void destroy_data(args_t *args)
     khp_destroy(trhp,args->active_tr);
     kh_destroy(pos2vbuf,args->pos2vbuf);
     if ( args->smpl ) smpl_ilist_destroy(args->smpl);
-    int ret;
+    int i,j,ret;
     if ( args->out_fh )
+    {
+        if ( args->write_index )    // the index was initialised by init_index() in init_data(); save it before closing the output
+        {
+            if ( bcf_idx_save(args->out_fh)<0 )
+            {
+                if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");    // still try to close the output when the index could not be saved
+                error("Error: cannot write to index %s\n", args->index_fn);    // the close succeeded, report the index failure itself
+            }
+            free(args->index_fn);   // index file name, handed to init_index() as &args->index_fn in init_data()
+        }
         ret = hts_close(args->out_fh);
+    }
     else
         ret = fclose(args->out);
     if ( ret ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
@@ -1602,7 +619,7 @@ void destroy_data(args_t *args)
     free(args->gt_arr);
     free(args->str.s);
     free(args->str2.s);
-    gff_id_destroy(&args->tscript_ids);
+    free(args->chr_name);   // scratch buffer that add_chr_prefix() grows with hts_expand()
 }
 
 /*
@@ -1614,7 +631,7 @@ void destroy_data(args_t *args)
 #define SPLICE_OVERLAP 3   // indel overlaps region boundary, csq set but could not determine csq
 typedef struct
 {
-    tscript_t *tr;
+    gf_tscript_t *tr;
     struct {
         int32_t pos, rlen, alen, ial;
         char *ref, *alt;
@@ -1678,7 +695,7 @@ fprintf(stderr,"build_hap:  rbeg=%d + %d    abeg=%d \n",rbeg,rlen,abeg);
     if ( rbeg < splice->vcf.pos )
     {
         assert( splice->tr->beg <= rbeg );  // this can be extended thanks to N_REF_PAD
-        kputsn(splice->tr->ref + N_REF_PAD + rbeg - splice->tr->beg, splice->vcf.pos - rbeg, &splice->kref);
+        kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + rbeg - splice->tr->beg, splice->vcf.pos - rbeg, &splice->kref);
         roff = 0;
     }
     else
@@ -1703,7 +720,7 @@ fprintf(stderr,"r2: %s\n",splice->kref.s);
         if ( end + rlen - splice->kref.l - 1 > splice->tr->end ) // trim, the requested sequence is too long (could be extended, see N_REF_PAD)
             rlen -= end + rlen - splice->kref.l - 1 - splice->tr->end;
         if ( splice->kref.l < rlen )
-            kputsn(splice->tr->ref + N_REF_PAD + end - splice->tr->beg, rlen - splice->kref.l, &splice->kref);
+            kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + end - splice->tr->beg, rlen - splice->kref.l, &splice->kref);
     }
 #if XDBG
 fprintf(stderr,"r3: %s\n",splice->kref.s);
@@ -1714,7 +731,7 @@ fprintf(stderr,"r3: %s\n",splice->kref.s);
     if ( abeg < splice->vcf.pos )
     {
         assert( splice->tr->beg <= abeg );
-        kputsn(splice->tr->ref + N_REF_PAD + abeg - splice->tr->beg, splice->vcf.pos - abeg, &splice->kalt);
+        kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + abeg - splice->tr->beg, splice->vcf.pos - abeg, &splice->kalt);
         aoff = 0;
     }
     else
@@ -1742,7 +759,7 @@ fprintf(stderr,"a2: %s  aoff=%d\n",splice->kalt.s,aoff);
         if ( end + alen + aoff - splice->kalt.l - 1 > splice->tr->end ) // trim, the requested sequence is too long
             alen -= end + alen + aoff - splice->kalt.l - 1 - splice->tr->end;
         if ( alen > 0 && alen > splice->kalt.l )
-            kputsn(splice->tr->ref + aoff + N_REF_PAD + end - splice->tr->beg, alen - splice->kalt.l, &splice->kalt);
+            kputsn(TSCRIPT_AUX(splice->tr)->ref + aoff + N_REF_PAD + end - splice->tr->beg, alen - splice->kalt.l, &splice->kalt);
     }
 #if XDBG
 fprintf(stderr,"a3: %s\n",splice->kalt.s);
@@ -1755,7 +772,7 @@ static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32
     while ( regitr_overlap(itr) )
     {
         gf_utr_t *utr = regitr_payload(itr, gf_utr_t*);
-        tscript_t *tr = utr->tr;
+        gf_tscript_t *tr = utr->tr;
         if ( tr->id != trid ) continue;
         csq_t csq;
         memset(&csq, 0, sizeof(csq_t));
@@ -1771,7 +788,7 @@ static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32
     }
     return 0;
 }
-static inline void csq_stage_splice(args_t *args, bcf1_t *rec, tscript_t *tr, uint32_t type, int ial)
+static inline void csq_stage_splice(args_t *args, bcf1_t *rec, gf_tscript_t *tr, uint32_t type, int ial)
 {
 #if XDBG
 fprintf(stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type);
@@ -1788,6 +805,21 @@ fprintf(stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type);
     csq.type.gene    = tr->gene->name;
     csq_stage(args, &csq, rec);
 }
+static inline const char *drop_chr_prefix(args_t *args, const char *chr)   // returns chr without a leading "chr" (case-insensitive) when args->unify_chr_names is set, otherwise chr unchanged
+{
+    if ( !args->unify_chr_names ) return chr;          // prefix handling not requested: pass the name through untouched
+    if ( !strncasecmp("chr",chr,3) ) return chr+3;     // the result points into the caller's string, nothing is copied or allocated
+    return chr;                                        // no "chr" prefix present
+}
+static inline const char *add_chr_prefix(args_t *args, const char *chr)    // returns "chr" followed by chr when args->unify_chr_names is set, otherwise chr unchanged
+{
+    if ( !args->unify_chr_names ) return chr;
+    int len = strlen(chr);
+    hts_expand(char,len+4,args->mchr_name,args->chr_name);  // room for the 3-byte prefix, the name and the terminating NUL
+    memcpy(args->chr_name,"chr",3);                         // the prefix is prepended unconditionally, chr is not checked for an existing "chr"
+    memcpy(args->chr_name+3,chr,len+1);                     // len+1 so that the terminating NUL is copied as well
+    return args->chr_name;                                  // buffer shared through args: overwritten by the next call, freed in destroy_data()
+}
 static inline int splice_csq_ins(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
 {
     // coordinates that matter for consequences, eg AC>ACG trimmed to C>CG, 1bp
@@ -1813,7 +845,7 @@ fprintf(stderr,"ins: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,tend=%d,%d  check_ut
         if ( splice->check_utr )
         {
             regitr_t *itr = regitr_init(NULL);
-            const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+            const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
             if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) )     // adjacent utr
             {
                 ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
@@ -1851,7 +883,7 @@ fprintf(stderr,"ins: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,tend=%d,%d  check_ut
         if ( splice->check_utr )
         {
             regitr_t *itr = regitr_init(NULL);
-            const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+            const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
             if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) )     // adjacent utr
             {
                 ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
@@ -1924,7 +956,7 @@ fprintf(stderr,"ins: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,tend=%d,%d  check_ut
 int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
 {
     static int small_ref_padding_warned = 0;
-    tscript_t *tr = splice->tr;
+    gf_tscript_t *tr = splice->tr;
 
     // We know the VCF record overlaps the exon, but does it overlap the start codon?
     if ( tr->strand==STRAND_REV && splice->vcf.pos + splice->vcf.rlen + 2 <= ex_end ) return 0;
@@ -1956,7 +988,7 @@ int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint
         }
 
         char *ptr_vcf = splice->vcf.ref + alt_len;                         // the first deleted base in the VCF REF allele
-        char *ptr_ref = splice->tr->ref + N_REF_PAD + (vcf_ref_end + 1 - splice->tr->beg);  // the first ref base after the ndel bases deleted
+        char *ptr_ref = TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + (vcf_ref_end + 1 - splice->tr->beg);  // the first ref base after the ndel bases deleted
 #if XDBG
         fprintf(stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref);
 #endif
@@ -1985,7 +1017,7 @@ int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint
         }
 
         char *ptr_vcf = splice->vcf.ref + alt_len;                                      // the first deleted base in the VCF REF allele
-        char *ptr_ref = splice->tr->ref + N_REF_PAD + vcf_block_beg - splice->tr->beg;  // the replacement ref block
+        char *ptr_ref = TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + vcf_block_beg - splice->tr->beg;  // the replacement ref block
 #if XDBG
         fprintf(stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref);
 #endif
@@ -2030,7 +1062,7 @@ fprintf(stderr,"splice_csq_del: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,tend=%d,%
             if ( splice->check_utr )
             {
                 regitr_t *itr = regitr_init(NULL);
-                const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+                const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
                 if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) )     // adjacent utr
                     csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
                 regitr_destroy(itr);
@@ -2086,7 +1118,7 @@ fprintf(stderr,"splice_csq_del: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,tend=%d,%
             if ( splice->check_utr )
             {
                 regitr_t *itr = regitr_init(NULL);
-                const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+                const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
                 if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) )     // adjacent utr
                     csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
                 regitr_destroy(itr);
@@ -2175,7 +1207,7 @@ fprintf(stderr,"mnp: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,tend=%d,%d  check_ut
             if ( splice->check_utr )
             {
                 regitr_t *itr = regitr_init(NULL);
-                const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+                const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
                 if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) )     // adjacent utr
                     csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
                 regitr_destroy(itr);
@@ -2205,7 +1237,7 @@ fprintf(stderr,"mnp: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,tend=%d,%d  check_ut
             if ( splice->check_utr )
             {
                 regitr_t *itr = regitr_init(NULL);
-                const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+                const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
                 if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) )     // adjacent utr
                     csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
                 regitr_destroy(itr);
@@ -2291,7 +1323,7 @@ int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds,
 {
     int i;
     kstring_t str = {0,0,0};
-    tscript_t *tr = cds->tr;
+    gf_tscript_t *tr = cds->tr;
     child->icds = cds->icds;     // index of cds in the tscript's list of exons
     child->vcf_ial = ial;
 
@@ -2313,8 +1345,8 @@ int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds,
     }
     if ( splice.check_start )   // do not check starts in incomplete CDS, defined as not starting with M
     {
-        if ( tr->strand==STRAND_FWD ) { if ( dna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; }
-        else { if ( cdna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; }
+        if ( tr->strand==STRAND_FWD ) { if ( dna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; }
+        else { if ( cdna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; }
     }
     if ( child->icds!=0 ) splice.check_region_beg = 1;
     if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1;
@@ -2373,12 +1405,12 @@ fprintf(stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, csq=%d\n\n
             // the variant is on a new exon, finish up the previous
             int len = tr->cds[i]->len - parent->rbeg - parent->rlen + tr->cds[i]->beg;
             if ( len > 0 )
-                kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
+                kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
         }
 
         // append any skipped non-variant exons
         while ( ++i < cds->icds )
-            kputsn_(tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len, &str);
+            kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len, &str);
 
         if ( parent->icds==child->icds )
         {
@@ -2390,10 +1422,10 @@ fprintf(stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, csq=%d\n\n
                 free(splice.kalt.s);
                 return 1;
             }
-            kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
+            kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
         }
         else
-            kputsn_(tr->ref + N_REF_PAD + cds->beg - tr->beg, splice.ref_beg - cds->beg, &str);
+            kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + cds->beg - tr->beg, splice.ref_beg - cds->beg, &str);
     }
     kputs(splice.kalt.s + dbeg, &str);
 
@@ -2645,28 +1677,28 @@ fprintf(stderr,"\ntranslate: %d %d %d  fill=%d  seq.l=%d\n",sbeg,rbeg,rend,fill,
 #endif
 }
 
-void tscript_splice_ref(tscript_t *tr)
+void tscript_splice_ref(gf_tscript_t *tr)
 {
     int i, len = 0;
     for (i=0; i<tr->ncds; i++)
         len += tr->cds[i]->len;
 
-    tr->nsref = len + 2*N_REF_PAD;
-    tr->sref  = (char*) malloc(len + 1 + 2*N_REF_PAD);
+    TSCRIPT_AUX(tr)->nsref = len + 2*N_REF_PAD;
+    TSCRIPT_AUX(tr)->sref  = (char*) malloc(len + 1 + 2*N_REF_PAD);
     len = 0;
 
-    memcpy(tr->sref, tr->ref + tr->cds[0]->beg - tr->beg, N_REF_PAD);
+    memcpy(TSCRIPT_AUX(tr)->sref, TSCRIPT_AUX(tr)->ref + tr->cds[0]->beg - tr->beg, N_REF_PAD);
     len += N_REF_PAD;
 
     for (i=0; i<tr->ncds; i++)
     {
-        memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len);
+        memcpy(TSCRIPT_AUX(tr)->sref + len, TSCRIPT_AUX(tr)->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len);
         len += tr->cds[i]->len;
     }
-    memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[tr->ncds-1]->beg - tr->beg, N_REF_PAD);
+    memcpy(TSCRIPT_AUX(tr)->sref + len, TSCRIPT_AUX(tr)->ref + N_REF_PAD + tr->cds[tr->ncds-1]->beg - tr->beg, N_REF_PAD);    // NOTE(review): trailing pad is read from the START of the last CDS (no + cds->len), unlike the leading pad which precedes the first CDS - confirm this is intended
     len += N_REF_PAD;
 
-    tr->sref[len] = 0;
+    TSCRIPT_AUX(tr)->sref[len] = 0;
 }
 
 // returns: 0 if consequence was added, 1 if it already exists or could not be added
@@ -2800,18 +1832,25 @@ void kput_vcsq(args_t *args, vcsq_t *csq, kstring_t *str)
     if ( csq->type & CSQ_UPSTREAM_STOP )
         kputc_('*',str);
 
-    int i, n = sizeof(csq_strings)/sizeof(char*);
+    int has_csq = 0, i, n = sizeof(csq_strings)/sizeof(char*);
     for (i=1; i<n; i++)
-        if ( csq_strings[i] && csq->type&(1<<i) ) { kputs(csq_strings[i],str); break; }
+        if ( csq_strings[i] && csq->type&(1<<i) ) { has_csq = 1; kputs(csq_strings[i],str); break; }
     i++;
     for (; i<n; i++)
-        if ( csq_strings[i] && csq->type&(1<<i) ) { kputc_('&',str); kputs(csq_strings[i],str); }
+        if ( csq_strings[i] && csq->type&(1<<i) ) { has_csq = 1; kputc_('&',str); kputs(csq_strings[i],str); }
+
+    if ( (csq->biotype==GF_NMD) && (csq->type & CSQ_PRN_NMD) )     // NMD biotype with the CSQ_PRN_NMD bit set: append an extra tag to the consequence field
+    {
+        if ( has_csq ) kputc_('&',str); // just in case, this should always be true
+        kputs("NMD_transcript",str);    // '&'-joined like the consequence strings written by the loops above
+    }
 
     kputc_('|', str);
     if ( csq->gene ) kputs(csq->gene , str);
 
     kputc_('|', str);
-    if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(args->tscript_ids.str[csq->trid], str);
+    // the transcript name is looked up by its numeric id through the gff object (args->tscript_ids is not used here)
+    if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(gff_id2string(args->gff,transcript,csq->trid), str);
 
     kputc_('|', str);
     kputs(gf_type2gff_string(csq->biotype), str);
@@ -2840,7 +1879,7 @@ void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *str)
 void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, int iend, int dlen, int indel)
 {
     int i;
-    tscript_t *tr = hap->tr;
+    gf_tscript_t *tr = hap->tr;
     int ref_node = tr->strand==STRAND_FWD ? ibeg : iend;
     int icsq = node->ncsq_list++;
     hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
@@ -2954,7 +1993,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
     str.l = 0;
 
     // create the aa variant string
-    int aa_rbeg = tr->strand==STRAND_FWD ? node2rbeg(ibeg)/3+1 : (hap->tr->nsref - 2*N_REF_PAD - node2rend(iend))/3+1;
+    int aa_rbeg = tr->strand==STRAND_FWD ? node2rbeg(ibeg)/3+1 : (TSCRIPT_AUX(hap->tr)->nsref - 2*N_REF_PAD - node2rend(iend))/3+1;
     int aa_sbeg = tr->strand==STRAND_FWD ? node2sbeg(ibeg)/3+1 : (tlen - node2send(iend))/3+1;
     kputc_('|', &str);
     kputw(aa_rbeg, &str);
@@ -3020,13 +2059,13 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
 
 void hap_finalize(args_t *args, hap_t *hap)
 {
-    tscript_t *tr = hap->tr;
-    if ( !tr->sref )
+    gf_tscript_t *tr = hap->tr;
+    if ( !TSCRIPT_AUX(tr)->sref )
         tscript_splice_ref(tr);
 
     kstring_t sref;
-    sref.s = tr->sref;
-    sref.l = tr->nsref;
+    sref.s = TSCRIPT_AUX(tr)->sref;
+    sref.l = TSCRIPT_AUX(tr)->nsref;
     sref.m = sref.l;
 
     int istack = 0;
@@ -3034,7 +2073,7 @@ void hap_finalize(args_t *args, hap_t *hap)
 
     hap->sseq.l = 0;
     hap->tseq.l = 0;
-    hap->stack[0].node = tr->root;
+    hap->stack[0].node = TSCRIPT_AUX(tr)->root;
     hap->stack[0].ichild = -1;
     hap->stack[0].slen = 0;
     hap->stack[0].dlen = 0;
@@ -3214,7 +2253,7 @@ static inline void csq_print_text(args_t *args, csq_t *csq, int ismpl, int ihap)
     kput_vcsq(args, &csq->type, &args->str);
     fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s);
 }
-static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
+static inline void hap_print_text(args_t *args, gf_tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
 {
     if ( !node || !node->ncsq_list ) return;
 
@@ -3240,7 +2279,7 @@ static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ih
     }
 }
 
-static inline void hap_stage_vcf(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
+static inline void hap_stage_vcf(args_t *args, gf_tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
 {
     if ( !node || !node->ncsq_list || ismpl<0 ) return;
 
@@ -3276,23 +2315,23 @@ void hap_flush(args_t *args, uint32_t pos)
     tr_heap_t *heap = args->active_tr;
     while ( heap->ndat && heap->dat[0]->end<=pos )
     {
-        tscript_t *tr = heap->dat[0];
+        gf_tscript_t *tr = heap->dat[0];
         khp_delete(trhp, heap);
         args->hap->tr = tr;
-        if ( tr->root && tr->root->nchild ) // normal, non-localized calling
+        if ( TSCRIPT_AUX(tr)->root && TSCRIPT_AUX(tr)->root->nchild ) // normal, non-localized calling
         {
             hap_finalize(args, args->hap);
 
             if ( args->output_type==FT_TAB_TEXT )   // plain text output, not a vcf
             {
                 if ( args->phase==PHASE_DROP_GT )
-                    hap_print_text(args, tr, -1,0, tr->hap[0]);
+                    hap_print_text(args, tr, -1,0, TSCRIPT_AUX(tr)->hap[0]);
                 else
                 {
                     for (i=0; i<args->smpl->n; i++)
                     {
                         for (j=0; j<2; j++)
-                            hap_print_text(args, tr, args->smpl->idx[i],j+1, tr->hap[i*2+j]);
+                            hap_print_text(args, tr, args->smpl->idx[i],j+1, TSCRIPT_AUX(tr)->hap[i*2+j]);
                     }
                 }
             }
@@ -3301,7 +2340,7 @@ void hap_flush(args_t *args, uint32_t pos)
                 for (i=0; i<args->smpl->n; i++)
                 {
                     for (j=0; j<2; j++)
-                        hap_stage_vcf(args, tr, args->smpl->idx[i],j, tr->hap[i*2+j]);
+                        hap_stage_vcf(args, tr, args->smpl->idx[i],j, TSCRIPT_AUX(tr)->hap[i*2+j]);
                 }
             }
         }
@@ -3309,7 +2348,7 @@ void hap_flush(args_t *args, uint32_t pos)
         // mark the transcript for deletion. Cannot delete it immediately because
         // by-position VCF output will need them when flushed by vcf_buf_push
         args->nrm_tr++;
-        hts_expand(tscript_t*,args->nrm_tr,args->mrm_tr,args->rm_tr);
+        hts_expand(gf_tscript_t*,args->nrm_tr,args->mrm_tr,args->rm_tr);
         args->rm_tr[args->nrm_tr-1] = tr;
     }
 }
@@ -3424,24 +2463,33 @@ void vbuf_flush(args_t *args, uint32_t pos)
 
     for (i=0; i<args->nrm_tr; i++)
     {
-        tscript_t *tr = args->rm_tr[i];
-        if ( tr->root ) hap_destroy(tr->root);
-        tr->root = NULL;
-        free(tr->hap);
-        free(tr->ref);
-        free(tr->sref);
+        gf_tscript_t *tr = args->rm_tr[i];          // transcript queued for removal by hap_flush()
+        tscript_t *aux = TSCRIPT_AUX(tr);           // csq's own per-transcript data (root, hap, ref, sref); NOTE(review): presumably a cast of tr->aux - confirm the macro
+        if ( aux->root ) hap_destroy(aux->root);
+        aux->root = NULL;
+        free(aux->hap);
+        free(aux->ref);
+        free(aux->sref);
+        free(aux);                                  // the aux struct itself is released here; its allocation site is not visible in this hunk
+        tr->aux = NULL;                             // do not leave a dangling pointer on the transcript
     }
     args->nrm_tr = 0;
     args->ncsq_buf = 0;
 }
 
-void tscript_init_ref(args_t *args, tscript_t *tr, const char *chr)
+void tscript_init_ref(args_t *args, gf_tscript_t *tr, const char *chr)
 {
     int i, len;
     int pad_beg = tr->beg >= N_REF_PAD ? N_REF_PAD : tr->beg;
 
-    tr->ref = faidx_fetch_seq(args->fai, chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len);
-    if ( !tr->ref )
+    const char *tmp_chr = chr;                      // sequence name actually used for the fasta lookup; chr itself is kept for the error message
+    if ( !faidx_has_seq(args->fai,tmp_chr) )        // name as given is not in the fasta index: try the alternative spellings
+    {
+        tmp_chr = drop_chr_prefix(args,chr);        // first without a leading "chr" ...
+        if ( !faidx_has_seq(args->fai,tmp_chr) ) tmp_chr = add_chr_prefix(args,chr);    // ... then with "chr" prepended; both helpers return chr unchanged unless args->unify_chr_names is set
+    }
+    TSCRIPT_AUX(tr)->ref = faidx_fetch_seq(args->fai, tmp_chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len);  // fetch the transcript region plus up to N_REF_PAD bases on each side
+    if ( !TSCRIPT_AUX(tr)->ref )
         error("faidx_fetch_seq failed %s:%d-%d\n", chr,tr->beg+1,tr->end+1);
 
     int pad_end = len - (tr->end - tr->beg + 1 + pad_beg);
@@ -3449,23 +2497,23 @@ void tscript_init_ref(args_t *args, tscript_t *tr, const char *chr)
     {
         char *ref = (char*) malloc(tr->end - tr->beg + 1 + 2*N_REF_PAD + 1);
         for (i=0; i < N_REF_PAD - pad_beg; i++) ref[i] = 'N';
-        memcpy(ref+i, tr->ref, len);
+        memcpy(ref+i, TSCRIPT_AUX(tr)->ref, len);
         len += i;
         for (i=0; i < N_REF_PAD - pad_end; i++) ref[i+len] = 'N';
         ref[i+len] = 0;
-        free(tr->ref);
-        tr->ref = ref;
+        free(TSCRIPT_AUX(tr)->ref);
+        TSCRIPT_AUX(tr)->ref = ref;
     }
 }
 
-static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec)
+static void sanity_check_ref(args_t *args, gf_tscript_t *tr, bcf1_t *rec)
 {
     int vbeg = 0;
     int rbeg = rec->pos - tr->beg + N_REF_PAD;
     if ( rbeg < 0 ) { vbeg += abs(rbeg); rbeg = 0; }
-    char *ref = tr->ref + rbeg;
+    char *ref = TSCRIPT_AUX(tr)->ref + rbeg;
     char *vcf = rec->d.allele[0] + vbeg;
-    assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) && ref - tr->ref < tr->end - tr->beg + 2*N_REF_PAD );
+    assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) && ref - TSCRIPT_AUX(tr)->ref < tr->end - tr->beg + 2*N_REF_PAD );
     int i = 0;
     while ( ref[i] && vcf[i] )
     {
@@ -3479,7 +2527,7 @@ static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec)
 int test_cds_local(args_t *args, bcf1_t *rec)
 {
     int i,j, ret = 0;
-    const char *chr = bcf_seqname(args->hdr,rec);
+    const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
     // note that the off-by-one extension of rlen is deliberate to account for insertions
     if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
 
@@ -3491,12 +2539,13 @@ int test_cds_local(args_t *args, bcf1_t *rec)
     while ( regitr_overlap(args->itr) )
     {
         gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
-        tscript_t *tr = cds->tr;
+        gf_tscript_t *tr = cds->tr;
         if ( !GF_is_coding(tr->type) ) continue;
         ret = 1;
 
-        if ( !tr->ref )
+        if ( !TSCRIPT_AUX(tr) )
         {
+            tr->aux = calloc(sizeof(tscript_t),1);
             tscript_init_ref(args, tr, chr);
             tscript_splice_ref(tr);
             khp_insert(trhp, args->active_tr, &tr);     // only to clean the reference afterwards
@@ -3505,8 +2554,8 @@ int test_cds_local(args_t *args, bcf1_t *rec)
         sanity_check_ref(args, tr, rec);
 
         kstring_t sref;
-        sref.s = tr->sref;
-        sref.l = tr->nsref;
+        sref.s = TSCRIPT_AUX(tr)->sref;
+        sref.l = TSCRIPT_AUX(tr)->nsref;
         sref.m = sref.l;
 
         for (i=1; i<rec->n_allele; i++)
@@ -3614,8 +2663,8 @@ int test_cds_local(args_t *args, bcf1_t *rec)
                 {
                     // create the aa variant string
                     kstring_t str = {0,0,0};
-                    int aa_rbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD - node.sbeg - node.rlen)/3+1;
-                    int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1;
+                    int aa_rbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (TSCRIPT_AUX(tr)->nsref - 2*N_REF_PAD - node.sbeg - node.rlen)/3+1;
+                    int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (TSCRIPT_AUX(tr)->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1;
                     kputc_('|', &str);
                     kputw(aa_rbeg, &str);
                     kprint_aa_prediction(args,aa_rbeg,tref,&str);
@@ -3633,11 +2682,11 @@ int test_cds_local(args_t *args, bcf1_t *rec)
                     csq_stage(args, &csq, rec);
 
                     // all this only to clean vstr when vrec is flushed
-                    if ( !tr->root )
-                        tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
-                    tr->root->ncsq_list++;
-                    hts_expand0(csq_t,tr->root->ncsq_list,tr->root->mcsq_list,tr->root->csq_list);
-                    csq_t *rm_csq = tr->root->csq_list + tr->root->ncsq_list - 1;
+                    if ( !TSCRIPT_AUX(tr)->root )
+                        TSCRIPT_AUX(tr)->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
+                    TSCRIPT_AUX(tr)->root->ncsq_list++;
+                    hts_expand0(csq_t,TSCRIPT_AUX(tr)->root->ncsq_list,TSCRIPT_AUX(tr)->root->mcsq_list,TSCRIPT_AUX(tr)->root->csq_list);
+                    csq_t *rm_csq = TSCRIPT_AUX(tr)->root->csq_list + TSCRIPT_AUX(tr)->root->ncsq_list - 1;
                     rm_csq->type.vstr = str;
                 }
                 if ( csq_type & ~CSQ_COMPOUND )
@@ -3659,27 +2708,28 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
     static int overlaps_warned = 0, multiploid_warned = 0;
 
     int i, ret = 0, hap_ret;
-    const char *chr = bcf_seqname(args->hdr,rec);
+    const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
     // note that the off-by-one extension of rlen is deliberate to account for insertions
     if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
     while ( regitr_overlap(args->itr) )
     {
         gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
-        tscript_t *tr = cds->tr;
+        gf_tscript_t *tr = cds->tr;
         if ( !GF_is_coding(tr->type) ) continue;
         if ( vbuf->keep_until < tr->end ) vbuf->keep_until = tr->end;
         ret = 1;
-        if ( !tr->root )
+        if ( !TSCRIPT_AUX(tr) )
         {
             // initialize the transcript and its haplotype tree, fetch the reference sequence
+            tr->aux = calloc(sizeof(tscript_t),1);
             tscript_init_ref(args, tr, chr);
 
-            tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
-            tr->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n;     // maximum ploidy = diploid
-            tr->hap  = (hap_node_t**) malloc(tr->nhap*sizeof(hap_node_t*));
-            for (i=0; i<tr->nhap; i++) tr->hap[i] = NULL;
-            tr->root->nend = tr->nhap;
-            tr->root->type = HAP_ROOT;
+            TSCRIPT_AUX(tr)->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
+            TSCRIPT_AUX(tr)->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n;     // maximum ploidy = diploid
+            TSCRIPT_AUX(tr)->hap  = (hap_node_t**) malloc(TSCRIPT_AUX(tr)->nhap*sizeof(hap_node_t*));
+            for (i=0; i<TSCRIPT_AUX(tr)->nhap; i++) TSCRIPT_AUX(tr)->hap[i] = NULL;
+            TSCRIPT_AUX(tr)->root->nend = TSCRIPT_AUX(tr)->nhap;
+            TSCRIPT_AUX(tr)->root->type = HAP_ROOT;
 
             khp_insert(trhp, args->active_tr, &tr);
         }
@@ -3689,7 +2739,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
         if ( args->phase==PHASE_DROP_GT )
         {
             if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
-            hap_node_t *parent = tr->hap[0] ? tr->hap[0] : tr->root;
+            hap_node_t *parent = TSCRIPT_AUX(tr)->hap[0] ? TSCRIPT_AUX(tr)->hap[0] : TSCRIPT_AUX(tr)->root;
             hap_node_t *child  = (hap_node_t*)calloc(1,sizeof(hap_node_t));
             hap_ret = hap_init(args, parent, child, cds, rec, 1);
             if ( hap_ret!=0 )
@@ -3734,8 +2784,8 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
             parent->mchild = 1;
             parent->child  = (hap_node_t**) malloc(sizeof(hap_node_t*));
             parent->child[0] = child;
-            tr->hap[0] = child;
-            tr->hap[0]->nend = 1;
+            TSCRIPT_AUX(tr)->hap[0] = child;
+            TSCRIPT_AUX(tr)->hap[0]->nend = 1;
             continue;
         }
 
@@ -3793,12 +2843,12 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
                 assert( ial < rec->n_allele );
                 if ( rec->d.allele[ial][0]=='<' || rec->d.allele[ial][0]=='*' ) { continue; }
 
-                hap_node_t *parent = tr->hap[i] ? tr->hap[i] : tr->root;
+                hap_node_t *parent = TSCRIPT_AUX(tr)->hap[i] ? TSCRIPT_AUX(tr)->hap[i] : TSCRIPT_AUX(tr)->root;
                 if ( parent->cur_rec==rec && parent->cur_child[ial]>=0 )
                 {
                     // this haplotype has been seen in another sample
-                    tr->hap[i] = parent->child[ parent->cur_child[ial] ];
-                    tr->hap[i]->nend++;
+                    TSCRIPT_AUX(tr)->hap[i] = parent->child[ parent->cur_child[ial] ];
+                    TSCRIPT_AUX(tr)->hap[i]->nend++;
                     parent->nend--;
                     continue;
                 }
@@ -3852,8 +2902,8 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
                 hts_expand0(hap_node_t*,parent->nchild,parent->mchild,parent->child);
                 parent->cur_child[ial] = j;
                 parent->child[j] = child;
-                tr->hap[i] = child;
-                tr->hap[i]->nend++;
+                TSCRIPT_AUX(tr)->hap[i] = child;
+                TSCRIPT_AUX(tr)->hap[i]->nend++;
                 parent->nend--;
             }
         }
@@ -3933,7 +2983,7 @@ void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec)
 }
 int test_utr(args_t *args, bcf1_t *rec)
 {
-    const char *chr = bcf_seqname(args->hdr,rec);
+    const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
     // note that the off-by-one extension of rlen is deliberate to account for insertions
     if ( !regidx_overlap(args->idx_utr,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
 
@@ -3944,7 +2994,7 @@ int test_utr(args_t *args, bcf1_t *rec)
     while ( regitr_overlap(args->itr) )
     {
         gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*);
-        tscript_t *tr = splice.tr = utr->tr;
+        gf_tscript_t *tr = splice.tr = utr->tr;
         for (i=1; i<rec->n_allele; i++)
         {
             if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; }
@@ -3971,7 +3021,7 @@ int test_utr(args_t *args, bcf1_t *rec)
 }
 int test_splice(args_t *args, bcf1_t *rec)
 {
-    const char *chr = bcf_seqname(args->hdr,rec);
+    const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
     if ( !regidx_overlap(args->idx_exon,chr,rec->pos,rec->pos + rec->rlen, args->itr) ) return 0;
 
     splice_t splice;
@@ -4003,7 +3053,7 @@ int test_splice(args_t *args, bcf1_t *rec)
 }
 int test_tscript(args_t *args, bcf1_t *rec)
 {
-    const char *chr = bcf_seqname(args->hdr,rec);
+    const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
     if ( !regidx_overlap(args->idx_tscript,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
 
     splice_t splice;
@@ -4012,7 +3062,7 @@ int test_tscript(args_t *args, bcf1_t *rec)
     int i, ret = 0;
     while ( regitr_overlap(args->itr) )
     {
-        tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*);
+        gf_tscript_t *tr = splice.tr = regitr_payload(args->itr, gf_tscript_t*);
         for (i=1; i<rec->n_allele; i++)
         {
             if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; }
@@ -4046,7 +3096,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
         warned = 1;
     }
 
-    const char *chr = bcf_seqname(args->hdr,rec);
+    const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
 
     // only insertions atm
     int beg = rec->pos + 1;
@@ -4061,7 +3111,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
             csq_t csq;
             memset(&csq, 0, sizeof(csq_t));
             gf_cds_t *cds    = regitr_payload(args->itr,gf_cds_t*);
-            tscript_t *tr    = cds->tr;
+            gf_tscript_t *tr = cds->tr;
             csq.type.type    = (GF_is_coding(tr->type) ? CSQ_CODING_SEQUENCE : CSQ_NON_CODING) | csq_class;
             csq.pos          = rec->pos;
             csq.type.biotype = tr->type;
@@ -4079,7 +3129,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
             csq_t csq;
             memset(&csq, 0, sizeof(csq_t));
             gf_utr_t *utr    = regitr_payload(args->itr, gf_utr_t*);
-            tscript_t *tr    = utr->tr;
+            gf_tscript_t *tr = utr->tr;
             csq.type.type    = (utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3) | csq_class;
             csq.pos          = rec->pos;
             csq.type.biotype = tr->type;
@@ -4118,7 +3168,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
         {
             csq_t csq;
             memset(&csq, 0, sizeof(csq_t));
-            tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*);
+            gf_tscript_t *tr = splice.tr = regitr_payload(args->itr, gf_tscript_t*);
             splice.vcf.alt = rec->d.allele[1];
             splice.csq     = csq_class;
             int splice_ret = splice_csq(args, &splice, tr->beg, tr->end);
@@ -4179,7 +3229,10 @@ static void process(args_t *args, bcf1_t **rec_ptr)
         // Perform a simple sanity check (that does not catch much), the chromosome must be present in the
         // reference file
         if ( !faidx_has_seq(args->fai,bcf_seqname(args->hdr,rec)) )
-            error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname);
+        {
+            if ( !faidx_has_seq(args->fai,drop_chr_prefix(args,bcf_seqname(args->hdr,rec))) && !faidx_has_seq(args->fai,add_chr_prefix(args,bcf_seqname(args->hdr,rec))) )
+                error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname);
+        }
     }
     if ( prev_pos > rec->pos )
         error("Error: The file is not sorted, %s:%d comes before %s:%"PRId64"\n",bcf_seqname(args->hdr,rec),prev_pos+1,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
@@ -4254,9 +3307,12 @@ static const char *usage(void)
         "                                       r: require phased GTs, throw an error on unphased het GTs\n"
         "                                       R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
         "                                       s: skip unphased hets\n"
-        "Options:\n"
-        "   -e, --exclude EXPR                Exclude sites for which the expression is true\n"
+        "GFF options:\n"
+        "       --dump-gff FILE.gz            Dump the parsed GFF file (for debugging purposes)\n"
         "       --force                       Run even if some sanity checks fail\n"
+        "       --unify-chr-names 1|0         Automatically unify chromosome naming (e.g. chrX vs X) in GFF, fasta, and VCF [1]\n"
+        "General options:\n"
+        "   -e, --exclude EXPR                Exclude sites for which the expression is true\n"
         "   -i, --include EXPR                Select sites for which the expression is true\n"
         "       --no-version                  Do not append version and command line to the header\n"
         "   -o, --output FILE                 Write output to a file [standard output]\n"
@@ -4272,6 +3328,7 @@ static const char *usage(void)
         "       --targets-overlap 0|1|2       Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"
         "       --threads INT                 Use multithreading with <int> worker threads [0]\n"
         "   -v, --verbose INT                 Verbosity level 0-2 [1]\n"
+        "       --write-index                 Automatically index the output files [off]\n"
         "\n"
         "Example:\n"
         "   bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n"
@@ -4292,6 +3349,7 @@ int main_csq(int argc, char *argv[])
     args->verbosity = 1;
     args->record_cmd_line = 1;
     args->clevel = -1;
+    args->unify_chr_names = 1;
 
     static struct option loptions[] =
     {
@@ -4321,6 +3379,9 @@ int main_csq(int argc, char *argv[])
         {"targets-file",1,0,'T'},
         {"targets-overlap",required_argument,NULL,5},
         {"no-version",no_argument,NULL,3},
+        {"write-index",no_argument,NULL,6},
+        {"dump-gff",required_argument,NULL,7},
+        {"unify-chr-names",required_argument,NULL,8},
         {0,0,0,0}
     };
     int c, targets_is_file = 0, regions_is_file = 0;
@@ -4339,7 +3400,7 @@ int main_csq(int argc, char *argv[])
             case  3 : args->record_cmd_line = 0; break;
             case 'b':
                     args->brief_predictions = 1;
-                    fprintf(stderr,"Warning: the -b option will be removed in future versions. Please use -B 1 instead.\n");
+                    fprintf(stderr,"Warning: The -b option will be removed in future versions. Please use -B 1 instead.\n");
                     break;
             case 'B':
                     args->brief_predictions = strtol(optarg,&tmp,10);
@@ -4409,6 +3470,13 @@ int main_csq(int argc, char *argv[])
                 targets_overlap = parse_overlap_option(optarg);
                 if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg);
                 break;
+            case  6 : args->write_index = 1; break;
+            case  7 : args->dump_gff = optarg; break;
+            case  8 :
+                if ( !strcmp(optarg,"0") ) args->unify_chr_names = 0;
+                else if ( !strcmp(optarg,"1") ) args->unify_chr_names = 1;
+                else error("Could not parse: --unify-chr-names %s\n",optarg);
+                break;
             case 'h':
             case '?': error("%s",usage());
             default: error("The option not recognised: %s\n\n", optarg); break;
diff --git a/doc/bcftools.1 b/doc/bcftools.1
index 0e3d5290e..c940065fb 100644
--- a/doc/bcftools.1
+++ b/doc/bcftools.1
@@ -2,12 +2,12 @@
 .\"     Title: bcftools
 .\"    Author: [see the "AUTHOR(S)" section]
 .\" Generator: Asciidoctor 2.0.16.dev
-.\"      Date: 2023-02-21
+.\"      Date: 2023-07-25
 .\"    Manual: \ \&
 .\"    Source: \ \&
 .\"  Language: English
 .\"
-.TH "BCFTOOLS" "1" "2023-02-21" "\ \&" "\ \&"
+.TH "BCFTOOLS" "1" "2023-07-25" "\ \&" "\ \&"
 .ie \n(.g .ds Aq \(aq
 .el       .ds Aq '
 .ss \n[.ss] 0
@@ -51,10 +51,10 @@ standard input (stdin) and outputs to the standard output (stdout). Several
 commands can thus be  combined  with  Unix pipes.
 .SS "VERSION"
 .sp
-This manual page was last updated \fB2023\-02\-21\fP and refers to bcftools git version \fB1.17\fP.
+This manual page was last updated \fB2023\-07\-25\fP and refers to bcftools git version \fB1.18\fP.
 .SS "BCF1"
 .sp
-The BCF1 format output by versions of samtools <= 0.1.19 is \fBnot\fP
+The obsolete BCF1 format output by versions of samtools <= 0.1.19 is \fBnot\fP
 compatible with this version of bcftools. To read BCF1 files one can use
 the view command from old versions of bcftools packaged with samtools
 versions <= 0.1.19 to convert to VCF, which can then be read by
@@ -75,6 +75,9 @@ done with \fIbcftools view\fP. Users are now required to choose between the old
 samtools calling model (\fI\-c/\-\-consensus\-caller\fP) and the new multiallelic
 calling model (\fI\-m/\-\-multiallelic\-caller\fP). The multiallelic calling model
 is recommended for most tasks.
+.SS "FILTERING EXPRESSIONS"
+.sp
+See \fBEXPRESSIONS\fP
 .SH "LIST OF COMMANDS"
 .sp
 For a full list of available commands, run \fBbcftools\fP without arguments. For a full
@@ -344,6 +347,17 @@ Some helper scripts are bundled with the bcftools code.
 .  sp -1
 .  IP \(bu 2.3
 .\}
+\fBgff2gff\fP  .. converts a GFF file to the format required by \fBcsq\fP
+.RE
+.sp
+.RS 4
+.ie n \{\
+\h'-04'\(bu\h'+03'\c
+.\}
+.el \{\
+.  sp -1
+.  IP \(bu 2.3
+.\}
 \fBplot\-vcfstats\fP  .. plots the output of \fBstats\fP
 .RE
 .SH "COMMANDS AND OPTIONS"
@@ -597,6 +611,11 @@ Same as \fB\-\-regions\-overlap\fP but for \fB\-t/\-T\fP.
 Use multithreading with \fIINT\fP worker threads. The option is currently used only for the compression of the
 output stream, only when \fI\-\-output\-type\fP is \fIb\fP or \fIz\fP. Default: 0.
 .RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output files. Can be used only for compressed BCF and VCF output.
+.RE
 .SS "bcftools annotate \fI[OPTIONS]\fP \fIFILE\fP"
 .sp
 Add or remove annotations.
@@ -881,6 +900,11 @@ except GT. To remove all INFO tags except "FOO" and "BAR", use
 "INFO" can be abbreviated to "INF" and "FORMAT" to "FMT".
 .RE
 .sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
+.sp
 \fBExamples:\fP
 .sp
 .if n .RS 4
@@ -1017,6 +1041,11 @@ see \fBCommon Options\fP
 .RS 4
 see \fBCommon Options\fP
 .RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
 .SS "Input/output options:"
 .sp
 \fB\-A, \-\-keep\-alts\fP
@@ -1401,6 +1430,11 @@ see \fBCommon Options\fP
 .RS 4
 see \fBCommon Options\fP
 .RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
 .SS "bcftools consensus \fI[OPTIONS]\fP \fIFILE\fP"
 .sp
 Create consensus sequence by applying VCF variants to a reference fasta file.
@@ -1432,18 +1466,13 @@ exclude sites for which \fIEXPRESSION\fP is true. For valid expressions see
 reference sequence in fasta format
 .RE
 .sp
-\fB\-H, \-\-haplotype\fP \fI1\fP|\fI2\fP|\fIR\fP|\fIA\fP|\fII\fP|\fILR\fP|\fILA\fP|\fISR\fP|\fISA\fP|\fI1pIu\fP|\fI2pIu\fP
+\fB\-H, \-\-haplotype\fP N|\fIR\fP|\fIA\fP|\fII\fP|\fILR\fP|\fILA\fP|\fISR\fP|\fISA\fP|\fINpIu\fP
 .RS 4
 choose which allele from the FORMAT/GT field to use (the codes are case\-insensitive):
 .sp
-\fI1\fP
-.RS 4
-the first allele, regardless of phasing
-.RE
-.sp
-\fI2\fP
+\fIN\fP
 .RS 4
-the second allele, regardless of phasing
+N={1,2,3,...}, the allele index within the genotype, regardless of phasing
 .RE
 .sp
 \fIR\fP
@@ -1471,20 +1500,15 @@ the longer allele. If both have the same length, use the REF allele (LR), or the
 the shorter allele. If both have the same length, use the REF allele (SR), or the ALT allele  (SA)
 .RE
 .sp
-\fI1pIu, 2pIu\fP
+\fINpIu\fP
 .RS 4
-first/second allele for phased genotypes and IUPAC code for unphased genotypes
-.sp
-.if n .RS 4
-.nf
-.fam C
-This option requires *\-s*, unless exactly one sample is present in the VCF
-.fam
-.fi
-.if n .RE
+N={1,2,3,...}, the allele index within genotype for phased genotypes and IUPAC code for unphased genotypes.
+For example, \fI1pIu\fP or \fI2pIu\fP
 .RE
 .RE
 .sp
+Note that the \fB\-H, \-\-haplotype\fP option requires the \fB\-s, \-\-samples\fP option, unless exactly one sample is present in the VCF
+.sp
 \fB\-i, \-\-include\fP \fIEXPRESSION\fP
 .RS 4
 include only sites for which \fIEXPRESSION\fP is true. For valid expressions see
@@ -1494,24 +1518,24 @@ include only sites for which \fIEXPRESSION\fP is true. For valid expressions see
 \fB\-I, \-\-iupac\-codes\fP
 .RS 4
 output variants in the form of IUPAC ambiguity codes determined from FORMAT/GT fields. By default all
-samples are used and can be subset with \f(CR\-s, \-\-samples\fP and \f(CR\-S, \-\-samples\-file\fP. Use \f(CR\-s \-\fP to ignore
+samples are used and can be subset with \fB\-s, \-\-samples\fP and \fB\-S, \-\-samples\-file\fP. Use \fB\-s \-\fP to ignore
 samples and use only the REF and ALT columns.  NOTE: prior to version 1.17 the IUPAC codes were determined solely
 from REF,ALT columns and sample genotypes were not considered.
 .RE
 .sp
 \fB\-\-mark\-del\fP \fICHAR\fP
 .RS 4
-instead of removing sequence, insert CHAR for deletions
+instead of removing sequence, insert character CHAR for deletions
 .RE
 .sp
-\fB\-\-mark\-ins\fP \fIuc\fP|\fIlc\fP
+\fB\-\-mark\-ins\fP \fIuc\fP|\fIlc\fP|\fICHAR\fP
 .RS 4
-highlight inserted sequence in uppercase (uc) or lowercase (lc), leaving the rest of the sequence as is
+highlight inserted sequence in uppercase (uc), lowercase (lc), or a provided character CHAR, leaving the rest of the sequence as is
 .RE
 .sp
 \fB\-\-mark\-snv\fP \fIuc\fP|\fIlc\fP
 .RS 4
-highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest of the sequence as is
+highlight substitutions in uppercase (uc), lowercase (lc), or a provided character CHAR, leaving the rest of the sequence as is
 .RE
 .sp
 \fB\-m, \-\-mask\fP \fIFILE\fP
@@ -1539,12 +1563,12 @@ write output to a file
 .sp
 \fB\-s, \-\-samples\fP \fILIST\fP
 .RS 4
-apply variants of the listed samples. See also the option \f(CR\-I, \-\-iupac\-codes\fP
+apply variants of the listed samples. See also the option \fB\-I, \-\-iupac\-codes\fP
 .RE
 .sp
 \fB\-S, \-\-samples\-file\fP \fIFILE\fP
 .RS 4
-apply variants of the samples listed in the file. See also the option \f(CR\-I, \-\-iupac\-codes\fP
+apply variants of the samples listed in the file. See also the option \fB\-I, \-\-iupac\-codes\fP
 .RE
 .sp
 \fBExamples:\fP
@@ -1563,6 +1587,44 @@ apply variants of the samples listed in the file. See also the option \f(CR\-I,
 .fam
 .fi
 .if n .RE
+.sp
+\fBNotes:\fP
+.RS 4
+Masking options are applied in the following order
+.sp
+.RS 4
+.ie n \{\
+\h'-04' 1.\h'+01'\c
+.\}
+.el \{\
+.  sp -1
+.  IP " 1." 4.2
+.\}
+mask regions with \fB\-\-mask\-with\fP character if \fB\-\-mask\fP is given. All overlapping VCF variants are ignored
+.RE
+.sp
+.RS 4
+.ie n \{\
+\h'-04' 2.\h'+01'\c
+.\}
+.el \{\
+.  sp -1
+.  IP " 2." 4.2
+.\}
+replace sequence not mentioned in the VCF with the requested character if \fB\-\-absent\fP is given
+.RE
+.sp
+.RS 4
+.ie n \{\
+\h'-04' 3.\h'+01'\c
+.\}
+.el \{\
+.  sp -1
+.  IP " 3." 4.2
+.\}
+finally apply \fB\-\-mark\-del\fP, \fB\-\-mark\-ins\fP, \fB\-\-mark\-snv\fP masks
+.RE
+.RE
 .SS "bcftools convert \fI[OPTIONS]\fP \fIFILE\fP"
 .SS "VCF input options:"
 .sp
@@ -1617,6 +1679,11 @@ see \fBCommon Options\fP
 .RS 4
 see \fBCommon Options\fP
 .RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
 .SS "VCF output options:"
 .sp
 \fB\-\-no\-version\fP
@@ -1887,13 +1954,13 @@ convert from TSV (tab\-separated values) format (such as generated by
 \fB\-c, \-\-columns\fP \fIlist\fP
 .RS 4
 comma\-separated list of fields in the input file. In the current
-version, the fields CHROM, POS, ID, and AA are expected and
-can appear in arbitrary order, columns which should be ignored in the input
+version, the fields CHROM, POS, ID, and AA or REF, ALT are expected and
+can appear in arbitrary order. Columns which should be ignored in the input
 file can be indicated by "\-".
 The AA field lists alleles on the forward reference strand,
 for example "CC" or "CT" for diploid genotypes or "C"
 for haploid genotypes (sex chromosomes). Insertions and deletions
-are not supported yet, missing data can be indicated with "\-\-".
+are supported only with REF and ALT but not with AA. Missing data can be indicated with "\-\-" or ".".
 .RE
 .sp
 \fB\-f, \-\-fasta\-ref\fP \fIfile\fP
@@ -1917,7 +1984,10 @@ file of sample names. See \fBCommon Options\fP
 .nf
 .fam C
 # Convert 23andme results into VCF
-bcftools convert \-c ID,CHROM,POS,AA \-s SampleName \-f 23andme\-ref.fa \-\-tsv2vcf 23andme.txt \-Oz \-o out.vcf.gz
+bcftools convert \-c ID,CHROM,POS,AA \-s SampleName \-f 23andme\-ref.fa \-\-tsv2vcf 23andme.txt \-o out.vcf.gz
+
+# Convert tab\-delimited file into a sites\-only VCF (no genotypes), in this example first column to be ignored
+bcftools convert \-c \-,CHROM,POS,REF,ALT \-f ref.fa \-\-tsv2vcf calls.txt \-o out.bcf
 .fam
 .fi
 .if n .RE
@@ -1966,6 +2036,12 @@ aminoacids, with \fB\-B 1\fP only an abbreviated version such as \fI25E..329>25G
 written.
 .RE
 .sp
+\fB\-\-dump\-gff\fP \fIFILE\fP
+.RS 4
+dump the parsed GFF into a gzipped FILE. Intended for debugging purposes,
+shows how the input GFF is viewed by the program.
+.RE
+.sp
 \fB\-e, \-\-exclude\fP \fIEXPRESSION\fP
 .RS 4
 exclude sites for which \fIEXPRESSION\fP is true. For valid expressions see
@@ -1987,6 +2063,7 @@ transcripts in malformatted GFFs with incorrect phase
 .RS 4
 GFF3 annotation file (required), such as \c
 .URL "ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens" "" "."
+The script \fBgff2gff\fP can help with conversion from non\-standard GFF formats.
 An example of a minimal working GFF file:
 .RE
 .sp
@@ -1998,6 +2075,17 @@ An example of a minimal working GFF file:
     # the gene (determined from the transcript\*(Aqs "Parent=gene:" attribute), and the biotype
     # (the most interesting is "protein_coding").
     #
+    # Empty and commented lines are skipped, the following GFF columns are required
+    #   1.  chromosome
+    #   2.  IGNORED
+    #   3.  type (CDS, exon, three_prime_UTR, five_prime_UTR, gene, transcript, etc.)
+    #   4.  start of the feature (1\-based)
+    #   5.  end of the feature (1\-based)
+    #   6.  IGNORED
+    #   7.  strand (+ or \-)
+    #   8.  phase (0, 1, 2 or .)
+    #   9.  semicolon\-separated attributes (see below)
+    #
     # Attributes required for
     #   gene lines:
     #   \- ID=gene:<gene_id>
@@ -2137,6 +2225,18 @@ see \fBCommon Options\fP
 see \fBCommon Options\fP
 .RE
 .sp
+\fB\-\-unify\-chr\-names\fP \fI0\fP|\fI1\fP
+.RS 4
+Automatically detect and unify chromosome naming conventions in the GFF, fasta
+and VCF, such as "chrX" vs "X". The chromosome names in the output VCF will match
+those of the input VCF. The default is to attempt the automatic translation.
+.RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
+.sp
 \fBExamples:\fP
 .sp
 .if n .RS 4
@@ -2366,6 +2466,11 @@ see \fBCommon Options\fP
 .RS 4
 see \fBCommon Options\fP
 .RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
 .SS "bcftools gtcheck [\fIOPTIONS\fP] [\fB\-g\fP \fIgenotypes.vcf.gz\fP] \fIquery.vcf.gz\fP"
 .sp
 Checks sample identity. The program can operate in two modes. If the \fB\-g\fP
@@ -2373,6 +2478,10 @@ option is given, the identity of samples from \fIquery.vcf.gz\fP
 is checked against the samples in the \fB\-g\fP file.
 Without the \fB\-g\fP option, multi\-sample cross\-check of samples in \fIquery.vcf.gz\fP is performed.
 .sp
+Note that the interpretation of the discordance score depends on the options provided (specifically \fB\-e\fP and
+\fB\-u\fP) and on the available annotations (FORMAT/PL vs FORMAT/GT).
+The discordance score can be interpreted as the number of mismatching genotypes if only GT\-vs\-GT matching is performed.
+.sp
 \fB\-\-distinctive\-sites\fP \fINUM[,MEM[,DIR]]\fP
 .RS 4
 Find sites that can distinguish between at least NUM sample pairs. If the number is smaller or equal to 1,
@@ -2391,11 +2500,18 @@ Stop after first record to estimate required time.
 Interpret genotypes and genotype likelihoods probabilistically. The value of \fIINT\fP
 represents genotype quality when GT tag is used (e.g. Q=30 represents one error in 1,000 genotypes and
 Q=40 one error in 10,000 genotypes) and is ignored when PL tag is used (in that case an arbitrary
-non\-zero integer can be provided). See also the \fB\-u, \-\-use\fP option below. If set to 0,
-the discordance equals to the number of mismatching genotypes when GT vs GT is compared.
-Note that the values with and without \fB\-e\fP are not comparable, only values generated
-with \fB\-e 0\fP correspond to mismatching genotypes.
-If performance is an issue, set to 0 for faster run but less accurate results.
+non\-zero integer can be provided).
+\~
+.br
+\~
+.br
+If \fB\-e\fP is set to 0, the discordance score can be interpreted as the number of mismatching genotypes,
+but only in the GT\-vs\-GT matching mode. See the \fB\-u, \-\-use\fP option below for additional notes and caveats.
+\~
+.br
+\~
+.br
+If performance is an issue, set \fB\-e 0\fP for faster run times but less accurate results.
 .RE
 .sp
 \fB\-g, \-\-genotypes\fP \fIFILE\fP
@@ -2476,8 +2592,15 @@ see \fBCommon Options\fP
 \fB\-u, \-\-use\fP \fITAG1\fP[,\fITAG2\fP]
 .RS 4
 specifies which tag to use in the query file (\fITAG1\fP) and the \fB\-g\fP (\fITAG2\fP) file.
-By default, the PL tag is used in the query file and GT in the \fB\-g\fP file when
-available.
+By default, the PL tag is used in the query file and, when available, the GT tags in the
+\fB\-g\fP file.
+\~
+.br
+\~
+.br
+Note that when the requested tag is not available, the program will attempt to use
+the other tag. The output includes the number of sites that were matched by the four
+possible modes (for example GT\-vs\-GT or GT\-vs\-PL).
 .RE
 .sp
 \fBExamples:\fP
@@ -2676,6 +2799,11 @@ see \fBCommon Options\fP
 list of input files to output given as 1\-based indices. With \fB\-p\fP and no
 \fB\-w\fP, all files are written.
 .RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file. This is done automatically with the \fB\-p\fP option.
+.RE
 .SS "Examples:"
 .sp
 Create intersection and complements of two sets saving the output in dir/*
@@ -2785,7 +2913,8 @@ merge gVCF blocks, INFO/END tag is expected. If the reference fasta
 file \fIFILE\fP is not given and the dash (\fI\-\fP) is given, unknown reference
 bases generated at gVCF block splits will be substituted with N\(cqs.
 The \fB\-\-gvcf\fP option uses the following default INFO rules:
-\fB\-i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\fP.
+\fB\-i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\fP and the following missing
+rules: \fB\-M PL:max,AD:0\fP.
 .RE
 .sp
 \fB\-i, \-\-info\-rules\fP \fI\-\fP|\fITAG:METHOD\fP[,...]
@@ -2835,6 +2964,17 @@ The option controls what types of multiallelic records can be created:
 .fi
 .if n .RE
 .sp
+\fB\-M, \-\-missing\-rules\fP \fI\-\fP|\fITAG:METHOD\fP[,...]
+.RS 4
+Rules for merging vector tags at multiallelic sites. When input files have different alternate
+alleles, vector fields pertaining to unobserved alleles are set to missing (\fI.\fP) by default.
+The \fIMETHOD\fP is one of \fI.\fP (the default, use missing values), \fINUMBER\fP (use a constant value, e.g. 0),
+\fImax\fP (the maximum value observed for other alleles in the sample). When \fB\-\-gvcf\fP option is set,
+the rule \fB\-M PL:max,AD:0\fP is implied. This can be overridden by providing \fB\-M \-\fP or \fB\-M PL:.,AD:.\fP.
+Note that if the unobserved allele is explicitly present as \fI<*>\fP or \fI<NON_REF>\fP, then its corresponding
+value will be used regardless of \fB\-M\fP settings.
+.RE
+.sp
 \fB\-\-no\-index\fP
 .RS 4
 the option allows to merge files without indexing them first. In order for this
@@ -2876,6 +3016,11 @@ see \fBCommon Options\fP
 .RS 4
 see \fBCommon Options\fP
 .RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
 .SS "bcftools mpileup [\fIOPTIONS\fP] \fB\-f\fP \fIref.fa\fP \fIin.bam\fP [\fIin2.bam\fP [...]]"
 .sp
 Generate VCF or BCF containing genotype likelihoods for one or multiple
@@ -3209,6 +3354,11 @@ BQB.
 .fi
 .if n .RE
 .RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
 .SS "Options for SNP/INDEL genotype likelihood computation"
 .sp
 \fB\-X, \-\-config\fP \fISTR\fP
@@ -3431,6 +3581,13 @@ try to proceed with \fB\-m\-\fP even if malformed tags with incorrect number of
 are encountered, discarding such tags. (Experimental, use at your own risk.)
 .RE
 .sp
+\fB\-g, \-\-gff\-annot\fP \fIFILE\fP
+.RS 4
+when a GFF file is provided, follow HGVS 3\(cq rule and right\-align variants in transcripts on the forward
+strand.  In case of overlapping transcripts, the default mode is to left\-align the variant. For a
+description of the supported GFF3 file format see \fBbcftools csq\fP.
+.RE
+.sp
 \fB\-\-keep\-sum\fP \fITAG\fP[,...]
 .RS 4
 keep vector sum constant when splitting multiallelic sites. Only AD tag
@@ -3528,6 +3685,11 @@ see \fBCommon Options\fP
 maximum distance between two records to consider when locally
 sorting variants which changed position during the realignment
 .RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
 .SS "bcftools [plugin \fINAME\fP|+\fINAME\fP] \fI[OPTIONS]\fP \fIFILE\fP \(em \fI[PLUGIN OPTIONS]\fP"
 .sp
 A common framework for various utilities. The plugins can be used
@@ -3601,6 +3763,11 @@ see \fBCommon Options\fP
 .RS 4
 see \fBCommon Options\fP
 .RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
 .SS "Plugin options:"
 .sp
 \fB\-h, \-\-help\fP
@@ -4723,7 +4890,13 @@ see \fBCommon Options\fP
 .sp
 \fB\-T, \-\-temp\-dir\fP \fIDIR\fP
 .RS 4
-Use this directory to store temporary files
+Use this directory to store temporary files. If the last six characters of the string DIR are XXXXXX,
+then these are replaced with a string that makes the directory name unique.
+.RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
 .RE
 .SS "bcftools stats [\fIOPTIONS\fP] \fIA.vcf.gz\fP [\fIB.vcf.gz\fP]"
 .sp
@@ -4943,6 +5116,11 @@ see \fBCommon Options\fP
 .RS 4
 see \fBCommon Options\fP
 .RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
 .SS "Subset options:"
 .sp
 \fB\-a, \-\-trim\-alt\-alleles\fP
@@ -5137,7 +5315,7 @@ important libraries used by bcftools.
 .SS "bcftools [\fI\-\-version\-only\fP]"
 .sp
 Display the full bcftools version number in a machine\-readable format.
-.SH "EXPRESSIONS"
+.SH "FILTERING EXPRESSIONS"
 .sp
 These filtering expressions are accepted by most of the commands.
 .sp
@@ -5919,7 +6097,18 @@ bcftools view \-i \*(Aq%ID!="." & MAF[0]<0.01\*(Aq
 .if n .RE
 .sp
 Please refer to the documentation of your shell for details.
-.SH "SCRIPTS AND OPTIONS"
+.SH "SCRIPTS"
+.SS "gff2gff"
+.sp
+Attempts to fix a GFF file to be correctly parsed by \fBcsq\fP.
+.sp
+.if n .RS 4
+.nf
+.fam C
+zcat in.gff.gz | gff2gff | gzip \-c > out.gff.gz
+.fam
+.fi
+.if n .RE
 .SS "plot\-vcfstats [\fIOPTIONS\fP] \fIfile.vchk\fP [...]"
 .sp
 Script for processing output of \fBbcftools stats\fP. It can merge
@@ -6013,8 +6202,10 @@ Please report any bugs you encounter on the github website: \c
 .sp
 Heng Li from the Sanger Institute wrote the original C version of htslib,
 samtools and bcftools. Bob Handsaker from the Broad Institute implemented the
-BGZF library. Petr Danecek, Shane McCarthy and John Marshall are  maintaining
-and further developing bcftools.  Many other people contributed to the program
+BGZF library. Petr Danecek is maintaining and further developing bcftools, together
+with the rest of the \c
+.URL "https://www.sanger.ac.uk/tool/samtools\-bcftools\-htslib" "samtools team" "."
+Many other people contributed to the program
 and to the file format specifications, both directly and indirectly by
 providing patches, testing and reporting bugs. We thank them all.
 .SH "RESOURCES"
diff --git a/doc/bcftools.html b/doc/bcftools.html
index 5a4f5ae51..0b4baab9e 100644
--- a/doc/bcftools.html
+++ b/doc/bcftools.html
@@ -50,13 +50,13 @@ <h2 id="_description">DESCRIPTION</h2>
 <div class="sect2">
 <h3 id="_version">VERSION</h3>
 <div class="paragraph">
-<p>This manual page was last updated <strong>2023-02-21</strong> and refers to bcftools git version <strong>1.17</strong>.</p>
+<p>This manual page was last updated <strong>2023-07-25</strong> and refers to bcftools git version <strong>1.18</strong>.</p>
 </div>
 </div>
 <div class="sect2">
 <h3 id="_bcf1">BCF1</h3>
 <div class="paragraph">
-<p>The BCF1 format output by versions of samtools &lt;= 0.1.19 is <strong>not</strong>
+<p>The obsolete BCF1 format output by versions of samtools &lt;= 0.1.19 is <strong>not</strong>
 compatible with this version of bcftools. To read BCF1 files one can use
 the view command from old versions of bcftools packaged with samtools
 versions &lt;= 0.1.19 to convert to VCF, which can then be read by
@@ -79,6 +79,12 @@ <h3 id="_variant_calling">VARIANT CALLING</h3>
 is recommended for most tasks.</p>
 </div>
 </div>
+<div class="sect2">
+<h3 id="_filtering_expressions">FILTERING EXPRESSIONS</h3>
+<div class="paragraph">
+<p>See <strong><a href="#expressions">EXPRESSIONS</a></strong></p>
+</div>
+</div>
 </div>
 </div>
 <div class="sect1">
@@ -172,6 +178,9 @@ <h2 id="_list_of_scripts">LIST OF SCRIPTS</h2>
 <div class="ulist">
 <ul>
 <li>
+<p><strong><a href="#gff2gff">gff2gff</a></strong>  .. converts a GFF file to the format required by <strong><a href="#csq">csq</a></strong></p>
+</li>
+<li>
 <p><strong><a href="#plot-vcfstats">plot-vcfstats</a></strong>  .. plots the output of <strong><a href="#stats">stats</a></strong></p>
 </li>
 </ul>
@@ -417,6 +426,10 @@ <h3 id="common_options">Common Options</h3>
 <p>Use multithreading with <em>INT</em> worker threads. The option is currently used only for the compression of the
 output stream, only when <em>--output-type</em> is <em>b</em> or <em>z</em>. Default: 0.</p>
 </dd>
+<dt class="hdlist1"><strong>--write-index</strong></dt>
+<dd>
+<p>Automatically index the output files. Can be used only for compressed BCF and VCF output.</p>
+</dd>
 </dl>
 </div>
 </div>
@@ -668,6 +681,10 @@ <h3 id="annotate">bcftools annotate <em>[OPTIONS]</em> <em>FILE</em></h3>
 "^INFO/FOO,INFO/BAR" (and similarly for FORMAT and FILTER).
 "INFO" can be abbreviated to "INF" and "FORMAT" to "FMT".</p>
 </dd>
+<dt class="hdlist1"><strong>--write-index</strong></dt>
+<dd>
+<p>Automatically index the output file</p>
+</dd>
 </dl>
 </div>
 <div class="paragraph">
@@ -797,6 +814,10 @@ <h4 id="_file_format_options">File format options:</h4>
 <dd>
 <p>see <strong><a href="#common_options">Common Options</a></strong></p>
 </dd>
+<dt class="hdlist1"><strong>--write-index</strong></dt>
+<dd>
+<p>Automatically index the output file</p>
+</dd>
 </dl>
 </div>
 </div>
@@ -1161,6 +1182,10 @@ <h3 id="concat">bcftools concat <em>[OPTIONS]</em> <em>FILE1</em> <em>FILE2</em>
 <dd>
 <p>see <strong><a href="#common_options">Common Options</a></strong></p>
 </dd>
+<dt class="hdlist1"><strong>--write-index</strong></dt>
+<dd>
+<p>Automatically index the output file</p>
+</dd>
 </dl>
 </div>
 </div>
@@ -1194,18 +1219,14 @@ <h3 id="consensus">bcftools consensus <em>[OPTIONS]</em> <em>FILE</em></h3>
 <dd>
 <p>reference sequence in fasta format</p>
 </dd>
-<dt class="hdlist1"><strong>-H, --haplotype</strong> <em>1</em>|<em>2</em>|<em>R</em>|<em>A</em>|<em>I</em>|<em>LR</em>|<em>LA</em>|<em>SR</em>|<em>SA</em>|<em>1pIu</em>|<em>2pIu</em></dt>
+<dt class="hdlist1"><strong>-H, --haplotype</strong> N|<em>R</em>|<em>A</em>|<em>I</em>|<em>LR</em>|<em>LA</em>|<em>SR</em>|<em>SA</em>|<em>NpIu</em></dt>
 <dd>
 <p>choose which allele from the FORMAT/GT field to use (the codes are case-insensitive):</p>
 <div class="dlist">
 <dl>
-<dt class="hdlist1"><em>1</em></dt>
-<dd>
-<p>the first allele, regardless of phasing</p>
-</dd>
-<dt class="hdlist1"><em>2</em></dt>
+<dt class="hdlist1"><em>N</em></dt>
 <dd>
-<p>the second allele, regardless of phasing</p>
+<p>N={1,2,3,&#8230;&#8203;}, the allele index within the genotype, regardless of phasing</p>
 </dd>
 <dt class="hdlist1"><em>R</em></dt>
 <dd>
@@ -1227,18 +1248,21 @@ <h3 id="consensus">bcftools consensus <em>[OPTIONS]</em> <em>FILE</em></h3>
 <dd>
 <p>the shorter allele. If both have the same length, use the REF allele (SR), or the ALT allele  (SA)</p>
 </dd>
-<dt class="hdlist1"><em>1pIu, 2pIu</em></dt>
+<dt class="hdlist1"><em>NpIu</em></dt>
 <dd>
-<p>first/second allele for phased genotypes and IUPAC code for unphased genotypes</p>
-<div class="literalblock">
-<div class="content">
-<pre>This option requires *-s*, unless exactly one sample is present in the VCF</pre>
-</div>
-</div>
+<p>N={1,2,3,&#8230;&#8203;}, the allele index within genotype for phased genotypes and IUPAC code for unphased genotypes.
+For example, <em>1pIu</em> or <em>2pIu</em></p>
 </dd>
 </dl>
 </div>
 </dd>
+</dl>
+</div>
+<div class="paragraph">
+<p>Note that the <strong>-H, --haplotype</strong> option requires the <strong>-s, --samples</strong> option, unless exactly one sample is present in the VCF</p>
+</div>
+<div class="dlist">
+<dl>
 <dt class="hdlist1"><strong>-i, --include</strong> <em>EXPRESSION</em></dt>
 <dd>
 <p>include only sites for which <em>EXPRESSION</em> is true. For valid expressions see
@@ -1247,21 +1271,21 @@ <h3 id="consensus">bcftools consensus <em>[OPTIONS]</em> <em>FILE</em></h3>
 <dt class="hdlist1"><strong>-I, --iupac-codes</strong></dt>
 <dd>
 <p>output variants in the form of IUPAC ambiguity codes determined from FORMAT/GT fields. By default all
-samples are used and can be subset with <code>-s, --samples</code> and <code>-S, --samples-file</code>. Use <code>-s -</code> to ignore
+samples are used and can be subset with <strong>-s, --samples</strong> and <strong>-S, --samples-file</strong>. Use <strong>-s -</strong> to ignore
 samples and use only the REF and ALT columns.  NOTE: prior to version 1.17 the IUPAC codes were determined solely
 from REF,ALT columns and sample genotypes were not considered.</p>
 </dd>
 <dt class="hdlist1"><strong>--mark-del</strong> <em>CHAR</em></dt>
 <dd>
-<p>instead of removing sequence, insert CHAR for deletions</p>
+<p>instead of removing sequence, insert character CHAR for deletions</p>
 </dd>
-<dt class="hdlist1"><strong>--mark-ins</strong> <em>uc</em>|<em>lc</em></dt>
+<dt class="hdlist1"><strong>--mark-ins</strong> <em>uc</em>|<em>lc</em>|<em>CHAR</em></dt>
 <dd>
-<p>highlight inserted sequence in uppercase (uc) or lowercase (lc), leaving the rest of the sequence as is</p>
+<p>highlight inserted sequence in uppercase (uc), lowercase (lc), or a provided character CHAR, leaving the rest of the sequence as is</p>
 </dd>
 <dt class="hdlist1"><strong>--mark-snv</strong> <em>uc</em>|<em>lc</em></dt>
 <dd>
-<p>highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest of the sequence as is</p>
+<p>highlight substitutions in uppercase (uc), lowercase (lc), or a provided character CHAR, leaving the rest of the sequence as is</p>
 </dd>
 <dt class="hdlist1"><strong>-m, --mask</strong> <em>FILE</em></dt>
 <dd>
@@ -1284,11 +1308,11 @@ <h3 id="consensus">bcftools consensus <em>[OPTIONS]</em> <em>FILE</em></h3>
 </dd>
 <dt class="hdlist1"><strong>-s, --samples</strong> <em>LIST</em></dt>
 <dd>
-<p>apply variants of the listed samples. See also the option <code>-I, --iupac-codes</code></p>
+<p>apply variants of the listed samples. See also the option <strong>-I, --iupac-codes</strong></p>
 </dd>
 <dt class="hdlist1"><strong>-S, --samples-file</strong> <em>FILE</em></dt>
 <dd>
-<p>apply variants of the samples listed in the file. See also the option <code>-I, --iupac-codes</code></p>
+<p>apply variants of the samples listed in the file. See also the option <strong>-I, --iupac-codes</strong></p>
 </dd>
 </dl>
 </div>
@@ -1307,6 +1331,27 @@ <h3 id="consensus">bcftools consensus <em>[OPTIONS]</em> <em>FILE</em></h3>
     # For more examples see http://samtools.github.io/bcftools/howtos/consensus-sequence.html</pre>
 </div>
 </div>
+<div class="dlist">
+<dl>
+<dt class="hdlist1"><strong>Notes:</strong></dt>
+<dd>
+<p>Masking options are applied in the following order</p>
+<div class="olist arabic">
+<ol class="arabic">
+<li>
+<p>mask regions with <strong>--mask-with</strong> character if <strong>--mask</strong> is given. All overlapping VCF variants are ignored</p>
+</li>
+<li>
+<p>replace sequence not mentioned in the VCF with the requested character if <strong>--absent</strong> is given</p>
+</li>
+<li>
+<p>finally apply <strong>--mark-del</strong>, <strong>--mark-ins</strong>, <strong>--mark-snv</strong> masks</p>
+</li>
+</ol>
+</div>
+</dd>
+</dl>
+</div>
 </div>
 <div class="sect2">
 <h3 id="convert">bcftools convert <em>[OPTIONS]</em> <em>FILE</em></h3>
@@ -1356,6 +1401,10 @@ <h4 id="_vcf_input_options">VCF input options:</h4>
 <dd>
 <p>see <strong><a href="#common_options">Common Options</a></strong></p>
 </dd>
+<dt class="hdlist1"><strong>--write-index</strong></dt>
+<dd>
+<p>Automatically index the output file</p>
+</dd>
 </dl>
 </div>
 </div>
@@ -1637,13 +1686,13 @@ <h4 id="_tsv_conversion">TSV conversion:</h4>
 <dt class="hdlist1"><strong>-c, --columns</strong> <em>list</em></dt>
 <dd>
 <p>comma-separated list of fields in the input file. In the current
-version, the fields CHROM, POS, ID, and AA are expected and
-can appear in arbitrary order, columns which should be ignored in the input
+version, the fields CHROM, POS, ID, and AA or REF, ALT are expected and
+can appear in arbitrary order. Columns which should be ignored in the input
 file can be indicated by "-".
 The AA field lists alleles on the forward reference strand,
 for example "CC" or "CT" for diploid genotypes or "C"
 for haploid genotypes (sex chromosomes). Insertions and deletions
-are not supported yet, missing data can be indicated with "--".</p>
+are supported only with REF and ALT but not with AA. Missing data can be indicated with "--" or ".".</p>
 </dd>
 <dt class="hdlist1"><strong>-f, --fasta-ref</strong> <em>file</em></dt>
 <dd>
@@ -1665,7 +1714,10 @@ <h4 id="_tsv_conversion">TSV conversion:</h4>
 <div class="listingblock">
 <div class="content">
 <pre># Convert 23andme results into VCF
-bcftools convert -c ID,CHROM,POS,AA -s SampleName -f 23andme-ref.fa --tsv2vcf 23andme.txt -Oz -o out.vcf.gz</pre>
+bcftools convert -c ID,CHROM,POS,AA -s SampleName -f 23andme-ref.fa --tsv2vcf 23andme.txt -o out.vcf.gz
+
+# Convert tab-delimited file into a sites-only VCF (no genotypes), in this example the first column is ignored
+bcftools convert -c -,CHROM,POS,REF,ALT -f ref.fa --tsv2vcf calls.txt -o out.bcf</pre>
 </div>
 </div>
 </div>
@@ -1721,6 +1773,11 @@ <h3 id="csq">bcftools csq <em>[OPTIONS]</em> <em>FILE</em></h3>
 aminoacids, with <strong>-B 1</strong> only an abbreviated version such as <em>25E..329&gt;25G..94</em> will be
 written.</p>
 </dd>
+<dt class="hdlist1"><strong>--dump-gff</strong> <em>FILE</em></dt>
+<dd>
+<p>dump the parsed GFF into a gzipped FILE. Intended for debugging purposes,
+shows how the input GFF is viewed by the program.</p>
+</dd>
 <dt class="hdlist1"><strong>-e, --exclude</strong> <em>EXPRESSION</em></dt>
 <dd>
 <p>exclude sites for which <em>EXPRESSION</em> is true. For valid expressions see
@@ -1738,6 +1795,7 @@ <h3 id="csq">bcftools csq <em>[OPTIONS]</em> <em>FILE</em></h3>
 <dt class="hdlist1"><strong>-g, --gff-annot</strong> <em>FILE</em></dt>
 <dd>
 <p>GFF3 annotation file (required), such as <a href="ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens" class="bare">ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens</a>.
+The script <strong><a href="#gff2gff">gff2gff</a></strong> can help with conversion from non-standard GFF formats.
 An example of a minimal working GFF file:</p>
 </dd>
 </dl>
@@ -1749,6 +1807,17 @@ <h3 id="csq">bcftools csq <em>[OPTIONS]</em> <em>FILE</em></h3>
     # the gene (determined from the transcript's "Parent=gene:" attribute), and the biotype
     # (the most interesting is "protein_coding").
     #
+    # Empty and commented lines are skipped, the following GFF columns are required
+    #   1.  chromosome
+    #   2.  IGNORED
+    #   3.  type (CDS, exon, three_prime_UTR, five_prime_UTR, gene, transcript, etc.)
+    #   4.  start of the feature (1-based)
+    #   5.  end of the feature (1-based)
+    #   6.  IGNORED
+    #   7.  strand (+ or -)
+    #   8.  phase (0, 1, 2 or .)
+    #   9.  semicolon-separated attributes (see below)
+    #
     # Attributes required for
     #   gene lines:
     #   - ID=gene:&lt;gene_id&gt;
@@ -1871,6 +1940,16 @@ <h3 id="csq">bcftools csq <em>[OPTIONS]</em> <em>FILE</em></h3>
 <dd>
 <p>see <strong><a href="#common_options">Common Options</a></strong></p>
 </dd>
+<dt class="hdlist1"><strong>--unify-chr-names</strong> <em>0</em>|<em>1</em></dt>
+<dd>
+<p>Automatically detect and unify chromosome naming conventions in the GFF, fasta
+and VCF, such as "chrX" vs "X". The chromosome names in the output VCF will match
+those of the input VCF. The default is to attempt the automatic translation.</p>
+</dd>
+<dt class="hdlist1"><strong>--write-index</strong></dt>
+<dd>
+<p>Automatically index the output file</p>
+</dd>
 </dl>
 </div>
 <div class="paragraph">
@@ -2084,6 +2163,10 @@ <h3 id="filter">bcftools filter <em>[OPTIONS]</em> <em>FILE</em></h3>
 <dd>
 <p>see <strong><a href="#common_options">Common Options</a></strong></p>
 </dd>
+<dt class="hdlist1"><strong>--write-index</strong></dt>
+<dd>
+<p>Automatically index the output file</p>
+</dd>
 </dl>
 </div>
 </div>
@@ -2095,6 +2178,11 @@ <h3 id="gtcheck">bcftools gtcheck [<em>OPTIONS</em>] [<strong>-g</strong> <em>ge
 is checked against the samples in the <strong>-g</strong> file.
 Without the <strong>-g</strong> option, multi-sample cross-check of samples in <em>query.vcf.gz</em> is performed.</p>
 </div>
+<div class="paragraph">
+<p>Note that the interpretation of the discordance score depends on the options provided (specifically <strong>-e</strong> and
+<strong>-u</strong>) and on the available annotations (FORMAT/PL vs FORMAT/GT).
+The discordance score can be interpreted as the number of mismatching genotypes if only GT-vs-GT matching is performed.</p>
+</div>
 <div class="dlist">
 <dl>
 <dt class="hdlist1"><strong>--distinctive-sites</strong> <em>NUM[,MEM[,DIR]]</em></dt>
@@ -2113,11 +2201,14 @@ <h3 id="gtcheck">bcftools gtcheck [<em>OPTIONS</em>] [<strong>-g</strong> <em>ge
 <p>Interpret genotypes and genotype likelihoods probabilistically. The value of <em>INT</em>
 represents genotype quality when GT tag is used (e.g. Q=30 represents one error in 1,000 genotypes and
 Q=40 one error in 10,000 genotypes) and is ignored when PL tag is used (in that case an arbitrary
-non-zero integer can be provided). See also the <strong>-u, --use</strong> option below. If set to 0,
-the discordance equals to the number of mismatching genotypes when GT vs GT is compared.
-Note that the values with and without <strong>-e</strong> are not comparable, only values generated
-with <strong>-e 0</strong> correspond to mismatching genotypes.
-If performance is an issue, set to 0 for faster run but less accurate results.</p>
+non-zero integer can be provided).
+&#160;<br>
+&#160;<br>
+If <strong>-e</strong> is set to 0, the discordance score can be interpreted as the number of mismatching genotypes,
+but only in the GT-vs-GT matching mode. See the <strong>-u, --use</strong> option below for additional notes and caveats.
+&#160;<br>
+&#160;<br>
+If performance is an issue, set <strong>-e 0</strong> for faster run times but less accurate results.</p>
 </dd>
 <dt class="hdlist1"><strong>-g, --genotypes</strong> <em>FILE</em></dt>
 <dd>
@@ -2191,8 +2282,13 @@ <h3 id="gtcheck">bcftools gtcheck [<em>OPTIONS</em>] [<strong>-g</strong> <em>ge
 <dt class="hdlist1"><strong>-u, --use</strong> <em>TAG1</em>[,<em>TAG2</em>]</dt>
 <dd>
 <p>specifies which tag to use in the query file (<em>TAG1</em>) and the <strong>-g</strong> (<em>TAG2</em>) file.
-By default, the PL tag is used in the query file and GT in the <strong>-g</strong> file when
-available.</p>
+By default, the PL tag is used in the query file and, when available, the GT tags in the
+<strong>-g</strong> file.
+&#160;<br>
+&#160;<br>
+Note that when the requested tag is not available, the program will attempt to use
+the other tag. The output includes the number of sites that were matched by the four
+possible modes (for example GT-vs-GT or GT-vs-PL).</p>
 </dd>
 </dl>
 </div>
@@ -2394,6 +2490,10 @@ <h3 id="isec">bcftools isec [<em>OPTIONS</em>]  <em>A.vcf.gz</em> <em>B.vcf.gz</
 <p>list of input files to output given as 1-based indices. With <strong>-p</strong> and no
 <strong>-w</strong>, all files are written.</p>
 </dd>
+<dt class="hdlist1"><strong>--write-index</strong></dt>
+<dd>
+<p>Automatically index the output file. This is done automatically with the <strong>-p</strong> option.</p>
+</dd>
 </dl>
 </div>
 <div class="sect3">
@@ -2497,7 +2597,8 @@ <h3 id="merge">bcftools merge [<em>OPTIONS</em>] <em>A.vcf.gz</em> <em>B.vcf.gz<
 file <em>FILE</em> is not given and the dash (<em>-</em>) is given, unknown reference
 bases generated at gVCF block splits will be substituted with N&#8217;s.
 The <strong>--gvcf</strong> option uses the following default INFO rules:
-<strong>-i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max</strong>.</p>
+<strong>-i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max</strong> and the following missing
+rules: <strong>-M PL:max,AD:0</strong>.</p>
 </dd>
 <dt class="hdlist1"><strong>-i, --info-rules</strong> <em>-</em>|<em>TAG:METHOD</em>[,&#8230;&#8203;]</dt>
 <dd>
@@ -2543,6 +2644,16 @@ <h3 id="merge">bcftools merge [<em>OPTIONS</em>] <em>A.vcf.gz</em> <em>B.vcf.gz<
 </div>
 <div class="dlist">
 <dl>
+<dt class="hdlist1"><strong>-M, --missing-rules</strong> <em>-</em>|<em>TAG:METHOD</em>[,&#8230;&#8203;]</dt>
+<dd>
+<p>Rules for merging vector tags at multiallelic sites. When input files have different alternate
+alleles, vector fields pertaining to unobserved alleles are set to missing (<em>.</em>) by default.
+The <em>METHOD</em> is one of <em>.</em> (the default, use missing values), <em>NUMBER</em> (use a constant value, e.g. 0),
+<em>max</em> (the maximum value observed for other alleles in the sample). When <strong>--gvcf</strong> option is set,
+the rule <strong>-M PL:max,AD:0</strong> is implied. This can be overridden by providing <strong>-M -</strong> or <strong>-M PL:.,AD:.</strong>.
+Note that if the unobserved allele is explicitly present as <em>&lt;*&gt;</em> or <em>&lt;NON_REF&gt;</em>, then its corresponding
+value will be used regardless of <strong>-M</strong> settings.</p>
+</dd>
 <dt class="hdlist1"><strong>--no-index</strong></dt>
 <dd>
 <p>the option allows to merge files without indexing them first. In order for this
@@ -2577,6 +2688,10 @@ <h3 id="merge">bcftools merge [<em>OPTIONS</em>] <em>A.vcf.gz</em> <em>B.vcf.gz<
 <dd>
 <p>see <strong><a href="#common_options">Common Options</a></strong></p>
 </dd>
+<dt class="hdlist1"><strong>--write-index</strong></dt>
+<dd>
+<p>Automatically index the output file</p>
+</dd>
 </dl>
 </div>
 </div>
@@ -2889,6 +3004,10 @@ <h4 id="_output_options">Output options</h4>
 </div>
 </div>
 </dd>
+<dt class="hdlist1"><strong>--write-index</strong></dt>
+<dd>
+<p>Automatically index the output file</p>
+</dd>
 </dl>
 </div>
 </div>
@@ -3099,6 +3218,12 @@ <h3 id="norm">bcftools norm [<em>OPTIONS</em>] <em>file.vcf.gz</em></h3>
 <p>try to proceed with <strong>-m-</strong> even if malformed tags with incorrect number of fields
 are encountered, discarding such tags. (Experimental, use at your own risk.)</p>
 </dd>
+<dt class="hdlist1"><strong>-g, --gff-annot</strong> <em>FILE</em></dt>
+<dd>
+<p>when a GFF file is provided, follow HGVS 3&#8217; rule and right-align variants in transcripts on the forward
+strand.  In case of overlapping transcripts, the default mode is to left-align the variant. For a
+description of the supported GFF3 file format see <strong><a href="#csq">bcftools csq</a></strong>.</p>
+</dd>
 <dt class="hdlist1"><strong>--keep-sum</strong> <em>TAG</em>[,&#8230;&#8203;]</dt>
 <dd>
 <p>keep vector sum constant when splitting multiallelic sites. Only AD tag
@@ -3179,6 +3304,10 @@ <h3 id="norm">bcftools norm [<em>OPTIONS</em>] <em>file.vcf.gz</em></h3>
 <p>maximum distance between two records to consider when locally
 sorting variants which changed position during the realignment</p>
 </dd>
+<dt class="hdlist1"><strong>--write-index</strong></dt>
+<dd>
+<p>Automatically index the output file</p>
+</dd>
 </dl>
 </div>
 </div>
@@ -3254,6 +3383,10 @@ <h4 id="_vcf_output_options_2">VCF output options:</h4>
 <dd>
 <p>see <strong><a href="#common_options">Common Options</a></strong></p>
 </dd>
+<dt class="hdlist1"><strong>--write-index</strong></dt>
+<dd>
+<p>Automatically index the output file</p>
+</dd>
 </dl>
 </div>
 </div>
@@ -4134,7 +4267,12 @@ <h3 id="sort">bcftools sort [<em>OPTIONS</em>] file.bcf</h3>
 </dd>
 <dt class="hdlist1"><strong>-T, --temp-dir</strong> <em>DIR</em></dt>
 <dd>
-<p>Use this directory to store temporary files</p>
+<p>Use this directory to store temporary files. If the last six characters of the string DIR are XXXXXX,
+then these are replaced with a string that makes the directory name unique.</p>
+</dd>
+<dt class="hdlist1"><strong>--write-index</strong></dt>
+<dd>
+<p>Automatically index the output file</p>
 </dd>
 </dl>
 </div>
@@ -4339,6 +4477,10 @@ <h4 id="_output_options_2">Output options</h4>
 <dd>
 <p>see <strong><a href="#common_options">Common Options</a></strong></p>
 </dd>
+<dt class="hdlist1"><strong>--write-index</strong></dt>
+<dd>
+<p>Automatically index the output file</p>
+</dd>
 </dl>
 </div>
 </div>
@@ -4538,7 +4680,7 @@ <h3 id="version-only">bcftools [<em>--version-only</em>]</h3>
 </div>
 </div>
 <div class="sect1">
-<h2 id="expressions">EXPRESSIONS</h2>
+<h2 id="expressions">FILTERING EXPRESSIONS</h2>
 <div class="sectionbody">
 <div class="paragraph">
 <p>These filtering expressions are accepted by most of the commands.</p>
@@ -4974,9 +5116,24 @@ <h2 id="expressions">EXPRESSIONS</h2>
 </div>
 </div>
 <div class="sect1">
-<h2 id="_scripts_and_options">SCRIPTS AND OPTIONS</h2>
+<h2 id="_scripts">SCRIPTS</h2>
 <div class="sectionbody">
 <div class="sect2">
+<h3 id="gff2gff">gff2gff</h3>
+<div class="paragraph">
+<p>Attempts to fix a GFF file to be correctly parsed by <strong><a href="#csq">csq</a></strong>.</p>
+</div>
+<div class="openblock">
+<div class="content">
+<div class="literalblock">
+<div class="content">
+<pre>zcat in.gff.gz | gff2gff | gzip -c &gt; out.gff.gz</pre>
+</div>
+</div>
+</div>
+</div>
+</div>
+<div class="sect2">
 <h3 id="plot-vcfstats">plot-vcfstats [<em>OPTIONS</em>] <em>file.vchk</em> [&#8230;&#8203;]</h3>
 <div class="paragraph">
 <p>Script for processing output of <strong><a href="#stats">bcftools stats</a></strong>. It can merge
@@ -5077,8 +5234,9 @@ <h2 id="_authors">AUTHORS</h2>
 <div class="paragraph">
 <p>Heng Li from the Sanger Institute wrote the original C version of htslib,
 samtools and bcftools. Bob Handsaker from the Broad Institute implemented the
-BGZF library. Petr Danecek, Shane McCarthy and John Marshall are  maintaining
-and further developing bcftools.  Many other people contributed to the program
+BGZF library. Petr Danecek is maintaining and further developing bcftools, together
+with the rest of the <a href="https://www.sanger.ac.uk/tool/samtools-bcftools-htslib">samtools team</a>.
+Many other people contributed to the program
 and to the file format specifications, both directly and indirectly by
 providing patches, testing and reporting bugs. We thank them all.</p>
 </div>
@@ -5119,7 +5277,7 @@ <h2 id="_copying">COPYING</h2>
 </div>
 <div id="footer">
 <div id="footer-text">
-Last updated 2023-02-21 12:25:09 UTC
+Last updated 2023-07-25 09:17:14 +0100
 </div>
 </div>
 </body>
diff --git a/doc/bcftools.txt b/doc/bcftools.txt
index b1a5f07c4..b2dcaf6c2 100644
--- a/doc/bcftools.txt
+++ b/doc/bcftools.txt
@@ -52,7 +52,7 @@ commands can thus be  combined  with  Unix pipes.
 This manual page was last updated *{date}* and refers to bcftools git version *{version}*.
 
 === BCF1
-The BCF1 format output by versions of samtools \<= 0.1.19 is *not*
+The obsolete BCF1 format output by versions of samtools \<= 0.1.19 is *not*
 compatible with this version of bcftools. To read BCF1 files one can use
 the view command from old versions of bcftools packaged with samtools
 versions \<= 0.1.19 to convert to VCF, which can then be read by
@@ -71,6 +71,10 @@ calling model ('-m/--multiallelic-caller'). The multiallelic calling model
 is recommended for most tasks.
 
 
+=== FILTERING EXPRESSIONS
+See *<<expressions,EXPRESSIONS>>*
+
+
 LIST OF COMMANDS
 ----------------
 For a full list of available commands, run *bcftools* without arguments. For a full
@@ -105,6 +109,7 @@ LIST OF SCRIPTS
 ---------------
 Some helper scripts are bundled with the bcftools code.
 
+- *<<gff2gff,gff2gff>>*  .. converts a GFF file to the format required by *<<csq,csq>>*
 - *<<plot-vcfstats,plot-vcfstats>>*  .. plots the output of *<<stats,stats>>*
 
 
@@ -298,6 +303,9 @@ Such a file can be easily created from a VCF using:
     Use multithreading with 'INT' worker threads. The option is currently used only for the compression of the
     output stream, only when '--output-type' is 'b' or 'z'. Default: 0.
 
+*--write-index*::
+    Automatically index the output files. Can be used only for compressed BCF and VCF output.
+
 
 [[annotate]]
 === bcftools annotate '[OPTIONS]' 'FILE'
@@ -501,6 +509,9 @@ Add or remove annotations.
     "^INFO/FOO,INFO/BAR" (and similarly for FORMAT and FILTER).
     "INFO" can be abbreviated to "INF" and "FORMAT" to "FMT".
 
+*--write-index*::
+    Automatically index the output file
+
 *Examples:*
 ----
     # Remove three fields
@@ -604,6 +615,9 @@ demand. The original calling model can be invoked with the *-c* option.
 *--threads* 'INT'::
     see *<<common_options,Common Options>>*
 
+*--write-index*::
+    Automatically index the output file
+
 ==== Input/output options:
 
 *-A, --keep-alts*::
@@ -878,6 +892,9 @@ are concatenated without being recompressed, which is very fast..
 *--threads* 'INT'::
     see *<<common_options,Common Options>>*
 
+*--write-index*::
+    Automatically index the output file
+
 
 [[consensus]]
 === bcftools consensus '[OPTIONS]' 'FILE'
@@ -902,14 +919,11 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the
 *-f, --fasta-ref* 'FILE'::
     reference sequence in fasta format
 
-*-H, --haplotype* '1'|'2'|'R'|'A'|'I'|'LR'|'LA'|'SR'|'SA'|'1pIu'|'2pIu'::
+*-H, --haplotype* N|'R'|'A'|'I'|'LR'|'LA'|'SR'|'SA'|'NpIu'::
     choose which allele from the FORMAT/GT field to use (the codes are case-insensitive):
 
-        '1';;
-            the first allele, regardless of phasing
-
-        '2';;
-            the second allele, regardless of phasing
+        'N';;
+            N={1,2,3,...}, the allele index within the genotype, regardless of phasing
 
         'R';;
             the REF allele (in heterozygous genotypes)
@@ -926,10 +940,11 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the
         'SR, SA';;
             the shorter allele. If both have the same length, use the REF allele (SR), or the ALT allele  (SA)
 
-        '1pIu, 2pIu';;
-            first/second allele for phased genotypes and IUPAC code for unphased genotypes
+        'NpIu';;
+            N={1,2,3,...}, the allele index within genotype for phased genotypes and IUPAC code for unphased genotypes.
+            For example, '1pIu' or '2pIu'
 
-    This option requires *-s*, unless exactly one sample is present in the VCF
+Note that the *-H, --haplotype* option requires the *-s, --samples* option, unless exactly one sample is present in the VCF
 
 *-i, --include* 'EXPRESSION'::
     include only sites for which 'EXPRESSION' is true. For valid expressions see
@@ -937,18 +952,18 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the
 
 *-I, --iupac-codes*::
     output variants in the form of IUPAC ambiguity codes determined from FORMAT/GT fields. By default all
-    samples are used and can be subset with `-s, --samples` and `-S, --samples-file`. Use `-s -` to ignore
+    samples are used and can be subset with *-s, --samples* and *-S, --samples-file*. Use *-s -* to ignore
     samples and use only the REF and ALT columns.  NOTE: prior to version 1.17 the IUPAC codes were determined solely
     from REF,ALT columns and sample genotypes were not considered.
 
 *--mark-del* 'CHAR'::
-    instead of removing sequence, insert CHAR for deletions
+    instead of removing sequence, insert character CHAR for deletions
 
-*--mark-ins* 'uc'|'lc'::
-    highlight inserted sequence in uppercase (uc) or lowercase (lc), leaving the rest of the sequence as is
+*--mark-ins* 'uc'|'lc'|'CHAR'::
+    highlight inserted sequence in uppercase (uc), lowercase (lc), or a provided character CHAR, leaving the rest of the sequence as is
 
 *--mark-snv* 'uc'|'lc'::
-    highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest of the sequence as is
+    highlight substitutions in uppercase (uc), lowercase (lc), or a provided character CHAR, leaving the rest of the sequence as is
 
 *-m, --mask* 'FILE'::
     BED file or TAB file with regions to be replaced with N (the default) or as specified by
@@ -966,10 +981,10 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the
     write output to a file
 
 *-s, --samples* 'LIST'::
-    apply variants of the listed samples. See also the option `-I, --iupac-codes`
+    apply variants of the listed samples. See also the option *-I, --iupac-codes*
 
 *-S, --samples-file* 'FILE'::
-    apply variants of the samples listed in the file. See also the option `-I, --iupac-codes`
+    apply variants of the samples listed in the file. See also the option *-I, --iupac-codes*
 
 *Examples:*
 ----
@@ -983,6 +998,14 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the
     # For more examples see http://samtools.github.io/bcftools/howtos/consensus-sequence.html
 ----
 
+*Notes:*::
+    Masking options are applied in the following order
+    1. mask regions with *--mask-with* character if *--mask* is given. All overlapping VCF variants are ignored
+    2. replace sequence not mentioned in the VCF with the requested character if *--absent* is given
+    3. finally apply *--mark-del*, *--mark-ins*, *--mark-snv* masks
+
+
+
 
 [[convert]]
 === bcftools convert '[OPTIONS]' 'FILE'
@@ -1021,6 +1044,9 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the
 *--targets-overlap* '0'|'1'|'2'::
     see *<<common_options,Common Options>>*
 
+*--write-index*::
+    Automatically index the output file
+
 ==== VCF output options:
 
 *--no-version*::
@@ -1210,13 +1236,13 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the
 
 *-c, --columns* 'list'::
     comma-separated list of fields in the input file. In the current
-    version, the fields CHROM, POS, ID, and AA are expected and
-    can appear in arbitrary order, columns which should be ignored in the input
+    version, the fields CHROM, POS, ID, and AA or REF, ALT are expected and
+    can appear in arbitrary order. Columns which should be ignored in the input
     file can be indicated by "-".
     The AA field lists alleles on the forward reference strand,
     for example "CC" or "CT" for diploid genotypes or "C"
     for haploid genotypes (sex chromosomes). Insertions and deletions
-    are not supported yet, missing data can be indicated with "--".
+    are supported only with REF and ALT but not with AA. Missing data can be indicated with "--" or ".".
 
 *-f, --fasta-ref* 'file'::
     reference sequence in fasta format. Must be indexed with samtools faidx
@@ -1230,7 +1256,10 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the
 *Example:*
 ----
 # Convert 23andme results into VCF
-bcftools convert -c ID,CHROM,POS,AA -s SampleName -f 23andme-ref.fa --tsv2vcf 23andme.txt -Oz -o out.vcf.gz
+bcftools convert -c ID,CHROM,POS,AA -s SampleName -f 23andme-ref.fa --tsv2vcf 23andme.txt -o out.vcf.gz
+
+# Convert tab-delimited file into a sites-only VCF (no genotypes), in this example first column to be ignored
+bcftools convert -c -,CHROM,POS,REF,ALT -f ref.fa --tsv2vcf calls.txt -o out.bcf
 ----
 
 
@@ -1277,6 +1306,10 @@ output VCF and are ignored for the prediction analysis.
     aminoacids, with *-B 1* only an abbreviated version such as '25E..329>25G..94' will be
     written.
 
+*--dump-gff* 'FILE'::
+    dump the parsed GFF into a gzipped FILE. Intended for debugging purposes,
+    shows how the input GFF is viewed by the program.
+
 *-e, --exclude* 'EXPRESSION'::
     exclude sites for which 'EXPRESSION' is true. For valid expressions see
     *<<expressions,EXPRESSIONS>>*.
@@ -1290,6 +1323,7 @@ output VCF and are ignored for the prediction analysis.
 
 *-g, --gff-annot* 'FILE'::
     GFF3 annotation file (required), such as ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens.
+    The script *<<gff2gff,gff2gff>>* can help with conversion from non-standard GFF formats.
     An example of a minimal working GFF file:
 ----
     # The program looks for "CDS", "exon", "three_prime_UTR" and "five_prime_UTR" lines,
@@ -1297,6 +1331,17 @@ output VCF and are ignored for the prediction analysis.
     # the gene (determined from the transcript's "Parent=gene:" attribute), and the biotype
     # (the most interesting is "protein_coding").
     #
+    # Empty and commented lines are skipped, the following GFF columns are required
+    #   1.  chromosome
+    #   2.  IGNORED
+    #   3.  type (CDS, exon, three_prime_UTR, five_prime_UTR, gene, transcript, etc.)
+    #   4.  start of the feature (1-based)
+    #   5.  end of the feature (1-based)
+    #   6.  IGNORED
+    #   7.  strand (+ or -)
+    #   8.  phase (0, 1, 2 or .)
+    #   9.  semicolon-separated attributes (see below)
+    #
     # Attributes required for
     #   gene lines:
     #   - ID=gene:<gene_id>
@@ -1392,6 +1437,14 @@ output VCF and are ignored for the prediction analysis.
 *--targets-overlap* '0'|'1'|'2'::
     see *<<common_options,Common Options>>*
 
+*--unify-chr-names* '0'|'1'::
+    Automatically detect and unify chromosome naming conventions in the GFF, fasta
+    and VCF, such as "chrX" vs "X". The chromosome names in the output VCF will match
+    those of the input VCF. The default is to attempt the automatic translation.
+
+*--write-index*::
+    Automatically index the output file
+
 *Examples:*
 ----
     # Basic usage
@@ -1559,6 +1612,9 @@ And similarly here, the second is filtered:
 *--threads* 'INT'::
     see *<<common_options,Common Options>>*
 
+*--write-index*::
+    Automatically index the output file
+
 
 
 [[gtcheck]]
@@ -1568,6 +1624,10 @@ option is given, the identity of samples from 'query.vcf.gz'
 is checked against the samples in the *-g* file.
 Without the *-g* option, multi-sample cross-check of samples in 'query.vcf.gz' is performed.
 
+Note that the interpretation of the discordance score depends on the options provided (specifically *-e* and
+*-u*) and on the available annotations (FORMAT/PL vs FORMAT/GT).
+The discordance score can be interpreted as the number of mismatching genotypes if only GT-vs-GT matching is performed.
+
 *--distinctive-sites* 'NUM[,MEM[,DIR]]'::
     Find sites that can distinguish between at least NUM sample pairs. If the number is smaller or equal to 1,
     it is interpreted as the fraction of pairs.  The optional MEM string sets the maximum memory used for
@@ -1581,11 +1641,14 @@ Without the *-g* option, multi-sample cross-check of samples in 'query.vcf.gz' i
     Interpret genotypes and genotype likelihoods probabilistically. The value of 'INT'
     represents genotype quality when GT tag is used (e.g. Q=30 represents one error in 1,000 genotypes and
     Q=40 one error in 10,000 genotypes) and is ignored when PL tag is used (in that case an arbitrary
-    non-zero integer can be provided). See also the *-u, --use* option below. If set to 0,
-    the discordance equals to the number of mismatching genotypes when GT vs GT is compared.
-    Note that the values with and without *-e* are not comparable, only values generated
-    with *-e 0* correspond to mismatching genotypes.
-    If performance is an issue, set to 0 for faster run but less accurate results.
+    non-zero integer can be provided).
+    {nbsp} +
+    {nbsp} +
+    If *-e* is set to 0, the discordance score can be interpreted as the number of mismatching genotypes,
+    but only in the GT-vs-GT matching mode. See the *-u, --use* option below for additional notes and caveats.
+    {nbsp} +
+    {nbsp} +
+    If performance is an issue, set *-e 0* for faster run times but less accurate results.
 
 *-g, --genotypes* 'FILE'::
     VCF/BCF file with reference genotypes to compare against
@@ -1640,8 +1703,13 @@ Without the *-g* option, multi-sample cross-check of samples in 'query.vcf.gz' i
 
 *-u, --use* 'TAG1'[,'TAG2']::
     specifies which tag to use in the query file ('TAG1') and the *-g* ('TAG2') file.
-    By default, the PL tag is used in the query file and GT in the *-g* file when
-    available.
+    By default, the PL tag is used in the query file and, when available, the GT tags in the
+    *-g* file.
+    {nbsp} +
+    {nbsp} +
+    Note that when the requested tag is not available, the program will attempt to use
+    the other tag. The output includes the number of sites that were matched by the four
+    possible modes (for example GT-vs-GT or GT-vs-PL).
 
 *Examples:*
 ----
@@ -1815,6 +1883,9 @@ in the other files.
     list of input files to output given as 1-based indices. With *-p* and no
     *-w*, all files are written.
 
+*--write-index*::
+    Automatically index the output file. This is done automatically with the *-p* option.
+
 ==== Examples:
 
 Create intersection and complements of two sets saving the output in dir/*
@@ -1889,7 +1960,8 @@ For "vertical" merge take a look at *<<concat,bcftools concat>>* or *<<norm,bcft
     file 'FILE' is not given and the dash ('-') is given, unknown reference
     bases generated at gVCF block splits will be substituted with N's.
     The *--gvcf* option uses the following default INFO rules:
-    *-i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max*.
+    *-i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max* and the following missing
+    rules: *-M PL:max,AD:0*.
 
 *-i, --info-rules* '-'|'TAG:METHOD'[,...]::
     Rules for merging INFO fields (scalars or vectors) or '-' to disable the
@@ -1925,6 +1997,15 @@ For "vertical" merge take a look at *<<concat,bcftools concat>>* or *<<norm,bcft
 -m id          ..  merge by ID
 ----
 
+*-M, --missing-rules* '-'|'TAG:METHOD'[,...]::
+    Rules for merging vector tags at multiallelic sites. When input files have different alternate
+    alleles, vector fields pertaining to unobserved alleles are set to missing ('.') by default.
+    The 'METHOD' is one of '.' (the default, use missing values), 'NUMBER' (use a constant value, e.g. 0),
+    'max' (the maximum value observed for other alleles in the sample). When *--gvcf* option is set,
+    the rule *-M PL:max,AD:0* is implied. This can be overridden by providing *-M -* or *-M PL:.,AD:.*.
+    Note that if the unobserved allele is explicitly present as '<*>' or '<NON_REF>', then its corresponding
+    value will be used regardless of *-M* settings.
+
 *--no-index*::
     the option allows to merge files without indexing them first. In order for this
     option to work, the user must ensure that the input files have chromosomes in
@@ -1951,6 +2032,8 @@ For "vertical" merge take a look at *<<concat,bcftools concat>>* or *<<norm,bcft
 *--threads* 'INT'::
     see *<<common_options,Common Options>>*
 
+*--write-index*::
+    Automatically index the output file
 
 [[mpileup]]
 === bcftools mpileup ['OPTIONS'] *-f* 'ref.fa' 'in.bam' ['in2.bam' [...]]
@@ -2199,6 +2282,9 @@ INFO/DPR    .. Deprecated in favor of INFO/AD; Number of high-quality bases for
     used by the earlier Bcftools releases.  For excample BQBZ becomes
     BQB.
 
+*--write-index*::
+    Automatically index the output file
+
 ==== Options for SNP/INDEL genotype likelihood computation
 
 *-X, --config* 'STR'::
@@ -2365,6 +2451,11 @@ the *<<fasta_ref,--fasta-ref>>* option is supplied.
     try to proceed with *-m-* even if malformed tags with incorrect number of fields
     are encountered, discarding such tags. (Experimental, use at your own risk.)
 
+*-g, --gff-annot* 'FILE'::
+    when a GFF file is provided, follow the HGVS 3' rule and right-align variants in transcripts on the forward
+    strand.  In case of overlapping transcripts, the default mode is to left-align the variant. For a
+    description of the supported GFF3 file format see *<<csq,bcftools csq>>*.
+
 *--keep-sum* 'TAG'[,...]::
     keep vector sum constant when splitting multiallelic sites. Only AD tag
     is currently supported. See also https://github.com/samtools/bcftools/issues/360
@@ -2428,6 +2519,8 @@ the *<<fasta_ref,--fasta-ref>>* option is supplied.
     maximum distance between two records to consider when locally
     sorting variants which changed position during the realignment
 
+*--write-index*::
+    Automatically index the output file
 
 [[plugin]]
 
@@ -2485,6 +2578,9 @@ the usage examples that each plugin comes with.
 *--threads* 'INT'::
     see *<<common_options,Common Options>>*
 
+*--write-index*::
+    Automatically index the output file
+
 ==== Plugin options:
 
 *-h, --help*::
@@ -3103,7 +3199,11 @@ Transition probabilities:
     see *<<common_options,Common Options>>*
 
 *-T, --temp-dir* 'DIR'::
-    Use this directory to store temporary files
+    Use this directory to store temporary files. If the last six characters of the string DIR are XXXXXX,
+    then these are replaced with a string that makes the directory name unique.
+
+*--write-index*::
+    Automatically index the output file
 
 
 
@@ -3252,6 +3352,9 @@ Convert between VCF and BCF. Former *bcftools subset*.
 *--threads* 'INT'::
     see *<<common_options,Common Options>>*
 
+*--write-index*::
+    Automatically index the output file
+
 
 ==== Subset options:
 *-a, --trim-alt-alleles*::
@@ -3403,8 +3506,8 @@ Display the full bcftools version number in a machine-readable format.
 
 
 [[expressions]]
-EXPRESSIONS
------------
+FILTERING EXPRESSIONS
+---------------------
 
 These filtering expressions are accepted by most of the commands.
 
@@ -3662,8 +3765,17 @@ that the whole expression is passed to the program as intended:
 Please refer to the documentation of your shell for details.
 
 
-SCRIPTS AND OPTIONS
--------------------
+SCRIPTS
+-------
+
+[[gff2gff]]
+=== gff2gff
+Attempts to fix a GFF file to be correctly parsed by *<<csq,csq>>*.
+
+--
+    zcat in.gff.gz | gff2gff | gzip -c > out.gff.gz
+--
+
 
 [[plot-vcfstats]]
 === plot-vcfstats ['OPTIONS'] 'file.vchk' [...]
@@ -3729,8 +3841,9 @@ AUTHORS
 -------
 Heng Li from the Sanger Institute wrote the original C version of htslib,
 samtools and bcftools. Bob Handsaker from the Broad Institute implemented the
-BGZF library. Petr Danecek, Shane McCarthy and John Marshall are  maintaining
-and further developing bcftools.  Many other people contributed to the program
+BGZF library. Petr Danecek is maintaining and further developing bcftools, together
+with the rest of the https://www.sanger.ac.uk/tool/samtools-bcftools-htslib[samtools team].
+Many other people contributed to the program
 and to the file format specifications, both directly and indirectly by
 providing patches, testing and reporting bugs. We thank them all.
 
diff --git a/filter.c b/filter.c
index 3925475b7..b6547f81f 100644
--- a/filter.c
+++ b/filter.c
@@ -109,8 +109,8 @@ struct _filter_t
 #if ENABLE_PERL_FILTERS
     PerlInterpreter *perl;
 #endif
-    char **undef_tag;
-    int nundef_tag;
+    char **undef_tag, **used_tag;
+    int nundef_tag, nused_tag;
     int status, exit_on_error;
 };
 
@@ -328,6 +328,32 @@ const char **filter_list_undef_tags(filter_t *filter, int *ntags)
     *ntags = filter->nundef_tag;
     return (const char**)filter->undef_tag;
 }
+static void filter_add_used_tag(filter_t *filter, const char *prefix, char *str)
+{
+    int i;
+    kstring_t tmp = {0,0,0};
+    if ( prefix ) kputs(prefix,&tmp);
+    kputs(str,&tmp);
+    for (i=0; i<filter->nused_tag; i++)
+        if ( !strcmp(tmp.s,filter->used_tag[i]) ) break;
+    if ( i<filter->nused_tag )
+    {
+        free(tmp.s);
+        return;
+    }
+
+    filter->nused_tag++;
+    filter->used_tag = (char**)realloc(filter->used_tag,sizeof(*filter->used_tag)*filter->nused_tag);
+    if ( !filter->used_tag ) error("Could not allocate memory\n");
+    filter->used_tag[filter->nused_tag-1] = tmp.s;
+    if ( !filter->used_tag[filter->nused_tag-1] ) error("Could not allocate memory\n");
+}
+const char **filter_list_used_tags(filter_t *filter, int *ntags)
+{
+    *ntags = filter->nused_tag;
+    return (const char**)filter->used_tag;
+}
+
 
 
 /*
@@ -2841,6 +2867,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         {
             tok->setter = filters_set_qual;
             tok->tag = strdup("QUAL");
+            filter_add_used_tag(filter,NULL,tok->tag);
             return 0;
         }
         else if ( !strncasecmp(str,"TYPE",len) || !strncmp(str,"%TYPE",len) /* for backward compatibility */ )
@@ -2855,24 +2882,28 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
             tok->tag = strdup("FILTER");
             filter->max_unpack |= BCF_UN_FLT;
             tok->tag_type = BCF_HL_FLT;
+            filter_add_used_tag(filter,NULL,tok->tag);
             return 0;
         }
         else if ( !strncasecmp(str,"ID",len) || !strncasecmp(str,"%ID",len) /* for backward compatibility */ )
         {
             tok->comparator = filters_cmp_id;
             tok->tag = strdup("ID");
+            filter_add_used_tag(filter,NULL,tok->tag);
             return 0;
         }
         else if ( !strncasecmp(str,"CHROM",len) )
         {
             tok->setter = &filters_set_chrom;
             tok->tag = strdup("CHROM");
+            filter_add_used_tag(filter,NULL,tok->tag);
             return 0;
         }
         else if ( !strncasecmp(str,"POS",len) )
         {
             tok->setter = &filters_set_pos;
             tok->tag = strdup("POS");
+            filter_add_used_tag(filter,NULL,tok->tag);
             return 0;
         }
         else if ( !strncasecmp(str,"REF",len) )
@@ -2880,6 +2911,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
             tok->setter = &filters_set_ref_string;
             tok->is_str = 1;
             tok->tag = strdup("REF");
+            filter_add_used_tag(filter,NULL,tok->tag);
             return 0;
         }
         else if ( !strncasecmp(str,"ALT",len) )
@@ -2891,6 +2923,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
             tok->idxs[0] = -1;
             tok->nidxs   = 1;
             tok->idx     = -2;
+            filter_add_used_tag(filter,NULL,tok->tag);
             return 0;
         }
         else if ( !strncasecmp(str,"N_ALT",len) )
@@ -3018,6 +3051,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         }
         tok->tag = strdup(tmp.s);
         if ( tmp.s ) free(tmp.s);
+        filter_add_used_tag(filter,is_fmt ? "FORMAT/" : "INFO/",tok->tag);
         return 0;
     }
     else if ( !strcasecmp(tmp.s,"ALT") )
@@ -3026,6 +3060,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         tok->is_str = 1;
         tok->tag = strdup(tmp.s);
         free(tmp.s);
+        filter_add_used_tag(filter,NULL,tok->tag);
         return 0;
     }
     else if ( !strcasecmp(tmp.s,"AN") )
@@ -3669,7 +3704,9 @@ void filter_destroy(filter_t *filter)
         }
     }
     for (i=0; i<filter->nundef_tag; i++) free(filter->undef_tag[i]);
+    for (i=0; i<filter->nused_tag; i++) free(filter->used_tag[i]);
     free(filter->undef_tag);
+    free(filter->used_tag);
     free(filter->cached_GT.buf);
     free(filter->cached_GT.mask);
     free(filter->filters);
diff --git a/filter.h b/filter.h
index 7be842a3a..cc60d6b96 100644
--- a/filter.h
+++ b/filter.h
@@ -79,5 +79,6 @@ filter_t *filter_parse(bcf_hdr_t *hdr, const char *str);
   */
 int filter_status(filter_t *filter);
 const char **filter_list_undef_tags(filter_t *filter, int *nundef);
+const char **filter_list_used_tags(filter_t *filter, int *nused);
 
 #endif
diff --git a/gff.c b/gff.c
new file mode 100644
index 000000000..90da84ba9
--- /dev/null
+++ b/gff.c
@@ -0,0 +1,1098 @@
+/* The MIT License
+
+   Copyright (c) 2023 Genome Research Ltd.
+
+   Author: Petr Danecek <pd3@sanger.ac.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "gff.h"
+
+/*
+    Helper structures, only for initialization
+
+    ftr_t
+        temporary list of all exons, CDS, UTRs
+*/
+KHASH_MAP_INIT_INT(int2tscript, gf_tscript_t*)
+KHASH_MAP_INIT_INT(int2gene, gf_gene_t*)
+typedef struct
+{
+    int type;           // GF_CDS, GF_EXON, GF_5UTR, GF_3UTR
+    uint32_t beg;
+    uint32_t end;
+    uint32_t trid;
+    uint32_t strand:1;  // STRAND_REV,STRAND_FWD
+    uint32_t phase:2;   // 0, 1, 2, or 3 for unknown
+    uint32_t iseq:29;
+}
+ftr_t;
+
+/*
+    Mapping from GFF ID string (such as ENST00000450305 or Zm00001d027230_P001)
+    to integer id.  To keep the memory requirements low, the original version
+    relied on IDs in the form of a string prefix and a numerical id.  However,
+    it turns out that this assumption is not valid for some ensembl GFFs, see
+    for example Zea_mays.AGPv4.36.gff3.gz
+ */
+typedef struct
+{
+    void *str2id;       // khash_str2int
+    int nstr, mstr;
+    char **str;         // numeric id to string
+}
+id_tbl_t;
+
+typedef struct
+{
+    // all exons, CDS, UTRs
+    ftr_t *ftr;
+    int nftr, mftr;
+
+    // mapping from gene id to gf_gene_t
+    kh_int2gene_t *gid2gene;
+
+    // mapping from transcript id to tscript, for quick CDS anchoring
+    kh_int2tscript_t *id2tr;
+
+    // sequences
+    void *seq2int;  // str2int hash
+    char **seq;
+    int nseq, mseq;
+
+    // ignored biotypes
+    void *ignored_biotypes;
+
+    id_tbl_t gene_ids;   // temporary table for mapping between gene id (eg. Zm00001d027245) and a numeric idx
+
+    // pointers to the current partially processed line
+    char *id, *id_end, *parent, *parent_end, *biotype, *biotype_end,
+         *chr, *chr_end, *name, *name_end, *type, *type_end;
+}
+aux_t;
+
+struct gff_t_
+{
+    const char *fname, *dump_fname;
+
+    // the main regidx lookups, from chr:beg-end to overlapping features and
+    // index iterator
+    regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript;
+
+    // temporary structures, deleted after initialization
+    aux_t init;
+
+    // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx
+    id_tbl_t tscript_ids;
+
+    int strip_chr_names, verbosity;
+    int force;      // force run under various conditions. Currently only to skip out-of-phase transcripts
+
+    struct {
+        int unknown_chr,unknown_tscript_biotype,unknown_strand,unknown_phase,duplicate_id;
+        int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds;
+    } warned;
+};
+
+static const char *gf_strings_noncoding[] =
+{
+    "MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript",
+    "antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping",
+    "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene",
+    "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene",
+    "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene",
+    "transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene",    "translated_unprocessed_pseudogene",
+    "translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene",
+    "LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", "vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf",
+    "lncRNA"
+};
+static const char *gf_strings_coding[] = { "protein_coding", "polymorphic_pseudogene", "IG_C", "IG_D", "IG_J", "IG_LV", "IG_V", "TR_C", "TR_D", "TR_J", "TR_V", "NMD", "non_stop_decay"};
+static const char *gf_strings_special[] = { "CDS", "exon", "3_prime_UTR", "5_prime_UTR" };
+
+int gff_set(gff_t *gff, gff_opt_t key, ...)
+{
+    va_list args;
+    switch (key)
+    {
+        case dump_fname:
+            va_start(args, key);
+            gff->dump_fname = va_arg(args,char*);
+            va_end(args);
+            return 0;
+
+        case force_out_of_phase:
+            va_start(args, key);
+            gff->force = va_arg(args,int);
+            va_end(args);
+            return 0;
+
+        case strip_chr_names:
+            va_start(args, key);
+            gff->strip_chr_names = va_arg(args,int);
+            va_end(args);
+            return 0;
+
+        case verbosity:
+            va_start(args, key);
+            gff->verbosity = va_arg(args,int);
+            va_end(args);
+            return 0;
+
+        default:
+            error("The key %d is not supported with gff_set\n",key);
+    }
+    return 0;
+}
+
+void *gff_get(gff_t *gff, gff_opt_t key)
+{
+    switch (key)
+    {
+        case idx_cds: return gff->idx_cds;
+        case idx_utr: return gff->idx_utr;
+        case idx_exon: return gff->idx_exon;
+        case idx_tscript: return gff->idx_tscript;
+        default:
+            error("The key %d is not supported with gff_get\n",key);
+    }
+    return NULL;
+}
+
+const char *gff_id2string(gff_t *gff, id_type_t type, int id)    // currently only transcript ids
+{
+    return gff->tscript_ids.str[id];
+}
+
+const char *gf_type2gff_string(int type)
+{
+    if ( !GF_is_coding(type) )
+    {
+        if ( type < (1<<GF_coding_bit) ) return gf_strings_noncoding[type-1];
+        type &= (1<<(GF_coding_bit+1)) - 1;
+        return gf_strings_special[type - 1];
+    }
+    type &= (1<<GF_coding_bit) - 1;
+    return gf_strings_coding[type - 1];
+}
+
+/*
+    gff parsing functions
+*/
+static inline int feature_set_seq(gff_t *gff, char *chr_beg, char *chr_end)
+{
+    aux_t *aux = &gff->init;
+    char tmp = chr_end[1];
+    chr_end[1] = 0;
+    int iseq;
+    if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 )
+    {
+        char *new_chr = strdup(chr_beg);
+        hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
+        aux->seq[aux->nseq] = new_chr;
+        iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]);
+        aux->nseq++;
+        assert( aux->nseq < 1<<29 );  // see gf_gene_t.iseq and ftr_t.iseq
+    }
+    chr_end[1] = tmp;
+    return iseq;
+}
+static inline char *gff_skip(const char *line, char *ss)
+{
+    while ( *ss && *ss!='\t' ) ss++;
+    if ( !*ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+    return ss+1;
+}
+static inline void gff_parse_chr(gff_t *gff, const char *line, char **chr_beg, char **chr_end)
+{
+    char *se = (char*) line;
+    while ( *se && *se!='\t' ) se++;
+    if ( !*se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+    if ( gff->strip_chr_names && !strncasecmp("chr",line,3) ) line += 3;
+    *chr_beg = (char*) line;
+    *chr_end = se-1;
+}
+static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg, uint32_t *end)
+{
+    char *se = ss;
+    *beg = strtol(ss, &se, 10) - 1;
+    if ( ss==se ) error("[%s:%d %s] Could not parse the line:\n\t%s\n\t%s\n",__FILE__,__LINE__,__FUNCTION__,line,ss);
+    ss = se+1;
+    *end = strtol(ss, &se, 10) - 1;
+    if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+    return se+1;
+}
+static void gff_id_init(id_tbl_t *tbl)
+{
+    memset(tbl, 0, sizeof(*tbl));
+    tbl->str2id = khash_str2int_init();
+}
+static void gff_id_destroy(id_tbl_t *tbl)
+{   // Release an id table: the hash is destroyed with khash_str2int_destroy_free(), then the id->string array is freed
+    khash_str2int_destroy_free(tbl->str2id);    // presumably also frees the key strings, i.e. the strdup'ed tbl->str[] entries - confirm in khash_str2int.h
+    free(tbl->str);
+}
+static inline int gff_id_register(id_tbl_t *tbl, char *beg, char *end, uint32_t *id_ptr)
+{   // Look up the string [beg,end] in tbl, adding it under the next free id when new; the id is stored in *id_ptr, the return value is always 0
+    int idx;
+    const char saved = end[1];
+    end[1] = '\0';                      // terminate in place for the hash lookup
+    if ( khash_str2int_get(tbl->str2id, beg, &idx) < 0 )
+    {
+        idx = tbl->nstr++;
+        hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str);
+        char *copy = tbl->str[idx] = strdup(beg);
+        khash_str2int_set(tbl->str2id, copy, idx);
+    }
+    end[1] = saved;                     // give the buffer back unchanged
+    *id_ptr = idx;
+    return 0;
+}
+static inline int gff_parse_biotype(char *line)
+{   // Map a biotype string to its GF_* code by prefix match: returns -1 for NULL, 0 when nothing matches. Longer names are tested before their prefixes.
+    if ( !line ) return -1;
+    switch (*line)
+    {
+        case 'p':
+            if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING;
+            else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE;
+            else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT;
+            else if ( !strncmp(line,"processed_pseudogene",20) ) return GF_PROCESSED_PSEUDOGENE;
+            else if ( !strncmp(line,"polymorphic_pseudogene",22) ) return GF_POLYMORPHIC_PSEUDOGENE;
+            break;
+        case 'a':
+            if ( !strncmp(line,"artifact",8) ) return GF_ARTIFACT;
+            else if ( !strncmp(line,"antisense",9) ) return GF_ANTISENSE;
+            else if ( !strncmp(line,"ambiguous_orf",13) ) return GF_AMBIGUOUS_ORF;
+            break;
+        case 'I':
+            if ( !strncmp(line,"IG_pseudogene",13) ) return GF_IG_PSEUDOGENE;
+            else if ( !strncmp(line,"IG_C_pseudogene",15) ) return GF_IG_C_PSEUDOGENE;
+            else if ( !strncmp(line,"IG_J_pseudogene",15) ) return GF_IG_J_PSEUDOGENE;
+            else if ( !strncmp(line,"IG_V_pseudogene",15) ) return GF_IG_V_PSEUDOGENE;
+            else if ( !strncmp(line,"IG_C",4) ) return GF_IG_C;
+            else if ( !strncmp(line,"IG_D",4) ) return GF_IG_D;
+            else if ( !strncmp(line,"IG_J",4) ) return GF_IG_J;
+            else if ( !strncmp(line,"IG_V",4) ) return GF_IG_V;
+            else if ( !strncmp(line,"IG_LV",5) ) return GF_IG_LV;
+            break;
+        case 'T':
+            if ( !strncmp(line,"TR_V_pseudogene",15) ) return GF_TR_V_PSEUDOGENE;
+            else if ( !strncmp(line,"TR_J_pseudogene",15) ) return GF_TR_J_PSEUDOGENE;
+            else if ( !strncmp(line,"TR_C",4) ) return GF_TR_C;
+            else if ( !strncmp(line,"TR_D",4) ) return GF_TR_D;
+            else if ( !strncmp(line,"TR_J",4) ) return GF_TR_J;
+            else if ( !strncmp(line,"TR_V",4) ) return GF_TR_V;
+            break;
+        case 'M':
+            if ( !strncmp(line,"Mt_tRNA_pseudogene",18) ) return GF_MT_tRNA_PSEUDOGENE;
+            else if ( !strncasecmp(line,"Mt_tRNA",7) ) return GF_MT_tRNA;
+            else if ( !strncasecmp(line,"Mt_rRNA",7) ) return GF_MT_rRNA;    // was GF_MT_tRNA, which mislabelled mitochondrial rRNA as tRNA
+            else if ( !strncasecmp(line,"MRNA",4) ) return GF_PROTEIN_CODING;
+            break;
+        case 'l':
+            if ( !strncmp(line,"lincRNA",7) ) return GF_lincRNA;
+            if ( !strncmp(line,"lncRNA",6) ) return GF_lncRNA;   // was n=7: that also compared the NUL, so "lncRNA;..." could never match
+            break;
+        case 'm':
+            if ( !strncmp(line,"macro_lncRNA",12) ) return GF_macro_lncRNA;
+            else if ( !strncmp(line,"misc_RNA_pseudogene",19) ) return GF_misc_RNA_PSEUDOGENE;
+            else if ( !strncmp(line,"miRNA_pseudogene",16) ) return GF_miRNA_PSEUDOGENE;
+            else if ( !strncmp(line,"miRNA",5) ) return GF_miRNA;
+            else if ( !strncmp(line,"misc_RNA",8) ) return GF_MISC_RNA;
+            else if ( !strncasecmp(line,"mRNA",4) ) return GF_PROTEIN_CODING;
+            break;
+        case 'r':
+            if ( !strncmp(line,"rRNA",4) ) return GF_rRNA;
+            else if ( !strncmp(line,"ribozyme",8) ) return GF_RIBOZYME;
+            else if ( !strncmp(line,"retained_intron",15) ) return GF_RETAINED_INTRON;
+            else if ( !strncmp(line,"retrotransposed",15) ) return GF_RETROTRANSPOSED;
+            break;
+        case 's':
+            if ( !strncmp(line,"snRNA",5) ) return GF_snRNA;
+            else if ( !strncmp(line,"sRNA",4) ) return GF_sRNA;
+            else if ( !strncmp(line,"scRNA",5) ) return GF_scRNA;
+            else if ( !strncmp(line,"scaRNA",6) ) return GF_scaRNA;
+            else if ( !strncmp(line,"snoRNA",6) ) return GF_snoRNA;
+            else if ( !strncmp(line,"sense_intronic",14) ) return GF_SENSE_INTRONIC;
+            else if ( !strncmp(line,"sense_overlapping",17) ) return GF_SENSE_OVERLAPPING;
+            break;
+        case 't':
+            if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE;
+            else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE;
+            else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE;
+            else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE;
+            else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE;
+            else if ( !strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE;
+            break;
+        case 'n':
+            if ( !strncmp(line,"nonsense_mediated_decay",23) ) return GF_NMD;
+            else if ( !strncmp(line,"non_stop_decay",14) ) return GF_NON_STOP_DECAY;
+            break;
+        case 'N':
+            if ( !strncmp(line,"NMD",3) ) return GF_NMD;
+            break;
+        case 'k':
+            if ( !strncmp(line,"known_ncrna",11) ) return GF_KNOWN_NCRNA;
+            break;
+        case 'u':
+            if ( !strncmp(line,"unitary_pseudogene",18) ) return GF_UNITARY_PSEUDOGENE;
+            else if ( !strncmp(line,"unprocessed_pseudogene",22) ) return GF_UNPROCESSED_PSEUDOGENE;
+            break;
+        case 'L':
+            if ( !strncmp(line,"LRG_gene",8) ) return GF_LRG_GENE;
+            break;
+        case '3':
+            if ( !strncasecmp(line,"3prime_overlapping_ncRNA",24) ) return GF_3PRIME_OVERLAPPING_ncRNA;
+            else if ( !strncasecmp(line,"3_prime_overlapping_ncRNA",25) ) return GF_3PRIME_OVERLAPPING_ncRNA;
+            break;
+        case 'd':
+            if ( !strncmp(line,"disrupted_domain",16) ) return GF_DISRUPTED_DOMAIN;
+            break;
+        case 'v':
+            if ( !strncmp(line,"vaultRNA",8) ) return GF_vaultRNA;
+            break;
+        case 'b':
+            if ( !strncmp(line,"bidirectional_promoter_lncRNA",29) ) return GF_BIDIRECTIONAL_PROMOTER_lncRNA;
+            break;
+    }
+    return 0;   // not a recognised biotype
+}
+static inline int gff_ignored_biotype(gff_t *gff, char *ss, char *se)
+{   // Count one more occurrence of the biotype string [ss,se] in init.ignored_biotypes; returns 0 when ss is NULL, otherwise 1
+    if ( ss==NULL ) return 0;
+    void *counts = gff->init.ignored_biotypes;
+    const char saved = se[1];
+    se[1] = '\0';
+
+    int seen = 0;
+    const int is_new = khash_str2int_get(counts, ss, &seen)!=0;
+    // a first-time key is inserted as a heap copy, a known key is only looked up through the line buffer
+    khash_str2int_set(counts, is_new ? strdup(ss) : ss, seen+1);
+
+    se[1] = saved;
+    return 1;
+}
+static gf_gene_t *gene_init(aux_t *aux, uint32_t gene_id)
+{   // Return the gene record stored under gene_id in aux->gid2gene, creating a zero-filled one when missing
+    khint_t it = kh_get(int2gene, aux->gid2gene, (int)gene_id);
+    if ( it != kh_end(aux->gid2gene) && kh_val(aux->gid2gene, it) )
+        return kh_val(aux->gid2gene, it);
+
+    // no usable entry yet: allocate a blank record and store it under the id
+    int absent;
+    gf_gene_t *rec = (gf_gene_t*) calloc(1,sizeof(gf_gene_t));
+    it = kh_put(int2gene, aux->gid2gene, (int)gene_id, &absent);
+    kh_val(aux->gid2gene,it) = rec;
+    return rec;
+}
+static void gff_parse_transcript(gff_t *gff, const char *line, ftr_t *ftr)
+{   // Register a transcript line: classify its biotype, then store a new gf_tscript_t under its id in init.id2tr
+    aux_t *aux = &gff->init;
+
+    // the biotype attribute decides the type, the feature-type column is the fallback
+    if ( (ftr->type = gff_parse_biotype(aux->biotype)) <= 0 )
+    {
+        char *stop = aux->type_end + 1;
+        const char saved = *stop;
+        *stop = '\0'; ftr->type = gff_parse_biotype(aux->type);
+        *stop = saved;
+    }
+    if ( ftr->type <= 0 )
+    {
+        const int counted = gff_ignored_biotype(gff,aux->biotype,aux->biotype_end);
+        if ( !counted && gff->verbosity > 0 )
+        {
+            // the first such transcript is always reported, later ones only at verbosity > 1
+            const int report = gff->verbosity > 1 || !gff->warned.unknown_tscript_biotype;
+            if ( report ) fprintf(stderr,"Warning: Ignoring transcript with unknown biotype .. %s\n", line);
+            gff->warned.unknown_tscript_biotype++;
+        }
+        // unknown biotype: no transcript record is created
+        return;
+    }
+
+    if ( aux->id==NULL )
+        error("[%s:%d %s] Could not parse the line, neither \"ID=transcript:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+    if ( aux->parent==NULL )
+        error("[%s:%d %s] Could not parse the line, neither \"Parent=gene:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+
+    uint32_t tscript_id, parent_id;
+    gff_id_register(&aux->gene_ids, aux->parent, aux->parent_end, &parent_id);
+    gff_id_register(&gff->tscript_ids, aux->id, aux->id_end, &tscript_id);
+
+    gf_tscript_t *rec = (gf_tscript_t*) calloc(1,sizeof(*rec));
+    rec->gene   = gene_init(aux, parent_id);
+    rec->id     = tscript_id;
+    rec->type   = ftr->type;
+    rec->strand = ftr->strand;
+    rec->beg    = ftr->beg;
+    rec->end    = ftr->end;
+
+    // file the record under its numeric transcript id
+    int absent;
+    khint_t slot = kh_put(int2tscript, aux->id2tr, (int)tscript_id, &absent);
+    kh_val(aux->id2tr,slot) = rec;
+}
+// register exon, CDS, UTR
+static void gff_parse_exon(gff_t *gff, const char *line, ftr_t *ftr)
+{   // Shared handler for exon/CDS/UTR lines: stores the parent transcript id and the sequence id in ftr
+    aux_t *aux = &gff->init;
+    if ( aux->parent==NULL )
+        error("[%s:%d %s] Could not parse the line, neither \"Parent=transcript:\" nor \"Parent=\" substring found: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+
+    // the Parent attribute names the transcript; its numeric id goes to ftr->trid
+    gff_id_register(&gff->tscript_ids, aux->parent, aux->parent_end, &ftr->trid);
+
+    const int verbose = gff->verbosity > 0, print_all = gff->verbosity > 1;   // each warning kind is printed once unless print_all
+    if ( verbose && ftr->strand==-1 )
+    {
+        if ( print_all || !gff->warned.unknown_strand ) fprintf(stderr,"Warning: Ignoring GFF feature with unknown strand .. %s\n",line);
+        gff->warned.unknown_strand++;
+    }
+    if ( verbose && ftr->phase==-1 )
+    {
+        if ( print_all || !gff->warned.unknown_phase ) fprintf(stderr,"Warning: Ignoring GFF feature with unknown phase .. %s\n",line);
+        gff->warned.unknown_phase++;
+    }
+    // the sequence id is resolved last, for every feature regardless of the warnings above
+    ftr->iseq = feature_set_seq(gff, aux->chr,aux->chr_end);
+}
+static void gff_parse_gene(gff_t *gff, const char *line, ftr_t *ftr)
+{   // Fill in the gene record for the current line: sequence id, coordinates, strand, numeric id and display name
+    aux_t *aux = &gff->init;
+    if ( !aux->id ) return;     // no ID= attribute on the line, nothing to register
+
+    uint32_t gene_id;
+    gff_id_register(&aux->gene_ids, aux->id, aux->id_end, &gene_id);
+
+    gf_gene_t *gene = gene_init(aux, gene_id);
+    if ( gene->name )           // a name is already set, so a line with this id has been processed before
+    {
+        if ( gff->verbosity > 0 && (!gff->warned.duplicate_id || gff->verbosity > 1) )   // silent at verbosity 0, like the other GFF warnings
+            fprintf(stderr,"Warning: The GFF contains features with duplicate id .. %s\n",line);
+        gff->warned.duplicate_id++;
+        return;
+    }
+
+    gene->iseq   = feature_set_seq(gff, aux->chr,aux->chr_end);
+    gene->beg    = ftr->beg;
+    gene->end    = ftr->end;
+    gene->strand = ftr->strand;
+    gene->id     = gene_id;
+
+    if ( aux->name )
+    {
+        gene->name = (char*) malloc(aux->name_end - aux->name + 2);     // name_end is inclusive: length+1 characters plus the NUL
+        memcpy(gene->name,aux->name,aux->name_end - aux->name + 1);
+        gene->name[aux->name_end - aux->name + 1] = 0;
+    }
+    else
+        gene->name = strdup(aux->gene_ids.str[gene_id]); // Name=<GeneName> field is not present, use the gene ID instead
+}
+
+// Returns 0 for exons,CDS,UTRs to indicate these need to be pruned later and regidx built on them,
+// or -1 to indicate the structure need not be saved (either because of an error or because it was
+// saved as transcript or gene.)
+static int gff_parse_line(gff_t *gff, char *line, ftr_t *ftr)
+{   // Parse one GFF line into ftr and the scratch pointers in gff->init, then dispatch to the gene/exon/transcript handler
+    // - skip empty lines and commented lines
+    // - columns
+    //      1.      chr
+    //      2.      <skip>
+    //      3.      CDS, transcript, gene, ...
+    //      4-5.    beg,end
+    //      6.      <skip>
+    //      7.      strand
+    //      8.      phase
+    //      9.      Parent=transcript:ENST(\d+);ID=...;biotype=... etc
+
+    char *ss = line;
+    if ( !*ss ) return -1;      // skip blank lines
+    if ( *ss=='#' ) return -1;  // skip comments
+
+    aux_t *aux = &gff->init;
+    gff_parse_chr(gff, line, &aux->chr, &aux->chr_end);
+    ss = gff_skip(line, aux->chr_end + 2);      // chr_end+1 is the tab, +2 starts the 2nd column, which is skipped
+
+    // 3rd column: is this a CDS, transcript, gene, etc.. The parsing order by frequency in Homo_sapiens.GRCh37.87.gff3
+    int is_gene_line = 0;
+    ftr->type = 0;
+    aux->type = ss;
+    if ( !strncmp("exon\t",ss,5) ) { ftr->type = GF_EXON; ss += 5; }
+    else if ( !strncmp("CDS\t",ss,4) ) { ftr->type = GF_CDS; ss += 4; }
+    else if ( !strncmp("three_prime_UTR\t",ss,16) ) { ftr->type = GF_UTR3; ss += 16; }
+    else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; }
+    else if ( !strncmp("biological_region\t",ss,18) ) { return -1; }    // skip
+    else if ( !strncmp("gene\t",ss,5) ) { is_gene_line = 1; ss += 5; }
+    else ss = gff_skip(line, ss);
+    aux->type_end = ss - 1;     // points at the tab that ends the 3rd column
+
+    // 4-5th columns: beg,end
+    ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
+
+    // 6th column: skip
+    ss = gff_skip(line, ss);
+
+    // 7th column: strand
+    ftr->strand = -1;
+    if ( *ss == '+' ) ftr->strand = STRAND_FWD;
+    else if ( *ss == '-' ) ftr->strand = STRAND_REV;
+    ss = gff_skip(line, ss);    // was `ss += 2`, which ran past the NUL on a truncated line; now a parse error like the other columns
+
+    // 8th column: phase (codon offset)
+    ftr->phase = -1;
+    if ( *ss == '0' ) ftr->phase = 0;
+    else if ( *ss == '1' ) ftr->phase = 1;
+    else if ( *ss == '2' ) ftr->phase = 2;
+    else if ( *ss == '.' ) ftr->phase = CDS_PHASE_UNKN;     // exons and even CDS in some GFFs do not have phase
+    ss = gff_skip(line, ss);    // was `ss += 2`, see the strand column above
+
+    // 9th column: id, parent, name, biotype
+    aux->name = NULL, aux->id = NULL, aux->parent = NULL, aux->biotype = NULL;
+    while ( *ss )
+    {
+        char *es = ss;
+        while ( *es && *es!=';' ) es++;     // es stops at the ';' (or NUL) ending this key=value pair
+        if ( !strncmp(ss,"ID=",3) )
+        {
+            ss += 3;
+            aux->id_end = es - 1;
+            aux->id = ss;
+            if ( !strncmp(ss,"gene:",5) ) { aux->id += 5; is_gene_line = 1; }
+            else if ( !strncmp(ss,"transcript:",11) ) aux->id += 11;
+        }
+        else if ( !strncmp(ss,"Name=",5) ) { aux->name = ss + 5; aux->name_end = es - 1; }
+        else if ( !strncmp(ss,"Parent=",7) )
+        {
+            ss += 7;
+            aux->parent_end = es - 1;
+            aux->parent = ss;
+            if ( !strncmp(ss,"gene:",5) ) aux->parent += 5;
+            else if ( !strncmp(ss,"transcript:",11) ) aux->parent += 11;
+        }
+        else if ( !strncmp(ss,"biotype=",8) ) { aux->biotype = ss + 8; aux->biotype_end = es - 1; }
+        else if ( !strncmp(ss,"gene_biotype=",13) ) { aux->biotype = ss + 13; aux->biotype_end = es - 1; }
+        if ( !*es ) break;
+        ss = es + 1;
+    }
+
+    if ( is_gene_line || !aux->parent )     // anything without a Parent= attribute is handled as a gene
+    {
+        gff_parse_gene(gff, line, ftr);
+        return -1;
+    }
+
+    if ( ftr->type )    // exon, CDS or UTR: the only features the caller keeps
+    {
+        gff_parse_exon(gff, line, ftr);
+        return 0;
+    }
+
+    gff_parse_transcript(gff, line, ftr);
+    return -1;
+}
+
+static int cmp_cds_ptr(const void *a, const void *b)
+{
+    // qsort comparator: orders the elements of a transcript's CDS pointer array by ascending beg
+    const gf_cds_t *lhs = *((gf_cds_t**)a);
+    const gf_cds_t *rhs = *((gf_cds_t**)b);
+    return lhs->beg < rhs->beg ? -1 : ( lhs->beg > rhs->beg ? 1 : 0 );
+}
+
+static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char **chr_end)
+{   // Set *chr_beg to the first and *chr_end to the last character of the stored name of sequence iseq
+    for ( *chr_beg = *chr_end = aux->seq[iseq]; (*chr_end)[1]!='\0'; ++*chr_end ) ;
+    // the loop body is intentionally empty: *chr_end advances until the next character is the NUL
+}
+static gf_tscript_t *tscript_init(aux_t *aux, uint32_t trid)
+{   // Fetch the transcript stored under trid in aux->id2tr; nothing is created, a missing entry trips the assert
+    khint_t it = kh_get(int2tscript, aux->id2tr, (int)trid);
+    gf_tscript_t *tr = NULL; if ( it != kh_end(aux->id2tr) ) tr = kh_val(aux->id2tr, it);
+    assert( tr );
+    return tr;
+}
+static void register_cds(gff_t *gff, ftr_t *ftr)
+{
+    // Attach one parsed GFF CDS line (ftr) to its transcript by appending a new gf_cds_t to tr->cds.
+    //  The CDS is only collected here; sorting and the idx_cds insertion are done in tscript_init_cds().
+
+    gf_tscript_t *tr = tscript_init(&gff->init, ftr->trid);
+    if ( tr->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. %d vs %d\n",ftr->trid,tr->strand,ftr->strand);
+
+    gf_cds_t *rec = (gf_cds_t*) malloc(sizeof(*rec));
+    rec->icds  = 0;     // to keep valgrind on mac happy
+    rec->phase = ftr->phase;
+    rec->len   = ftr->end - ftr->beg + 1;
+    rec->beg   = ftr->beg;
+    rec->tr    = tr;
+
+    hts_expand(gf_cds_t*,1+tr->ncds,tr->mcds,tr->cds);
+    tr->cds[tr->ncds] = rec;
+    tr->ncds++;
+}
+static void register_utr(gff_t *gff, ftr_t *ftr)
+{   // Create a UTR record for the parsed feature and push it to idx_utr under the sequence of its transcript's gene
+    aux_t *aux = &gff->init;
+    char *name_first, *name_last;
+    gf_utr_t *rec = (gf_utr_t*) malloc(sizeof(*rec));
+    rec->tr    = tscript_init(aux, ftr->trid);
+    rec->beg   = ftr->beg;
+    rec->end   = ftr->end;
+    if ( ftr->type==GF_UTR3 ) rec->which = prime3;
+    else rec->which = prime5;
+    chr_beg_end(aux, rec->tr->gene->iseq, &name_first, &name_last);
+    regidx_push(gff->idx_utr, name_first,name_last, rec->beg,rec->end, &rec);
+}
+static void register_exon(gff_t *gff, ftr_t *ftr)
+{   // Create an exon record and push it to idx_exon; the indexed interval is widened by N_SPLICE_REGION_INTRON on both sides
+    aux_t *aux = &gff->init;
+    char *name_first, *name_last;
+    gf_exon_t *rec = (gf_exon_t*) malloc(sizeof(*rec));
+    rec->tr  = tscript_init(aux, ftr->trid);
+    rec->end = ftr->end;
+    rec->beg = ftr->beg;
+
+    chr_beg_end(aux, rec->tr->gene->iseq, &name_first, &name_last);
+    regidx_push(gff->idx_exon, name_first,name_last, rec->beg - N_SPLICE_REGION_INTRON, rec->end + N_SPLICE_REGION_INTRON, &rec);
+}
+
+static void tscript_init_cds(gff_t *gff)
+{   // Finalize every transcript in init.id2tr: index it in idx_tscript, then sort, phase-trim, frame-check, trim and index its CDS
+    aux_t *aux = &gff->init;
+
+    // Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds)
+    khint_t k;
+    for (k=0; k<kh_end(aux->id2tr); k++)
+    {
+        if ( !kh_exist(aux->id2tr, k) ) continue;
+        gf_tscript_t *tr = (gf_tscript_t*) kh_val(aux->id2tr, k);
+
+        // position-to-tscript lookup
+        char *chr_beg, *chr_end;
+        chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end);
+        regidx_push(gff->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr);
+
+        if ( !tr->ncds ) continue;      // transcript with no CDS
+
+        // sort CDs
+        qsort(tr->cds, tr->ncds, sizeof(gf_cds_t*), cmp_cds_ptr);
+
+        // trim non-coding start
+        int i, len = 0;     // len accumulates the CDS length seen so far in transcription order
+        if ( tr->strand==STRAND_FWD )
+        {
+            if ( tr->cds[0]->phase != CDS_PHASE_UNKN )
+            {
+                if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME;
+                tr->cds[0]->beg += tr->cds[0]->phase;
+                tr->cds[0]->len -= tr->cds[0]->phase;
+                tr->cds[0]->phase = 0;
+            }
+
+            // sanity check phase; the phase number in gff tells us how many bases to skip in this
+            // feature to reach the first base of the next codon
+            int tscript_ok = 1;
+            for (i=0; i<tr->ncds; i++)
+            {
+                if ( tr->cds[i]->phase == CDS_PHASE_UNKN )
+                {
+                    if ( gff->verbosity > 0 )
+                    {
+                        if ( !gff->warned.unknown_cds_phase || gff->verbosity > 1 )
+                            fprintf(stderr,"Warning: CDS with unknown phase, could not verify reading frame in transcript %s\n",gff->tscript_ids.str[tr->id]);
+                        gff->warned.unknown_cds_phase++;
+                    }
+                    len += tr->cds[i]->len;
+                    continue;
+                }
+                int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;    // expected value of len%3 before this CDS
+                if ( phase!=len%3 )
+                {
+                    if ( !gff->force )
+                        error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
+                                gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+                    if ( gff->verbosity > 0 )
+                    {
+                        if ( !gff->warned.wrong_phase || gff->verbosity > 1 )
+                            fprintf(stderr,"Warning: The GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n",
+                                    gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+                        gff->warned.wrong_phase++;
+                    }
+                    tscript_ok = 0;
+                    break;
+                }
+                len += tr->cds[i]->len;
+            }
+            if ( !tscript_ok ) continue;    // skip this transcript; NOTE(review): its CDS are then never pushed to idx_cds - confirm who frees them
+        }
+        else
+        {
+            if ( tr->cds[tr->ncds-1]->phase != CDS_PHASE_UNKN )
+            {
+                // Check that the phase is not bigger than CDS length. Curiously, this can really happen,
+                // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141.
+                // This also fixes phase of 5' incomplete CDS, see test/csq/ENST00000520868/ENST00000520868.gff
+                // todo: the same for the fwd strand
+                i = tr->ncds - 1;
+                int phase = tr->cds[i]->phase;
+                if ( phase ) tr->trim |= TRIM_5PRIME;
+                while ( i>=0 && phase > tr->cds[i]->len )
+                {
+                    phase -= tr->cds[i]->len;
+                    tr->cds[i]->phase = 0;
+                    tr->cds[i]->len   = 0;
+                    i--;
+                }
+                if ( gff->verbosity > 0 && tr->cds[i]->phase )    // NOTE(review): i is -1 here if every CDS was shorter than the remaining phase - confirm this cannot happen
+                {
+                    if ( !gff->warned.incomplete_cds || gff->verbosity > 1 )
+                        fprintf(stderr,"Note: truncated transcript %s with incomplete CDS (this is very common)\n",gff->tscript_ids.str[tr->id]);
+                    gff->warned.incomplete_cds++;
+                }
+                tr->cds[i]->len  -= tr->cds[i]->phase;
+                tr->cds[i]->phase = 0;
+            }
+
+            // sanity check phase
+            int tscript_ok = 1;
+            for (i=tr->ncds-1; i>=0; i--)     // reverse strand: walk the sorted CDS from the highest coordinate down
+            {
+                if ( tr->cds[i]->phase == CDS_PHASE_UNKN )
+                {
+                    if ( gff->verbosity > 0 )
+                    {
+                        if ( !gff->warned.unknown_cds_phase || gff->verbosity > 1 )
+                            fprintf(stderr,"Warning: CDS with unknown phase, could not verify reading frame in transcript %s\n",gff->tscript_ids.str[tr->id]);
+                        gff->warned.unknown_cds_phase++;
+                    }
+                    len += tr->cds[i]->len;
+                    continue;
+                }
+                int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
+                if ( phase!=len%3 )
+                {
+                    if ( !gff->force )
+                        error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
+                                gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+                    if ( gff->verbosity > 0 )
+                    {
+                        if ( !gff->warned.wrong_phase || gff->verbosity > 1 )
+                            fprintf(stderr,"Warning: The GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n",
+                                    gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+                        gff->warned.wrong_phase++;
+                    }
+                    tscript_ok = 0;
+                    break;
+                }
+                len += tr->cds[i]->len;
+            }
+            if ( !tscript_ok ) continue;    // skip this transcript
+        }
+
+        // set len. At the same time check that CDS within a transcript do not overlap
+        len = 0;
+        for (i=0; i<tr->ncds; i++)
+        {
+            tr->cds[i]->icds = i;
+            len += tr->cds[i]->len;
+            if ( !i ) continue;
+
+            gf_cds_t *a = tr->cds[i-1];
+            gf_cds_t *b = tr->cds[i];
+            if ( a->beg + a->len - 1 >= b->beg )    // the previous CDS ends at or after the start of this one; only warned about
+            {
+                if ( gff->verbosity > 0 )
+                {
+                    if ( !gff->warned.overlapping_cds || gff->verbosity > 1 )
+                        fprintf(stderr,"Warning: GFF contains overlapping CDS %s, %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32" (ribosomal slippage?)\n",
+                                gff->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
+                    gff->warned.overlapping_cds++;
+                }
+            }
+        }
+
+        if ( len%3 != 0 )
+        {
+            // There are 13k transcripts with incomplete 3' CDS. See for example ENST00000524289
+            //  http://sep2015.archive.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000155868;r=5:157138846-157159019;t=ENST00000524289
+            // Also, the incomplete CDS can be too short (1 or 2bp), so it is not enough to trim the last one.
+
+            if ( gff->verbosity > 0 )
+            {
+                if ( !gff->warned.incomplete_cds || gff->verbosity > 1 )
+                    fprintf(stderr,"Note: truncated transcript %s with incomplete CDS (this is very common)\n",gff->tscript_ids.str[tr->id]);
+                gff->warned.incomplete_cds++;
+            }
+
+            tr->trim |= TRIM_3PRIME;
+            if ( tr->strand==STRAND_FWD )
+            {
+                i = tr->ncds - 1;
+                while ( i>=0 && len%3 )     // shorten from the last CDS backwards until the total is a multiple of 3
+                {
+                    int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
+                    tr->cds[i]->len -= dlen;
+                    len -= dlen;
+                    i--;
+                }
+            }
+            else
+            {
+                i = 0;
+                while ( i<tr->ncds && len%3 )   // reverse strand: the 3' end is the lowest coordinate, so beg moves up as len shrinks
+                {
+                    int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
+                    tr->cds[i]->len -= dlen;
+                    tr->cds[i]->beg += dlen;
+                    len -= dlen;
+                    i++;
+                }
+            }
+        }
+
+        // set CDS offsets and insert into regidx
+        len=0;
+        for (i=0; i<tr->ncds; i++)
+        {
+            tr->cds[i]->pos = len;      // cumulative length of the preceding CDS in coordinate order
+            len += tr->cds[i]->len;
+            regidx_push(gff->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]);
+        }
+    }
+}
+
+static void regidx_free_gf(void *payload) { gf_cds_t **slot = (gf_cds_t**) payload; free(*slot); }    // regidx free callback: the payload slot holds one malloc'ed pointer
+static void regidx_free_tscript(void *payload) { gf_tscript_t **slot = (gf_tscript_t**) payload; free((*slot)->cds); free(*slot); }   // frees the transcript and its CDS pointer array, not the CDS records
+
+static int gff_dump(gff_t *gff, const char *fname)
+{   // Write the parsed genes, transcripts, CDS, UTRs and exons to fname as GFF-like text via BGZF; any I/O failure goes to error(). Returns 0.
+    BGZF *out = bgzf_open(fname,"wg");
+    if ( !out ) error("Failed to open %s: %s\n", fname, strerror(errno));
+
+    kstring_t str = {0,0,0};    // one line buffer, reused for every record by resetting str.l
+
+    khint_t k;
+    for (k=0; k<kh_end(gff->init.gid2gene); k++)
+    {
+        if ( !kh_exist(gff->init.gid2gene, k) ) continue;
+        gf_gene_t *gene = (gf_gene_t*) kh_val(gff->init.gid2gene, k);
+        char *gene_id = gff->init.gene_ids.str[gene->id];
+        str.l = 0;
+        ksprintf(&str,"%s\t.\tgene\t%d\t%d\t.\t%c\t.\tID=%s;Name=%s;used=%d\n",gff->init.seq[gene->iseq],gene->beg+1,gene->end+1,gene->strand==STRAND_FWD?'+':'-',gene_id,gene->name,gene->used);
+        if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
+    }
+
+    regitr_t *itr = regitr_init(gff->idx_tscript);
+    while ( regitr_loop(itr) )
+    {
+        gf_tscript_t *tr = regitr_payload(itr, gf_tscript_t*);
+        char *gene_id =  gff->init.gene_ids.str[tr->gene->id];
+        const char *type = tr->type==GF_PROTEIN_CODING ? "mRNA" : gf_type2gff_string(tr->type);    // protein-coding transcripts are written with feature type mRNA
+        str.l = 0;
+        ksprintf(&str,"%s\t.\t%s\t%d\t%d\t.\t%c\t.\tID=%s;Parent=%s;biotype=%s;used=%d\n",itr->seq,type,itr->beg+1,itr->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id],gene_id,gf_type2gff_string(tr->type),tr->used);
+        if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
+    }
+    regitr_destroy(itr);
+
+    itr = regitr_init(gff->idx_cds);
+    while ( regitr_loop(itr) )
+    {
+        gf_cds_t *cds = regitr_payload(itr,gf_cds_t*);
+        gf_tscript_t *tr = cds->tr;
+        str.l = 0;      // NOTE(review): the literal 3 below is presumably CDS_PHASE_UNKN, printed as '.' - confirm
+        ksprintf(&str,"%s\t.\tCDS\t%d\t%d\t.\t%c\t%c\tParent=%s\n",itr->seq,cds->beg+1,cds->beg+cds->len,tr->strand==STRAND_FWD?'+':'-',cds->phase==3?'.':cds->phase+(int)'0',gff->tscript_ids.str[tr->id]);
+        if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
+    }
+    regitr_destroy(itr);
+
+    itr = regitr_init(gff->idx_utr);
+    while ( regitr_loop(itr) )
+    {
+        gf_utr_t *utr = regitr_payload(itr,gf_utr_t*);
+        gf_tscript_t *tr = utr->tr;
+        str.l = 0;
+        ksprintf(&str,"%s\t.\t%s_prime_UTR\t%d\t%d\t.\t%c\t.\tParent=%s\n",itr->seq,utr->which==prime3?"three":"five",utr->beg+1,utr->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id]);
+        if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
+    }
+    regitr_destroy(itr);
+
+    itr = regitr_init(gff->idx_exon);
+    while ( regitr_loop(itr) )
+    {
+        gf_exon_t *exon = regitr_payload(itr,gf_exon_t*);
+        gf_tscript_t *tr = exon->tr;
+        str.l = 0;      // the coordinates come from the exon record itself, not from the iterator interval
+        ksprintf(&str,"%s\t.\texon\t%d\t%d\t.\t%c\t.\tParent=%s\n",itr->seq,exon->beg+1,exon->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id]);
+        if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
+    }
+    regitr_destroy(itr);
+
+    if ( bgzf_close(out)!=0 ) error("Error: close failed .. %s\n", fname);
+    free(str.s);
+
+    return 0;
+}
+
+int gff_parse(gff_t *gff)
+{   // Read gff->fname and build the transcript, CDS, UTR and exon indexes; fatal problems go through error(). Returns 0.
+    if ( gff->verbosity > 0 ) fprintf(stderr,"Parsing %s ...\n", gff->fname);
+
+    aux_t *aux = &gff->init;
+    aux->seq2int   = khash_str2int_init();   // chrom's numeric id
+    aux->gid2gene  = kh_init(int2gene);      // gene id to gf_gene_t, for idx_gene
+    aux->id2tr     = kh_init(int2tscript);   // transcript id to tscript_t
+    gff->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(gf_tscript_t*), NULL);
+    aux->ignored_biotypes = khash_str2int_init();
+    gff_id_init(&aux->gene_ids);
+    gff_id_init(&gff->tscript_ids);
+
+    // parse gff
+    kstring_t str = {0,0,0};
+    htsFile *fp = hts_open(gff->fname,"r");
+    if ( !fp ) error("Failed to read %s\n", gff->fname);
+    while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 )   // was `> 0`: a blank line has length 0 and silently ended the parsing
+    {
+        hts_expand(ftr_t, aux->nftr+1, aux->mftr, aux->ftr);
+        int ret = gff_parse_line(gff, str.s, aux->ftr + aux->nftr);
+        if ( !ret ) aux->nftr++;    // only exon/CDS/UTR records (return value 0) keep their slot in aux->ftr
+    }
+    free(str.s);
+    if ( hts_close(fp)!=0 ) error("Close failed: %s\n", gff->fname);
+
+
+    // process gff information: connect CDS and exons to transcripts
+    gff->idx_cds  = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_cds_t*), NULL);
+    gff->idx_utr  = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_utr_t*), NULL);
+    gff->idx_exon = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_exon_t*), NULL);
+
+    int i;
+    for (i=0; i<aux->nftr; i++)
+    {
+        ftr_t *ftr = &aux->ftr[i];
+
+        // check whether to keep this feature: is there a mapping trid -> gene_id -> gene?
+        khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid);
+        if ( k==kh_end(aux->id2tr) ) continue;       // no corresponding transcript registered, must be an unsupported biotype
+
+        gf_tscript_t *tr = kh_val(aux->id2tr,k);
+        tr->used = 1;
+        tr->gene->used = 1;
+
+        // populate regidx by category:
+        //      ftr->type   .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5
+        //      gene->type  .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ...
+        if ( ftr->type==GF_CDS ) register_cds(gff, ftr);
+        else if ( ftr->type==GF_EXON ) register_exon(gff, ftr);
+        else if ( ftr->type==GF_UTR5 ) register_utr(gff, ftr);
+        else if ( ftr->type==GF_UTR3 ) register_utr(gff, ftr);
+        else
+            error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,gff->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type));
+    }
+    tscript_init_cds(gff);
+
+    if ( gff->verbosity > 0 )
+    {
+        fprintf(stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n",
+                regidx_nregs(gff->idx_tscript),
+                regidx_nregs(gff->idx_exon),
+                regidx_nregs(gff->idx_cds),
+                regidx_nregs(gff->idx_utr));
+    }
+
+    if ( gff->verbosity > 0 && khash_str2int_size(aux->ignored_biotypes) )
+    {
+        khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes;
+        fprintf(stderr,"Ignored the following biotypes:\n");
+        for (i = kh_begin(ign); i < kh_end(ign); i++)
+        {
+            if ( !kh_exist(ign,i)) continue;
+            const char *biotype = kh_key(ign,i);
+            if ( !strcmp(biotype,"TCE") ) biotype = "TCE (\"To be Experimentally Confirmed\")";
+            fprintf(stderr,"\t%dx\t.. %s\n", kh_value(ign,i), biotype);
+        }
+    }
+    khash_str2int_destroy_free(aux->ignored_biotypes);
+
+    // report how many warnings were counted but not printed
+    if ( gff->verbosity > 0 )
+    {
+        int nwarn = 0;
+        #define INC_NWARN(X) if (gff->warned.X) nwarn += gff->verbosity > 1 ? 0 : gff->warned.X - 1;
+        INC_NWARN(unknown_chr);
+        INC_NWARN(unknown_tscript_biotype);
+        INC_NWARN(unknown_strand);
+        INC_NWARN(unknown_phase);
+        INC_NWARN(duplicate_id);
+        INC_NWARN(unknown_cds_phase);
+        INC_NWARN(incomplete_cds);
+        INC_NWARN(wrong_phase);
+        INC_NWARN(overlapping_cds);
+        if ( nwarn > 0 )
+            fprintf(stderr,"Warning: %d warnings were supressed, run with `--verbose 2` to see them all\n",nwarn);
+    }
+
+    if ( gff->dump_fname ) gff_dump(gff, gff->dump_fname);
+
+    if (  !regidx_nregs(gff->idx_tscript) )
+        error("Error: No usable transcripts found, likely a failure to parse a non-standard GFF file. Please check if the misc/gff2gff\n"
+              "       or misc/gff2gff.py script can fix the problem (both do different things). See also the man page for the description\n"
+              "       of the expected format http://samtools.github.io/bcftools/bcftools-man.html#csq\n");
+
+    free(aux->seq);     // only the pointer array; the name strings are hash keys released by the destroy_free call below
+    free(aux->ftr);
+    khash_str2int_destroy_free(aux->seq2int);
+    // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene);
+    kh_destroy(int2tscript,aux->id2tr);
+    gff_id_destroy(&aux->gene_ids);
+
+    return 0;
+}
+
+gff_t *gff_init(const char *fname)
+{   // Allocate a zero-filled gff_t and remember fname (the pointer is stored, not copied); returns NULL if the allocation fails
+    gff_t *gff = calloc(1, sizeof(gff_t));
+    if ( gff ) gff->fname = fname;      // previously a failed calloc was dereferenced here
+    return gff;
+}
+void gff_destroy(gff_t *gff)
+{   // Release everything still owned by gff after parsing: gene records, the four region indexes, the transcript id table and gff itself
+    if ( !gff ) return;     // accept NULL, like free()
+    if ( gff->init.gid2gene )
+    {
+        for (khint_t k=0; k<kh_end(gff->init.gid2gene); k++)
+        {
+            if ( !kh_exist(gff->init.gid2gene, k) ) continue;
+            gf_gene_t *gene = (gf_gene_t*) kh_val(gff->init.gid2gene, k);
+            free(gene->name);
+            free(gene);
+        }
+        kh_destroy(int2gene,gff->init.gid2gene);
+    }
+    // the indexes are created by gff_parse(); guard them like gid2gene so an unparsed gff_t can be destroyed too
+    if ( gff->idx_cds ) regidx_destroy(gff->idx_cds);
+    if ( gff->idx_utr ) regidx_destroy(gff->idx_utr);
+    if ( gff->idx_exon ) regidx_destroy(gff->idx_exon);
+    if ( gff->idx_tscript ) regidx_destroy(gff->idx_tscript);
+
+    gff_id_destroy(&gff->tscript_ids);
+    free(gff);
+}
+
diff --git a/gff.h b/gff.h
new file mode 100644
index 000000000..ebb64634a
--- /dev/null
+++ b/gff.h
@@ -0,0 +1,332 @@
+/* The MIT License
+
+   Copyright (c) 2023 Genome Research Ltd.
+
+   Author: Petr Danecek <pd3@sanger.ac.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+
+ */
+/*
+    GFF parsing code refactored from csq.c
+
+    Things that would be nice to have
+        - dynamic N_REF_PAD
+        - for stop-lost events (also in frameshifts) report the number of truncated aa's
+        - memory could be greatly reduced by indexing gff (but it is quite compact already)
+        - deletions that go beyond transcript boundaries are not checked at sequence level
+            - alloc tscript->ref in hap_finalize, introduce fa_off_beg:16,fa_off_end:16
+            - see test/csq/ENST00000573314/insertion-overlap.vcf #1476288882
+
+    Read about transcript types here
+        http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html
+        http://www.ensembl.org/info/genome/variation/predicted_data.html
+        https://www.gencodegenes.org/pages/biotypes.html
+
+    List of supported biotypes
+        antisense
+        IG_C_gene
+        IG_D_gene
+        IG_J_gene
+        IG_LV_gene
+        IG_V_gene
+        lincRNA
+        lncRNA      .. generic term for 3prime_overlapping_ncRNA, antisense, bidirectional_promoter_lncRNA, lincRNA, macro_lncRNA, non_coding, processed_transcript, sense_intronic, sense_overlapping
+        macro_lncRNA
+        miRNA
+        misc_RNA
+        Mt_rRNA
+        Mt_tRNA
+        polymorphic_pseudogene
+        processed_transcript
+        protein_coding, mRNA
+        ribozyme
+        rRNA
+        sRNA
+        scRNA
+        scaRNA
+        sense_intronic
+        sense_overlapping
+        snRNA
+        snoRNA
+        TR_C_gene
+        TR_D_gene
+        TR_J_gene
+        TR_V_gene
+
+    The gff parsing logic
+        We collect features by combining gff lines A,B,C as follows:
+            A .. gene line with a supported biotype
+                    A.ID=~/^gene:/
+
+            B .. transcript line referencing A with supported biotype
+                    B.ID=~/^transcript:/ && B.Parent=~/^gene:A.ID/
+
+            C .. corresponding CDS, exon, and UTR lines:
+                    C[3] in {"CDS","exon","three_prime_UTR","five_prime_UTR"} && C.Parent=~/^transcript:B.ID/
+
+        For coding biotypes ("protein_coding" or "polymorphic_pseudogene") the
+        complete chain link C -> B -> A is required. For the rest, link B -> A suffices.
+
+
+    The supported consequence types, sorted by impact:
+        splice_acceptor_variant .. end region of an intron changed (2bp at the 3' end of an intron)
+        splice_donor_variant    .. start region of an intron changed (2bp at the 5' end of an intron)
+        stop_gained             .. DNA sequence variant resulting in a stop codon
+        frameshift_variant      .. number of inserted/deleted bases not a multiple of three, disrupted translational frame
+        stop_lost               .. elongated transcript, stop codon changed
+        start_lost              .. the first codon changed
+        inframe_altering        .. combination of indels leading to unchanged reading frame and length
+        inframe_insertion       .. inserted coding sequence, unchanged reading frame
+        inframe_deletion        .. deleted coding sequence, unchanged reading frame
+        missense_variant        .. amino acid (aa) change, unchanged length
+        splice_region_variant   .. change within 1-3 bases of the exon or 3-8 bases of the intron
+        synonymous_variant      .. DNA sequence variant resulting in no amino acid change
+        stop_retained_variant   .. different stop codon
+        start_retained_variant  .. start codon retained by indel realignment
+        non_coding_variant      .. variant in non-coding sequence, such as RNA gene
+        5_prime_UTR_variant
+        3_prime_UTR_variant
+        intron_variant          .. reported only if none of the above
+        intergenic_variant      .. reported only if none of the above
+
+
+    The annotation algorithm.
+        The algorithm checks if the variant falls in a region of a supported type. The
+        search is performed in the following order, until a match is found:
+            1. idx_cds(gf_cds_t) - lookup CDS by position, create haplotypes, call consequences
+            2. idx_utr(gf_utr_t) - check UTR hits
+            3. idx_exon(gf_exon_t) - check for splice variants
+            4. idx_tscript(gf_tscript_t) - check for intronic variants, RNAs, etc.
+
+        These regidx indexes are created by parsing a gff3 file as follows:
+            1.  create the array "ftr" of all UTR, CDS, exons. This will be
+            processed later and pruned based on transcript types we want to keep.
+            In the same go, create the hash "id2tr" of transcripts to keep
+            (based on biotype) which maps from transcript_id to a transcript. At
+            the same time also build the hash "gid2gene" which maps from gene_id to
+            gf_gene_t pointer.
+
+            2.  build "idx_cds", "idx_tscript", "idx_utr" and "idx_exon" indexes.
+            Use only features from "ftr" which are present in "id2tr".
+
+            3.  clean data that won't be needed anymore: ftr, id2tr (gid2gene is kept until gff_destroy, which frees the genes).
+
+    Data structures.
+        idx_cds, idx_utr, idx_exon, idx_tscript:
+            as described above, regidx structures for fast lookup of exons/transcripts
+            overlapping a region, the payload is a pointer to tscript.cds
+*/
+
+#ifndef GFF_H__
+#define GFF_H__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <getopt.h>
+#include <math.h>
+#include <inttypes.h>
+#include <htslib/hts.h>
+#include <htslib/khash.h>
+#include <htslib/khash_str2int.h>
+#include <htslib/kseq.h>
+#include <htslib/faidx.h>
+#include <htslib/bgzf.h>
+#include <errno.h>
+#include <unistd.h>
+#include <ctype.h>
+#include "bcftools.h"
+#include "regidx.h"
+
+#ifndef __FUNCTION__
+#  define __FUNCTION__ __func__
+#endif
+
+// Definition of splice_region, splice_acceptor and splice_donor
+#define N_SPLICE_DONOR         2
+#define N_SPLICE_REGION_EXON   3
+#define N_SPLICE_REGION_INTRON 8
+
+#define STRAND_REV 0
+#define STRAND_FWD 1
+
+#define TRIM_NONE   0
+#define TRIM_5PRIME 1
+#define TRIM_3PRIME 2
+
+
+// GFF line types
+#define GFF_UNKN_LINE    0
+#define GFF_TSCRIPT_LINE 1
+#define GFF_GENE_LINE    2
+
+
+/*
+    Genomic features, for fast lookup by position to overlapping features
+*/
+#define GF_coding_bit 6                                         // bit position (value 64) set in all coding GF_* types
+#define GF_is_coding(x) ((x) & (1<<GF_coding_bit))              // non-zero when the coding bit is set in x
+#define GF_MT_rRNA                       1                      // non-coding: 1, 2, ...
+#define GF_MT_tRNA                       2
+#define GF_lincRNA                       3
+#define GF_miRNA                         4
+#define GF_MISC_RNA                      5
+#define GF_rRNA                          6
+#define GF_snRNA                         7
+#define GF_snoRNA                        8
+#define GF_PROCESSED_TRANSCRIPT          9
+#define GF_ANTISENSE                    10
+#define GF_macro_lncRNA                 11
+#define GF_ribozyme                     12
+#define GF_sRNA                         13
+#define GF_scRNA                        14
+#define GF_scaRNA                       15
+#define GF_SENSE_INTRONIC               16
+#define GF_SENSE_OVERLAPPING            17
+#define GF_PSEUDOGENE                   18
+#define GF_PROCESSED_PSEUDOGENE         19
+#define GF_ARTIFACT                     20
+#define GF_IG_PSEUDOGENE                21
+#define GF_IG_C_PSEUDOGENE              22
+#define GF_IG_J_PSEUDOGENE              23
+#define GF_IG_V_PSEUDOGENE              24
+#define GF_TR_V_PSEUDOGENE              25
+#define GF_TR_J_PSEUDOGENE              26
+#define GF_MT_tRNA_PSEUDOGENE           27
+#define GF_misc_RNA_PSEUDOGENE          28
+#define GF_miRNA_PSEUDOGENE             29
+#define GF_RIBOZYME                     30
+#define GF_RETAINED_INTRON              31
+#define GF_RETROTRANSPOSED              32
+#define GF_tRNA_PSEUDOGENE              33
+#define GF_TRANSCRIBED_PROCESSED_PSEUDOGENE     34
+#define GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE   35
+#define GF_TRANSCRIBED_UNITARY_PSEUDOGENE       36
+#define GF_TRANSLATED_UNPROCESSED_PSEUDOGENE    37
+#define GF_TRANSLATED_PROCESSED_PSEUDOGENE      38
+#define GF_KNOWN_NCRNA                          39
+#define GF_UNITARY_PSEUDOGENE                   40
+#define GF_UNPROCESSED_PSEUDOGENE               41
+#define GF_LRG_GENE                             42
+#define GF_3PRIME_OVERLAPPING_ncRNA             43
+#define GF_DISRUPTED_DOMAIN                     44
+#define GF_vaultRNA                             45
+#define GF_BIDIRECTIONAL_PROMOTER_lncRNA        46
+#define GF_AMBIGUOUS_ORF                        47
+#define GF_lncRNA                               48
+#define GF_PROTEIN_CODING               (1|(1<<GF_coding_bit))  // coding: 65, 66, ...
+#define GF_POLYMORPHIC_PSEUDOGENE       (2|(1<<GF_coding_bit))
+#define GF_IG_C                         (3|(1<<GF_coding_bit))
+#define GF_IG_D                         (4|(1<<GF_coding_bit))
+#define GF_IG_J                         (5|(1<<GF_coding_bit))
+#define GF_IG_LV                        (6|(1<<GF_coding_bit))
+#define GF_IG_V                         (7|(1<<GF_coding_bit))
+#define GF_TR_C                         (8|(1<<GF_coding_bit))
+#define GF_TR_D                         (9|(1<<GF_coding_bit))
+#define GF_TR_J                        (10|(1<<GF_coding_bit))
+#define GF_TR_V                        (11|(1<<GF_coding_bit))
+#define GF_NMD                         (12|(1<<GF_coding_bit))
+#define GF_NON_STOP_DECAY              (13|(1<<GF_coding_bit))
+#define GF_CDS      ((1<<(GF_coding_bit+1))+1)                  // special types: 129, 130, ...
+#define GF_EXON     ((1<<(GF_coding_bit+1))+2)
+#define GF_UTR3     ((1<<(GF_coding_bit+1))+3)
+#define GF_UTR5     ((1<<(GF_coding_bit+1))+4)
+// GF_MAX = (1<<30)-1, see hap_node_t
+
+#define CDS_PHASE_UNKN 3
+typedef struct gf_tscript_t_ gf_tscript_t;
+typedef struct
+{
+    gf_tscript_t *tr;   // transcript (presumably the one whose cds[] list holds this record - TODO confirm)
+    uint32_t beg;       // the start coordinate of the CDS (on the reference strand, 0-based)
+    uint32_t pos;       // 0-based index of the first exon base within the transcript (only to
+                        //  update hap_node_t.sbeg in hap_init, could be calculated on the fly)
+    uint32_t len;       // exon length
+    uint32_t icds:30,   // exon index within the transcript
+             phase:2;   // offset of the CDS: 0,1,2 or 3 (CDS_PHASE_UNKN) for unknown
+}
+gf_cds_t;
+typedef struct
+{
+    char *name;                     // human readable name, e.g. ORF45; freed together with the gene in gff_destroy
+    uint32_t iseq;                  // presumably the index of the gene's sequence (chromosome) - TODO confirm
+    uint32_t id,beg,end,strand:31,  // used only by --dump-gff
+             used:1;                // does it have any exons, CDS, UTR?
+}
+gf_gene_t;
+typedef struct
+{
+    uint32_t beg,end;       // exon coordinates; presumably 0-based inclusive like gf_tscript_t - TODO confirm
+    gf_tscript_t *tr;       // presumably the transcript this exon belongs to - TODO confirm
+}
+gf_exon_t;
+typedef enum { prime3, prime5 } utr_t;     // which end of the transcript a UTR is on
+typedef struct
+{
+    utr_t which;            // prime3 or prime5
+    uint32_t beg,end;       // UTR coordinates; presumably 0-based inclusive like gf_tscript_t - TODO confirm
+    gf_tscript_t *tr;       // presumably the transcript this UTR belongs to - TODO confirm
+}
+gf_utr_t;
+struct gf_tscript_t_
+{
+    uint32_t id;        // transcript id
+    uint32_t beg,end;   // transcript's beg and end coordinate (ref strand, 0-based, inclusive)
+    uint32_t strand:1,  // STRAND_REV or STRAND_FWD
+             used:1,    // does it have any exons, UTRs, CDS?
+             ncds:30,   // number of exons
+             mcds;      // full-width uint32_t, not a bit-field; presumably the allocated size of cds[] - TODO confirm
+    gf_cds_t **cds;     // ordered list of exons
+    uint32_t trim:2,    // complete, 5' or 3' trimmed, see TRIM_* types
+             type:30;   // one of GF_* types
+    gf_gene_t *gene;    // presumably the gene this transcript belongs to - TODO confirm
+    void *aux;          // auxiliary user data
+};
+
+typedef enum
+{
+    // write options (presumably the keys accepted by gff_set - TODO confirm)
+    verbosity,          // int, 0-2
+    strip_chr_names,    // int, 0 to leave as is, 1 to strip 'chr' prefix
+    force_out_of_phase, // int, 1 to proceed even if a CDS exon is out of the expected phase
+    dump_fname,         // const char*, dump the parsed GFF into this file, for debugging purposes
+
+    // read options (presumably the keys accepted by gff_get - TODO confirm)
+    idx_cds,            // presumably the regidx index of CDS features - TODO confirm
+    idx_utr,            // presumably the regidx index of UTR features - TODO confirm
+    idx_exon,           // presumably the regidx index of exons - TODO confirm
+    idx_tscript,        // presumably the regidx index of transcripts - TODO confirm
+}
+gff_opt_t;
+
+typedef enum { transcript } id_type_t;  // for gff_id2string
+
+typedef struct gff_t_ gff_t;            // opaque handle, the struct is not defined in this header
+
+gff_t *gff_init(const char *fname);     // allocate a parser for fname; the string pointer is stored, not copied
+int gff_parse(gff_t *gff);              // returns 0; calls error() when no usable transcripts are found
+void gff_destroy(gff_t *gff);           // frees the gene records, the four regidx indexes and the handle itself
+
+int gff_set(gff_t *gff, gff_opt_t key, ...);   // returns 0 on success
+void *gff_get(gff_t *gff, gff_opt_t key);      // presumably the returned type depends on key, see gff_opt_t - TODO confirm
+const char *gff_id2string(gff_t *gff, id_type_t type, int id);
+const char *gf_type2gff_string(int type);      // presumably maps a GF_* type to its GFF biotype string - TODO confirm
+
+#endif
diff --git a/hex.h b/hex.h
index d915b2862..95210e3cf 100644
--- a/hex.h
+++ b/hex.h
@@ -3,10 +3,10 @@
 // hex.h
 //
 // @category   Libraries
-// @author     Nicola Asuni <nicola.asuni@genomicsplc.com>
+// @author     Nicola Asuni <info@tecnick.com>
+// @link       https://github.com/tecnickcom/variantkey
+// @license    MIT [LICENSE](https://raw.githubusercontent.com/tecnickcom/variantkey/main/LICENSE)
 // @copyright  2017-2018 GENOMICS plc
-// @license    MIT (see LICENSE)
-// @link       https://github.com/genomicsplc/variantkey
 //
 // LICENSE
 //
diff --git a/misc/gff2gff b/misc/gff2gff
index 27485fad8..bfcda412d 100755
--- a/misc/gff2gff
+++ b/misc/gff2gff
@@ -39,11 +39,12 @@ sub error
     my (@msg) = @_;
     if ( scalar @msg ) { confess @msg; }
     print
-        "About: Attempt to fix a GFF file to be correctly parse by bcftools/csq, see\n",
+        "About: Attempt to fix a GFF file to be correctly parsed by bcftools/csq, see\n",
         "       the man page for the description of the expected format\n",
         "           http://samtools.github.io/bcftools/bcftools-man.html#csq\n",
         "Usage: gff2gff [OPTIONS]\n",
         "Options:\n",
+        "   -v, --verbose        Increase verbosity\n",
         "   -h, -?, --help       This help message\n",
         "Example:\n",
         "   zcat in.gff.gz | gff2gff | gzip -c > out.gff.gz\n",
@@ -52,10 +53,11 @@ sub error
 }
 sub parse_params
 {
-    my $opts = {};
+    my $opts = { verbose=>0, warned=>{}, fixed=>{} };
     if ( -t STDIN && !@ARGV ) { error(); }
     while (defined(my $arg=shift(@ARGV)))
     {
+        if ( $arg eq '-v' or $arg eq '--verbose' ) { $$opts{verbose}++; next; }
         if ( $arg eq '-?' or $arg eq '-h' or $arg eq '--help' ) { error(); }
         error("Unknown parameter \"$arg\". Run -h for help.\n");
     }
@@ -72,8 +74,29 @@ sub gff2gff
         chomp($row[-1]);
         if ( $row[2] eq 'gene' ) { fix_gene($opts,$line,\@row); }
         elsif ( $row[2] eq 'transcript' ) { fix_transcript($opts,$line,\@row); }
+        elsif ( $row[8]=~/biotype=/ && !($row[8]=~/Parent=/) ) { fix_gene($opts,$line,\@row); }
+        elsif ( $row[8]=~/Parent=/ ) { fix_transcript($opts,$line,\@row); }
         print join("\t",@row)."\n";
     }
+    if ( !$$opts{verbose} )
+    {
+        my $nwarn = 0;
+        for my $key (keys %{$$opts{warned}})
+        {
+            if ( $$opts{warned}{$key} > 1 ) { $nwarn += $$opts{warned}{$key} - 1; }
+        }
+        if ( $nwarn ) { print STDERR "Suppressed $nwarn warnings, run with -v to see them all\n"; }
+    }
+    my $nfixed = 0;
+    for my $key (keys %{$$opts{fixed}})
+    {
+        $nfixed += $$opts{fixed}{$key};
+    }
+    print STDERR "Fixed $nfixed records\n";
+    for my $key (sort {$$opts{fixed}{$b}<=>$$opts{fixed}{$a}} keys %{$$opts{fixed}})
+    {
+        print STDERR "\t$$opts{fixed}{$key}x .. $key\n";
+    }
 }
 sub fix_gene
 {
@@ -88,35 +111,28 @@ sub fix_gene
     if ( $$row[8] =~ /biotype=([^;]+)/ ) { $biotype = $1; $biotype_ok = 1; }
     if ( !$biotype_ok && $$row[8] =~ /gene_type=([^;]+)/i ) { $biotype = $1; }
     if ( $$row[8] =~ /Name=([^;]+)/ ) { $name = $1; $name_ok = 1; }
-    if ( !$biotype_ok && $$row[8] =~ /gene_name=([^;]+)/i ) { $name = $1; }
+    if ( !$name_ok && $$row[8] =~ /gene_name=([^;]+)/i ) { $name = $1; }
 
     if ( !$id_ok )
     {
-        if ( defined $id ) { $$row[8] .= ";ID=$id"; }
-        elsif ( !$$opts{gene_id_warned} )
+        if ( defined $id ) { $$row[8] .= ";ID=$id"; $$opts{fixed}{gene_id}++; }
+        else
         {
-            print STDERR "Unable to determine gene ID, see e.g. $line\n";
-            $$opts{gene_id_warned} = 1;
+            if ( $$opts{verbose}>0 or !$$opts{warned}{gene_id} ) { print STDERR "Unable to determine gene ID: $line"; }
+            $$opts{warned}{gene_id}++;
         }
     }
-    if ( !$biotype_ok )
+    if ( !$biotype_ok && defined $biotype )
     {
-        if ( defined $biotype ) { $$row[8] .= ";biotype=$biotype"; }
-        elsif ( !$$opts{gene_biotype_warned} )
-        {
-            print STDERR "Unable to determine gene biotype/type, see e.g. $line\n";
-            $$opts{gene_biotype_warned} = 1;
-        }
+        $$row[8] .= ";biotype=$biotype";
+        $$opts{fixed}{gene_biotype}++;
     }
-    if ( !$name_ok )
+    if ( !$name_ok && defined $name )
     {
-        if ( defined $name ) { $$row[8] .= ";Name=$name"; }
-        elsif ( !$$opts{gene_name_warned} )
-        {
-            print STDERR "Unable to determine gene name, see e.g. $line\n";
-            $$opts{gene_name_warned} = 1;
-        }
+        $$row[8] .= ";Name=$name";
+        $$opts{fixed}{gene_name}++;
     }
+    if ( defined $biotype ) { $$opts{gene_id2biotype}{$id} = $biotype; }
 }
 sub fix_transcript
 {
@@ -134,29 +150,30 @@ sub fix_transcript
 
     if ( !$id_ok )
     {
-        if ( defined $id ) { $$row[8] .= ";ID=$id"; }
-        elsif ( !$$opts{tscript_id_warned} )
+        if ( defined $id ) { $$row[8] .= ";ID=$id"; $$opts{fixed}{transcript_id}++; }
+        else
         {
-            print STDERR "Unable to determine transcript ID, see e.g. $line\n";
-            $$opts{tscript_id_warned} = 1;
+            if ( $$opts{verbose}>0 or !$$opts{warned}{tscript_id} ) { print STDERR "Unable to determine transcript ID: $line"; }
+            $$opts{warned}{tscript_id}++;
         }
     }
     if ( !$biotype_ok )
     {
-        if ( defined $biotype ) { $$row[8] .= ";biotype=$biotype"; }
-        elsif ( !$$opts{tscript_biotype_warned} )
+        if ( defined $biotype ) { $$row[8] .= ";biotype=$biotype"; $$opts{fixed}{transcript_biotype}++; }
+        elsif ( defined $parent && exists($$opts{gene_id2biotype}{$parent}) ) { $$row[8] .= ";biotype=$$opts{gene_id2biotype}{$parent}"; $$opts{fixed}{transcript_biotype}++; }
+        else
         {
-            print STDERR "Unable to determine transcript biotype/type, see e.g. $line\n";
-            $$opts{tscript_biotype_warned} = 1;
+            if ( $$opts{verbose}>0 or !$$opts{warned}{tscript_biotype} ) { print STDERR "Unable to determine transcript biotype/type: $line"; }
+            $$opts{warned}{tscript_biotype}++;
         }
     }
     if ( !$parent_ok )
     {
-        if ( defined $parent ) { $$row[8] .= ";Parent=$parent"; }   # currently cannot happen
-        elsif ( !$$opts{tscript_parent_warned} )
+        if ( defined $parent ) { $$row[8] .= ";Parent=$parent"; $$opts{fixed}{transcript_parent}++; }   # currently cannot happen
+        else
         {
-            print STDERR "Unable to determine transcript Parent, see e.g. $line\n";
-            $$opts{tscript_parent_warned} = 1;
+            if ( $$opts{verbose}>0 or !$$opts{warned}{tscript_parent} ) { print STDERR "Unable to determine transcript Parent: $line"; }
+            $$opts{warned}{tscript_parent}++;
         }
     }
 }
diff --git a/misc/plot-vcfstats b/misc/plot-vcfstats
index 58e8a3bd2..990a56ffd 100755
--- a/misc/plot-vcfstats
+++ b/misc/plot-vcfstats
@@ -64,9 +64,13 @@ if ( $$opts{make_plots} )
         plot_indel_distribution($opts,$id);
         plot_indel_vaf_distribution($opts,$id);
         plot_substitutions($opts,$id);
+        plot_vaf($opts,$id,'snv');
+        plot_vaf($opts,$id,'indel');
         plot_per_sample_stats($opts,$id);
         plot_DP($opts,$id);
         plot_hwe($opts,$id);
+        plot_vaf25_by_sample($opts,$id,'snv');
+        plot_vaf25_by_sample($opts,$id,'indel');
     }
     plot($opts);
 }
@@ -245,6 +249,11 @@ sub parse_params
                 header=>'HWE',
                 exp=>"# HWE\t[2]id\t[3]1st ALT allele frequency\t[4]Number of observations\t[5]25th percentile\t[6]median\t[7]75th percentile",
             },
+            {
+                id=>'VAF',
+                header=>'VAF',
+                exp=>"# VAF\t[2]id\t[3]sample\t[4]SNV VAF distribution\t[5]indel VAF distribution",
+            },
         ],
         SN_keys=>[
             'number of samples:',
@@ -750,7 +759,7 @@ sub init_plots
         plot_hethom_by_sample = 1
         plot_snps_by_sample = 1
         plot_indels_by_sample = 1
-        plot_singletons_by_sample = 1 
+        plot_singletons_by_sample = 1
         plot_depth_by_sample = 1
         plot_SNP_count_by_af = 1
         plot_Indel_count_by_af = 1
@@ -767,6 +776,10 @@ sub init_plots
         plot_tstv_by_qual = 1
         plot_tstv_by_usr = 1
         plot_substitutions = 1
+        plot_vaf_snv = 1
+        plot_vaf_indel = 1
+        plot_vaf25_snv = 1
+        plot_vaf25_indel = 1
 
 
         # Set to 1 to use sample names for xticks instead of numeric sequential IDs
@@ -1166,7 +1179,7 @@ sub plot_tstv_by_AF
     my $img  = "tstv_by_af.$id";
     my $vals = rebin_values(\@vals,8,0);
 
-    
+
     open(my $tfh,'>',"$img.dat") or error("$img.dat: $!");
     print $tfh "# [1]Allele frequency\t[2]Number of sites\t[3]ts/tv\n";
     for (my $i=0; $i<@$vals; $i++)
@@ -1554,7 +1567,7 @@ sub plot_counts_by_AF_col
                     id = int(row[0])
                     if id not in dat: dat[id] = []
                     dat[id].append([float(row[1]),float(row[2])])
-            
+
             if plot_${title}_count_by_af:
                 fig = plt.figure(figsize=(2*$$opts{img_width},$$opts{img_height}*0.7))
                 ax1 = fig.add_subplot(111)
@@ -1792,15 +1805,33 @@ sub plot_substitutions
 
             if plot_substitutions:
                 fig = plt.figure(figsize=($$opts{img_width},$$opts{img_height}))
-                cm  = mpl.cm.get_cmap('autumn')
+                ts = [ 'A>G','G>A','C>T','T>C' ]
+                nts = 0
+                ntv = 0
+                for x in dat:
+                    if x[1] in ts: nts += 1
+                    else: ntv += 1
                 n = 12
-                col = []
-                for i in list(range(n)): col.append(cm(1.*i/n))
+                col  = list(range(n))
+                ecol = list(range(n))
+                for i in range(n):
+                    col[i]  = '#ffce84'
+                    ecol[i] = '#f5c781'
+                    col[1]  = col[5]  = col[6]  = col[10]  = '#ff9900'
+                    ecol[1] = ecol[5] = ecol[6] = ecol[10] = '#ef8f00'
                 ax1 = fig.add_subplot(111)
-                ax1.bar([row[0] for row in dat], [row[2] for row in dat], color=col)
+                ax1.bar([row[0] for row in dat], [row[2] for row in dat], color=col, edgecolor=ecol)
                 ax1.set_ylabel('Count')
                 ax1.ticklabel_format(style='sci', scilimits=(0,0), axis='y')
-                ax1.set_xlim(-0.5,n+0.5)
+                ax1.spines['top'].set_visible(False)
+                ax1.spines['right'].set_visible(False)
+                ax1.get_xaxis().tick_bottom()
+                ax1.get_yaxis().tick_left()
+                ax1.spines['bottom'].set_color('grey')
+                ax1.spines['left'].set_color('grey')
+                mpl.rcParams['text.color'] = '555555'
+                ax1.patch.set_visible(False)
+                ax1.set_xlim(-0.5,n-0.5)
                 plt.xticks([row[0] for row in dat],[row[1] for row in dat],rotation=45)
                 plt.title('$$opts{title}{$id}')
                 plt.savefig('$img.png')
@@ -1810,6 +1841,126 @@ sub plot_substitutions
         ";
 }
 
+sub plot_vaf
+{
+    my ($opts,$id,$type) = @_;
+
+    # Rows of the VAF section: field 1 is the SNV histogram, field 2 the indel histogram
+    my @rows = get_values($opts,$id,'VAF');
+    return unless @rows;
+    my $col = $type eq 'snv' ? 1 : 2;
+    # Sum the comma-separated per-bin counts over all rows
+    my @total = ();
+    for my $rec (@rows)
+    {
+        my @bins = split(/,/,$$rec[$col]);
+        $total[$_] += $bins[$_] for (0..$#bins);
+    }
+    my $nbin = scalar @total;
+
+    my $fh  = $$opts{plt_fh};
+    my $img = "vaf.$type.$id";
+
+    tprint $fh, "
+            dat = [
+        ";
+    for my $i (0..$nbin-1) { tprint $fh, "\t[" . 1.0*$i/$nbin . ",$total[$i]],\n"; }
+    tprint $fh, "]\n";
+    tprint $fh, "
+
+            if plot_vaf_$type:
+                fig = plt.figure(figsize=($$opts{img_width},$$opts{img_height}))
+                ax1 = fig.add_subplot(111)
+                wd = 0.7        # fraction of dx distance
+                min_dx = None
+                for i in range(len(dat)-1):
+                    if min_dx==None or min_dx > abs(dat[i+1][0]-dat[i][0]): min_dx = abs(dat[i+1][0]-dat[i][0])
+                if min_dx==None: min_dx = 1
+                wd = min_dx*wd
+                ax1.bar([x[0] for x in dat],[x[1] for x in dat],wd) #,**plt_args)
+
+                ax1.set_ylabel('Count')
+                ax1.set_xlabel('Variant Allele Frequency')
+                ax1.ticklabel_format(style='sci', scilimits=(-2,2), axis='y')
+
+                ax1.spines['top'].set_visible(False)
+                ax1.spines['right'].set_visible(False)
+                ax1.get_xaxis().tick_bottom()
+                ax1.get_yaxis().tick_left()
+                ax1.spines['bottom'].set_color('grey')
+                ax1.spines['left'].set_color('grey')
+                mpl.rcParams['text.color'] = '555555'
+                ax1.patch.set_visible(False)
+
+                plt.subplots_adjust(right=0.95,bottom=0.15)
+                plt.title('$$opts{title}{$id}')
+                plt.savefig('$img.png')
+                if img_fmt != 'png': plt.savefig('$img.' + img_fmt)
+                plt.close()
+        ";
+}
+
+sub plot_vaf25_by_sample
+{
+    my ($opts,$id,$type) = @_;
+    # Emit the Python code plotting, per sample, the fraction of $type calls with VAF below 0.25
+    my @vals = get_values($opts,$id,'VAF');
+    if ( !@vals ) { return; }
+    # Row fields: 0 = sample name, 1 = SNV histogram, 2 = indel histogram (comma-separated bin counts)
+    my @vaf = ();
+    for my $row (@vals)
+    {
+        my @cnt = split(/,/,$type eq 'snv' ? $$row[1] : $$row[2]);
+        my $sum = 0;
+        my $sum25 = 0;
+        for (my $i=0; $i<@cnt; $i++)
+        {
+            if ( 1.0*$i/(scalar @cnt) < 0.25 ) { $sum25 += $cnt[$i] }
+            $sum += $cnt[$i];
+        }
+        push @vaf, [ $$row[0], $sum ? $sum25/$sum : 0 ];    # keep the sample name, it is needed for the xtick labels
+    }
+
+    my $fh  = $$opts{plt_fh};
+    my $img = "vaf25.$type.$id";
+
+    tprint $fh, "
+            dat = [
+        ";
+    for (my $i=0; $i<@vaf; $i++) { tprint $fh, "\t[$i,$vaf[$i][1],'$vaf[$i][0]'],\n"; }   # NOTE(review): the sample name is not escaped
+    tprint $fh, "]\n";
+    tprint $fh, "
+
+            if plot_vaf25_$type:
+                fig = plt.figure(figsize=(2*$$opts{img_width},$$opts{img_height}*0.7))
+                ax1 = fig.add_subplot(111)
+                ax1.plot([row[0] for row in dat], [row[1] for row in dat], 'o', color='$$opts{id2col}[$id]',mec='$$opts{id2col}[$id]')
+                ax1.set_ylabel('nVAF<0.25')
+                ax1.set_ylim(-0.1,1.1)
+                if sample_names:
+                    plt.xticks([int(row[0]) for row in dat],[row[2] for row in dat],**sample_font)
+                    plt.subplots_adjust(**sample_margins)
+                else:
+                    plt.subplots_adjust(right=0.98,left=0.07,bottom=0.17)
+                    ax1.set_xlabel('Sample ID')
+
+                ax1.spines['top'].set_visible(False)
+                ax1.spines['right'].set_visible(False)
+                ax1.get_xaxis().tick_bottom()
+                ax1.get_yaxis().tick_left()
+                ax1.spines['bottom'].set_color('grey')
+                ax1.spines['left'].set_color('grey')
+                mpl.rcParams['text.color'] = '555555'
+                ax1.patch.set_visible(False)
+
+                plt.title('$$opts{title}{$id}')
+                plt.savefig('$img.png')
+                if img_fmt != 'png': plt.savefig('$img.' + img_fmt)
+                plt.close()
+        ";
+}
+
+
 sub singletons
 {
     my ($opts,$id) = @_;
@@ -2118,6 +2269,8 @@ sub create_pdf
     }
 
     tprint $tex, fmt_slide3v($opts, "tstv_by_sample", 'Ts/Tv by sample');
+    tprint $tex, fmt_slide3v($opts, "vaf25.snv", 'Fraction of SNVs with VAF$<$25\% by sample');
+    tprint $tex, fmt_slide3v($opts, "vaf25.indel", 'Fraction of indels with VAF$<$25\% by sample');
     tprint $tex, fmt_slide3v($opts, "hets_by_sample", 'Hets vs non-ref Homs by sample');
     tprint $tex, fmt_slide3v($opts, "singletons_by_sample", 'Singletons by sample {\normalsize(hets and homs)}');
     tprint $tex, fmt_slide3v($opts, "dp_by_sample", 'Average depth by sample');
@@ -2193,6 +2346,8 @@ sub create_pdf
     tprint $tex, fmt_slide3h($opts, "depth", 'Depth distribution');
     tprint $tex, fmt_slide3h($opts, "hwe", 'Number of HETs by AF');
     tprint $tex, fmt_slide3h($opts, "substitutions", 'Substitution types');
+    tprint $tex, fmt_slide3h($opts, "vaf.snv", 'SNV Variant Allele Frequency');
+    tprint $tex, fmt_slide3h($opts, "vaf.indel", 'Indel Variant Allele Frequency');
     #tprint $tex, fmt_slide3h($opts, "irc_by_af", 'Indel Repeat Consistency by AF');
     #tprint $tex, fmt_slide3h($opts, "irc_by_rlen", 'Indel Consistency by Repeat Type');
 
@@ -2203,7 +2358,7 @@ sub create_pdf
     my $cmd = "$engine $tex_file >$$opts{logfile} 2>&1";
     print STDERR "Creating PDF: $cmd\n" unless !$$opts{verbose};
     system($cmd);
-    if ( $? ) { error("The command exited with non-zero status, please consult the output of $engine: $$opts{dir}$$opts{logfile}\n\n"); }
+    if ( $? ) { error("The command exited with non-zero status, please consult the output of $engine: $$opts{dir}/$$opts{logfile}\n\n"); }
     print STDERR "Finished: $$opts{dir}/$pdf_file\n" unless !$$opts{verbose};
 }
 
diff --git a/mpileup.c b/mpileup.c
index 9b21b1873..d42a6a360 100644
--- a/mpileup.c
+++ b/mpileup.c
@@ -1,6 +1,6 @@
 /*  mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools
 
-    Copyright (C) 2008-2022 Genome Research Ltd.
+    Copyright (C) 2008-2023 Genome Research Ltd.
     Portions copyright (C) 2009-2012 Broad Institute.
 
     Author: Heng Li <lh3@sanger.ac.uk>
@@ -101,6 +101,8 @@ typedef struct {
     int indels_v20;
     int argc;
     char **argv;
+    int write_index;
+    char *index_fn;
 } mplp_conf_t;
 
 typedef struct {
@@ -489,37 +491,43 @@ static void mplp_realn(int n, int *n_plp, const bam_pileup1_t **plp,
             if ((flag & MPLP_REALN_PARTIAL) && nt > 15 && ncig > 1) {
                 // Left & right cigar op match.
                 int lr = b->core.l_qseq > 500;
-                int lm = 0, rm = 0, k;
+                int lm = 0, rm = 0, k, nm = 0;
                 for (k = 0; k < ncig; k++) {
                     int cop = bam_cigar_op(cig[k]);
                     if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP))
                         continue;
 
                     if (cop == BAM_CMATCH || cop == BAM_CDIFF ||
-                        cop == BAM_CEQUAL)
+                        cop == BAM_CEQUAL) {
                         lm += bam_cigar_oplen(cig[k]);
-                    else
+                        nm++;
+                    } else {
                         break;
+                    }
                 }
 
-                for (k = ncig-1; k >= 0; k--) {
-                    int cop = bam_cigar_op(cig[k]);
-                    if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP))
+                // if everything is a match (or sequence (mis)match) then move on
+                // because we don't have an indel in the middle
+                if (nm != ncig) {
+                    for (k = ncig-1; k >= 0; k--) {
+                        int cop = bam_cigar_op(cig[k]);
+                        if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP))
+                            continue;
+
+                        if (cop == BAM_CMATCH || cop == BAM_CDIFF ||
+                            cop == BAM_CEQUAL)
+                            rm += bam_cigar_oplen(cig[k]);
+                        else
+                            break;
+                    }
+
+                    if (lm >= REALN_DIST*4 && rm >= REALN_DIST*4)
                         continue;
 
-                    if (cop == BAM_CMATCH || cop == BAM_CDIFF ||
-                        cop == BAM_CEQUAL)
-                        rm += bam_cigar_oplen(cig[k]);
-                    else
-                        break;
+                    if (lm >= REALN_DIST && rm >= REALN_DIST &&
+                        has_clip < (0.15+0.05*(nt>20))*nt)
+                        continue;
                 }
-
-                if (lm >= REALN_DIST*4 && rm >= REALN_DIST*4)
-                    continue;
-
-                if (lm >= REALN_DIST && rm >= REALN_DIST &&
-                    has_clip < (0.15+0.05*(nt>20))*nt)
-                    continue;
             }
 
             if (b->core.l_qseq > 500) {
@@ -849,6 +857,7 @@ static int mpileup(mplp_conf_t *conf)
     for (i=0; i<nsmpl; i++)
         bcf_hdr_add_sample(conf->bcf_hdr, smpl[i]);
     if ( bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr)!=0 ) error("[%s] Error: failed to write the header to %s\n",__func__,conf->output_fname?conf->output_fname:"standard output");
+    if ( conf->write_index && init_index(conf->bcf_fp,conf->bcf_hdr,conf->output_fname,&conf->index_fn)<0 ) error("Error: failed to initialise index for %s\n",conf->output_fname?conf->output_fname:"standard output"); // output_fname may be NULL (standard output)
 
     conf->bca = bcf_call_init(-1., conf->min_baseQ, conf->max_baseQ,
                               conf->delta_baseQ);
@@ -958,6 +967,15 @@ static int mpileup(mplp_conf_t *conf)
     bcf_destroy1(conf->bcf_rec);
     if (conf->bcf_fp)
     {
+        if ( conf->write_index )
+        {
+            if ( bcf_idx_save(conf->bcf_fp)<0 )
+            {
+                if ( hts_close(conf->bcf_fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,conf->output_fname);
+                error("Error: cannot write to index %s\n",conf->index_fn);
+            }
+            free(conf->index_fn);
+        }
         if ( hts_close(conf->bcf_fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,conf->output_fname);
         bcf_hdr_destroy(conf->bcf_hdr);
         bcf_call_destroy(conf->bca);
@@ -1227,6 +1245,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
         "  -O, --output-type TYPE  'b' compressed BCF; 'u' uncompressed BCF;\n"
         "                          'z' compressed VCF; 'v' uncompressed VCF; 0-9 compression level [v]\n"
         "      --threads INT       Use multithreading with INT worker threads [0]\n"
+        "      --write-index       Automatically index the output files [off]\n"
         "\n"
         "SNP/INDEL genotype likelihoods options:\n"
         "  -X, --config STR        Specify platform specific profiles (see below)\n"
@@ -1375,6 +1394,7 @@ int main_mpileup(int argc, char *argv[])
         {"seed", required_argument, NULL, 13},
         {"ambig-reads", required_argument, NULL, 14},
         {"ar", required_argument, NULL, 14},
+        {"write-index",no_argument,NULL,21},
         {NULL, 0, NULL, 0}
     };
     while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) {
@@ -1497,6 +1517,7 @@ int main_mpileup(int argc, char *argv[])
             }
             break;
         case  20: mplp.indels_v20 = 1; break;
+        case  21: mplp.write_index = 1; break;
         case 'A': use_orphan = 1; break;
         case 'F': mplp.min_frac = atof(optarg); break;
         case 'm': mplp.min_support = atoi(optarg); break;
diff --git a/plugins/add-variantkey.c b/plugins/add-variantkey.c
index 1b1bce8b2..af9efd7a8 100644
--- a/plugins/add-variantkey.c
+++ b/plugins/add-variantkey.c
@@ -2,7 +2,7 @@
 
     Copyright (C) 2017-2018 GENOMICS plc.
 
-    Author: Nicola Asuni <nicola.asuni@genomicsplc.com>
+    Author: Nicola Asuni <nicola.asuni@tecnick.com>
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/plugins/contrast.c b/plugins/contrast.c
index 71d9d3d45..624bfeead 100644
--- a/plugins/contrast.c
+++ b/plugins/contrast.c
@@ -1,19 +1,19 @@
 /* The MIT License
 
-   Copyright (c) 2018-2021 Genome Research Ltd.
+   Copyright (c) 2018-2023 Genome Research Ltd.
 
    Author: Petr Danecek <pd3@sanger.ac.uk>
-   
+
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -69,6 +69,8 @@ typedef struct
     int ncontrol_gts, mcontrol_gts, ntotal, nskipped, ntested, ncase_al, ncase_gt;
     kstring_t case_als_smpl, case_gts_smpl;
     int max_AC, nals[4];    // nals: number of control-ref, control-alt, case-ref and case-alt alleles in the region
+    char *index_fn;
+    int write_index;
 }
 args_t;
 
@@ -81,7 +83,7 @@ const char *about(void)
 
 static const char *usage_text(void)
 {
-    return 
+    return
         "\n"
         "About: Runs a basic association test, per-site or in a region, and checks for novel alleles and\n"
         "       genotypes in two groups of samples. Adds the following INFO annotations:\n"
@@ -108,6 +110,7 @@ static const char *usage_text(void)
         "   -t, --targets REG                Similar to -r but streams rather than index-jumps\n"
         "   -T, --targets-file FILE          Similar to -R but streams rather than index-jumps\n"
         "       --targets-overlap 0|1|2      Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"
+        "       --write-index                Automatically index the output files [off]\n"
         "\n"
         "Example:\n"
         "   # Test if any of the samples a,b is different from the samples c,d,e\n"
@@ -233,6 +236,7 @@ static void init_data(args_t *args)
     args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
     if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
     if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+    if ( args->write_index && init_index(args->out_fh,args->hdr_out,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname?args->output_fname:"stdout"); // output_fname may be NULL (stdout)
 
     if ( args->max_AC_str )
     {
@@ -251,6 +255,15 @@ static void init_data(args_t *args)
 static void destroy_data(args_t *args)
 {
     bcf_hdr_destroy(args->hdr_out);
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(args->out_fh)<0 )
+        {
+            if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
     if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
     free(args->case_als_smpl.s);
     free(args->case_gts_smpl.s);
@@ -314,7 +327,7 @@ static int process_record(args_t *args, bcf1_t *rec)
         for (j=0; j<ngts; j++)
         {
             if ( ptr[j]==bcf_int32_vector_end ) break;
-            if ( bcf_gt_is_missing(ptr[j]) ) continue; 
+            if ( bcf_gt_is_missing(ptr[j]) ) continue;
             int ial = bcf_gt_allele(ptr[j]);
             if ( ial > 31 )
             {
@@ -353,7 +366,7 @@ static int process_record(args_t *args, bcf1_t *rec)
         for (j=0; j<ngts; j++)
         {
             if ( ptr[j]==bcf_int32_vector_end ) break;
-            if ( bcf_gt_is_missing(ptr[j]) ) continue; 
+            if ( bcf_gt_is_missing(ptr[j]) ) continue;
             int ial = bcf_gt_allele(ptr[j]);
             if ( ial > 31 )
             {
@@ -365,7 +378,7 @@ static int process_record(args_t *args, bcf1_t *rec)
                 args->nskipped++;
                 return -1;
             }
-            if ( !(control_als & (1<<ial)) ) case_al = 1; 
+            if ( !(control_als & (1<<ial)) ) case_al = 1;
             gt |= 1<<ial;
             if ( ial ) nals[3]++;
             else nals[2]++;
@@ -430,12 +443,12 @@ static int process_record(args_t *args, bcf1_t *rec)
     if ( args->annots & PRINT_NASSOC )
         bcf_update_info_int32(args->hdr_out, rec, "NASSOC", nals, 4);
 
-    if ( args->case_als_smpl.l ) 
+    if ( args->case_als_smpl.l )
     {
         bcf_update_info_string(args->hdr_out, rec, "NOVELAL", args->case_als_smpl.s);
         args->ncase_al++;
     }
-    if ( args->case_gts_smpl.l ) 
+    if ( args->case_gts_smpl.l )
     {
         bcf_update_info_string(args->hdr_out, rec, "NOVELGT", args->case_gts_smpl.s);
         args->ncase_gt++;
@@ -472,13 +485,14 @@ int run(int argc, char **argv)
         {"targets",1,0,'t'},
         {"targets-file",1,0,'T'},
         {"targets-overlap",required_argument,NULL,4},
+        {"write-index",no_argument,NULL,5},
         {NULL,0,NULL,0}
     };
     int c;
     char *tmp;
     while ((c = getopt_long(argc, argv, "O:o:i:e:r:R:t:T:0:1:a:f:",loptions,NULL)) >= 0)
     {
-        switch (c) 
+        switch (c)
         {
             case  1 : args->force_samples = 1; break;
             case 'f': args->max_AC_str = optarg; break;
@@ -522,6 +536,7 @@ int run(int argc, char **argv)
                 args->targets_overlap = parse_overlap_option(optarg);
                 if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg);
                 break;
+            case  5 : args->write_index = 1; break;
             case 'h':
             case '?':
             default: error("%s", usage_text()); break;
diff --git a/plugins/fill-tags.c b/plugins/fill-tags.c
index 740313f5f..b9f9b6908 100644
--- a/plugins/fill-tags.c
+++ b/plugins/fill-tags.c
@@ -473,6 +473,10 @@ uint32_t parse_tags(args_t *args, const char *str)
         if ( !strcasecmp(tags[i],"all") )
         {
             flag |= ~(SET_END|SET_TYPE);
+            // include F_MISSING as part of 'all', which requires explicitly
+            // initialising it as a filter expression not just setting a
+            // bitfield flag.
+            flag |= parse_func(args,"F_MISSING=F_MISSING","F_MISSING");
             args->warned = ~(SET_END|SET_TYPE);
             args->unpack |= BCF_UN_FMT;
         }
diff --git a/plugins/gvcfz.c b/plugins/gvcfz.c
index d9ddb6643..abb25d997 100644
--- a/plugins/gvcfz.c
+++ b/plugins/gvcfz.c
@@ -1,5 +1,5 @@
-/* 
-    Copyright (C) 2017-2021 Genome Research Ltd.
+/*
+    Copyright (C) 2017-2023 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -9,10 +9,10 @@
     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     copies of the Software, and to permit persons to whom the Software is
     furnished to do so, subject to the following conditions:
-    
+
     The above copyright notice and this permission notice shall be included in
     all copies or substantial portions of the Software.
-    
+
     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -78,6 +78,8 @@ typedef struct
     char **argv, *region, *target, *fname, *output_fname, *keep_tags;
     bcf_hdr_t *hdr_in, *hdr_out;
     bcf_srs_t *sr;
+    char *index_fn;
+    int write_index;
 }
 args_t;
 
@@ -88,18 +90,19 @@ const char *about(void)
 
 static const char *usage_text(void)
 {
-    return 
+    return
         "\n"
         "About: Compress gVCF file by resizing gVCF blocks according to specified criteria.\n"
         "\n"
         "Usage: bcftools +gvcfz [Options]\n"
         "Plugin options:\n"
-        "   -a, --trim-alt-alleles          trim alternate alleles not seen in the genotypes\n"
-        "   -e, --exclude <expr>            exclude sites for which the expression is true\n"
-        "   -i, --include <expr>            include sites for which the expression is true\n"
-        "   -g, --group-by EXPR             group gVCF blocks according to the expression\n"
-        "   -o, --output FILE               write gVCF output to the FILE\n"
+        "   -a, --trim-alt-alleles          Trim alternate alleles not seen in the genotypes\n"
+        "   -e, --exclude <expr>            Exclude sites for which the expression is true\n"
+        "   -i, --include <expr>            Include sites for which the expression is true\n"
+        "   -g, --group-by EXPR             Group gVCF blocks according to the expression\n"
+        "   -o, --output FILE               Write gVCF output to the FILE\n"
         "   -O, --output-type u|b|v|z[0-9]  u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"
+        "       --write-index               Automatically index the output files [off]\n"
         "Examples:\n"
         "   # Compress blocks by GQ and DP. Multiple blocks separated by a semicolon can be defined\n"
         "   bcftools +gvcfz input.bcf -g'PASS:GQ>60 & DP<20; PASS:GQ>40 & DP<15; Flt1:QG>20; Flt2:-'\n"
@@ -136,7 +139,7 @@ static void init_groups(args_t *args)
         beg = ++end;
         while ( *end && *end!=';' ) end++;
         char tmp = *end; *end = 0;
-        if ( strcmp(flt,"PASS") ) 
+        if ( strcmp(flt,"PASS") )
         {
             bcf_hdr_printf(args->hdr_out, "##FILTER=<ID=%s,Description=\"%s\">", flt, hdr_str);
             if (bcf_hdr_sync(args->hdr_out) < 0)
@@ -174,6 +177,15 @@ static void destroy_data(args_t *args)
     free(args->grp);
 
     if ( args->filter ) filter_destroy(args->filter);
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(args->fh_out)<0 )
+        {
+            if ( hts_close(args->fh_out)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
     if ( hts_close(args->fh_out)!=0 ) error("failed to close %s\n", args->output_fname);
 
     bcf_sr_destroy(args->sr);
@@ -203,7 +215,7 @@ static void flush_block(args_t *args, bcf1_t *rec)
         if ( bcf_update_format_int32(args->hdr_out,gvcf->rec,"PL",&gvcf->pl,3) != 0 )
             error("Could not update FORMAT/PL at %s:%"PRId64"\n", bcf_seqname(args->hdr_out,gvcf->rec),(int64_t) gvcf->rec->pos+1);
     }
-    if ( gvcf->grp < args->ngrp && args->grp[gvcf->grp].flt_id >= 0 ) 
+    if ( gvcf->grp < args->ngrp && args->grp[gvcf->grp].flt_id >= 0 )
         bcf_add_filter(args->hdr_out, gvcf->rec, args->grp[gvcf->grp].flt_id);
 
     if ( bcf_write(args->fh_out, args->hdr_out, gvcf->rec)!=0 ) error("Failed to write the header\n");
@@ -323,13 +335,14 @@ int run(int argc, char **argv)
         {"stats",required_argument,NULL,'s'},
         {"output",required_argument,NULL,'o'},
         {"output-type",required_argument,NULL,'O'},
+        {"write-index",no_argument,NULL,1},
         {NULL,0,NULL,0}
     };
     int c;
     char *tmp;
     while ((c = getopt_long(argc, argv, "vr:R:t:T:o:O:g:i:e:a",loptions,NULL)) >= 0)
     {
-        switch (c) 
+        switch (c)
         {
             case 'a': args->trim_alts = 1; break;
             case 'e':
@@ -358,6 +371,7 @@ int run(int argc, char **argv)
                           if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
                       }
                       break;
+            case  1 : args->write_index = 1; break;
             case 'h':
             case '?':
             default: error("%s", usage_text()); break;
@@ -385,6 +399,7 @@ int run(int argc, char **argv)
     set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
     args->fh_out = hts_open(args->output_fname ? args->output_fname : "-", wmode);
     if ( bcf_hdr_write(args->fh_out, args->hdr_out)!=0 ) error("Failed to write the header\n");
+    if ( args->write_index && init_index(args->fh_out,args->hdr_out,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname?args->output_fname:"stdout"); // output_fname may be NULL (stdout)
     while ( bcf_sr_next_line(args->sr) ) process_gvcf(args);
     flush_block(args, NULL);
 
diff --git a/plugins/isecGT.c b/plugins/isecGT.c
index c31af38ec..d83e8fdf8 100644
--- a/plugins/isecGT.c
+++ b/plugins/isecGT.c
@@ -1,5 +1,5 @@
 /*
-    Copyright (C) 2016-2021 Genome Research Ltd.
+    Copyright (C) 2016-2023 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -45,6 +45,8 @@ typedef struct
     bcf_srs_t *sr;
     bcf_hdr_t *hdr_a, *hdr_b;
     htsFile *out_fh;
+    char *index_fn;
+    int write_index;
 }
 args_t;
 
@@ -67,6 +69,7 @@ static const char *usage_text(void)
         "   -R, --regions-file FILE         Restrict to regions listed in a file\n"
         "   -t, --targets REGION            Similar to -r but streams rather than index-jumps\n"
         "   -T, --targets-file FILE         Similar to -R but streams rather than index-jumps\n"
+        "       --write-index               Automatically index the output files [off]\n"
         "\n";
 }
 
@@ -84,6 +87,7 @@ int run(int argc, char **argv)
         {"targets-file",required_argument,NULL,'T'},
         {"output",required_argument,NULL,'o'},
         {"output-type",required_argument,NULL,'O'},
+        {"write-index",no_argument,NULL,1},
         {NULL,0,NULL,0}
     };
     int c;
@@ -115,6 +119,7 @@ int run(int argc, char **argv)
             case 'R': args->regions_list = optarg; args->regions_is_file = 1; break;
             case 't': args->targets_list = optarg; break;
             case 'T': args->targets_list = optarg; args->targets_is_file = 1; break;
+            case  1 : args->write_index = 1; break;
             case 'h':
             case '?':
             default: error("%s", usage_text()); break;
@@ -146,6 +151,7 @@ int run(int argc, char **argv)
     args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
     if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
     if ( bcf_hdr_write(args->out_fh, args->hdr_a)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+    if ( args->write_index && init_index(args->out_fh,args->hdr_a,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname?args->output_fname:"stdout"); // output_fname may be NULL (stdout)
 
     while ( bcf_sr_next_line(args->sr) )
     {
@@ -179,7 +185,15 @@ int run(int argc, char **argv)
         if ( dirty ) bcf_update_genotypes(args->hdr_a, line_a, args->arr_a, ngt_a*smpl->n);
         if ( bcf_write(args->out_fh, args->hdr_a, line_a)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
     }
-
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(args->out_fh)<0 )
+        {
+            if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
     if ( hts_close(args->out_fh)!=0 ) error("Close failed: %s\n",args->output_fname);
     smpl_ilist_destroy(smpl);
     bcf_sr_destroy(args->sr);
diff --git a/plugins/mendelian.c b/plugins/mendelian.c
deleted file mode 100644
index 65a65fe1c..000000000
--- a/plugins/mendelian.c
+++ /dev/null
@@ -1,689 +0,0 @@
-/* The MIT License
-
-   Copyright (c) 2015-2022 Genome Research Ltd.
-
-   Author: Petr Danecek <pd3@sanger.ac.uk>
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in
-   all copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-   THE SOFTWARE.
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <strings.h>
-#include <getopt.h>
-#include <math.h>
-#include <inttypes.h>
-#include <htslib/hts.h>
-#include <htslib/vcf.h>
-#include <htslib/kseq.h>
-#include <htslib/synced_bcf_reader.h>
-#include <errno.h>
-#include <ctype.h>
-#include <unistd.h>     // for isatty
-#include "../bcftools.h"
-#include "../regidx.h"
-
-#define MODE_COUNT     1
-#define MODE_LIST_GOOD 2
-#define MODE_LIST_BAD  4
-#define MODE_DELETE    8
-#define MODE_ANNOTATE  16
-#define MODE_LIST_SKIP 32
-
-typedef struct
-{
-    int nok, nbad;
-    int imother,ifather,ichild;
-}
-trio_t;
-
-typedef struct
-{
-    int mpl, fpl, cpl;  // ploidies - mother, father, child
-    int mal, fal;       // expect an allele from mother and father
-}
-rule_t;
-
-typedef struct _args_t
-{
-    regidx_t *rules;
-    regitr_t *itr, *itr_ori;
-    bcf_hdr_t *hdr;
-    htsFile *out_fh;
-    int32_t *gt_arr;
-    int mode;
-    int ngt_arr, nrec;
-    trio_t *trios;
-    int ntrios, mtrios;
-    int output_type, clevel;
-    char *output_fname;
-    bcf_srs_t *sr;
-}
-args_t;
-
-static args_t args;
-static int parse_rules(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr);
-static bcf1_t *process(bcf1_t *rec);
-
-const char *about(void)
-{
-    return "Count Mendelian consistent / inconsistent genotypes [DEPRECATED, use mendelian2 instead]\n";
-}
-
-typedef struct
-{
-    const char *alias, *about, *rules;
-}
-rules_predef_t;
-
-static rules_predef_t rules_predefs[] =
-{
-    { .alias = "GRCh37",
-      .about = "Human Genome reference assembly GRCh37 / hg19, both chr naming conventions",
-      .rules =
-            "   X:1-60000               M/M + F > M\n"
-            "   X:1-60000               M/M + F > M/F\n"
-            "   X:2699521-154931043     M/M + F > M\n"
-            "   X:2699521-154931043     M/M + F > M/F\n"
-            "   Y:1-59373566            .   + F > F\n"
-            "   MT:1-16569              M   + F > M\n"
-            "\n"
-            "   chrX:1-60000            M/M + F > M\n"
-            "   chrX:1-60000            M/M + F > M/F\n"
-            "   chrX:2699521-154931043  M/M + F > M\n"
-            "   chrX:2699521-154931043  M/M + F > M/F\n"
-            "   chrY:1-59373566         .   + F > F\n"
-            "   chrM:1-16569            M   + F > M\n"
-    },
-    { .alias = "GRCh38",
-      .about = "Human Genome reference assembly GRCh38 / hg38, both chr naming conventions",
-      .rules =
-            "   X:1-9999                M/M + F > M\n"
-            "   X:1-9999                M/M + F > M/F\n"
-            "   X:2781480-155701381     M/M + F > M\n"
-            "   X:2781480-155701381     M/M + F > M/F\n"
-            "   Y:1-57227415            .   + F > F\n"
-            "   MT:1-16569              M   + F > M\n"
-            "\n"
-            "   chrX:1-9999             M/M + F > M\n"
-            "   chrX:1-9999             M/M + F > M/F\n"
-            "   chrX:2781480-155701381  M/M + F > M\n"
-            "   chrX:2781480-155701381  M/M + F > M/F\n"
-            "   chrY:1-57227415         .   + F > F\n"
-            "   chrM:1-16569            M   + F > M\n"
-    },
-    {
-        .alias = NULL,
-        .about = NULL,
-        .rules = NULL,
-    }
-};
-
-
-const char *usage(void)
-{
-    return
-        "\n"
-        "About: Count Mendelian consistent / inconsistent genotypes. Note that this plugin is DEPRECATED and\n"
-        "       will not be supported in the future. Please use the newer plugin +mendelian2 instead.\n"
-        "Usage: bcftools +mendelian [Options]\n"
-        "Options:\n"
-        "   -c, --count                     Count the number of consistent sites [DEPRECATED, use `-m c` instead]\n"
-        "   -d, --delete                    Delete inconsistent genotypes (set to \"./.\") [DEPRECATED, use `-m d` instead]\n"
-        "   -l, --list [+x]                 List consistent (+) or inconsistent (x) sites [DEPRECATED, use `-m +` or `-m x` instead]\n"
-        "   -m, --mode [+acdux]             Output mode (the default is `-m c`):\n"
-        "                                       + .. list consistent sites\n"
-        "                                       a .. add INFO/MERR annotation with the number of inconsistent samples\n"
-        "                                       c .. print counts, a text summary with the number of errors per trio\n"
-        "                                       d .. delete inconsistent genotypes (set to \"./.\")\n"
-        "                                       u .. list uninformative sites\n"
-        "                                       x .. list inconsistent sites\n"
-        "   -o, --output FILE               Write output to a file [standard output]\n"
-        "   -O, --output-type u|b|v|z[0-9]  u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"
-        "   -r, --rules ASSEMBLY[?]         Predefined rules, 'list' to print available settings, append '?' for details\n"
-        "   -R, --rules-file FILE           Inheritance rules, see example below\n"
-        "   -t, --trio M,F,C                Names of mother, father and the child\n"
-        "   -T, --trio-file FILE            List of trios, one per line (mother,father,child)\n"
-        "   -p, --ped FILE                  PED file\n"
-        "\n"
-        "Example:\n"
-        "   # Default inheritance patterns, override with -r\n"
-        "   #   region  maternal_ploidy + paternal > offspring\n"
-        "   X:1-60000            M/M + F > M\n"
-        "   X:1-60000            M/M + F > M/F\n"
-        "   X:2699521-154931043  M/M + F > M\n"
-        "   X:2699521-154931043  M/M + F > M/F\n"
-        "   Y:1-59373566         .   + F > F\n"
-        "   MT:1-16569           M   + F > M\n"
-        "\n"
-        "   bcftools +mendelian in.vcf -t Mother,Father,Child -c\n"
-        "\n";
-}
-
-regidx_t *init_rules(args_t *args, char *alias)
-{
-    const rules_predef_t *rules = rules_predefs;
-    if ( !alias ) alias = "GRCh37";
-
-    int detailed = 0, len = strlen(alias);
-    if ( alias[len-1]=='?' ) { detailed = 1; alias[len-1] = 0; }
-
-    while ( rules->alias && strcasecmp(alias,rules->alias) ) rules++;
-
-    if ( !rules->alias )
-    {
-        fprintf(stderr,"\nPRE-DEFINED INHERITANCE RULES\n\n");
-        fprintf(stderr," * Columns are: CHROM:BEG-END MATERNAL_PLOIDY + PATERNAL_PLOIDY > OFFSPRING\n");
-        fprintf(stderr," * Coordinates are 1-based inclusive.\n\n");
-        rules = rules_predefs;
-        while ( rules->alias )
-        {
-            fprintf(stderr,"%s\n   .. %s\n\n", rules->alias,rules->about);
-            if ( detailed )
-                fprintf(stderr,"%s\n", rules->rules);
-            rules++;
-        }
-        fprintf(stderr,"Run as --rules <alias> (e.g. --rules GRCh37).\n");
-        fprintf(stderr,"To see the detailed ploidy definition, append a question mark (e.g. --rules GRCh37?).\n");
-        fprintf(stderr,"\n");
-        exit(-1);
-    }
-    else if ( detailed )
-    {
-        fprintf(stderr,"%s", rules->rules);
-        exit(-1);
-    }
-    return regidx_init_string(rules->rules, parse_rules, NULL, sizeof(rule_t), &args);
-}
-
-static int parse_rules(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
-{
-    // e.g. "Y:1-59373566        .   + F > . # daugther"
-
-    // eat any leading spaces
-    char *ss = (char*) line;
-    while ( *ss && isspace(*ss) ) ss++;
-    if ( !*ss ) return -1;      // skip empty lines
-
-    // chromosome name, beg, end
-    char *tmp, *se = ss;
-    while ( se[1] && !isspace(se[1]) ) se++;
-    while ( se > ss && isdigit(*se) ) se--;
-    if ( *se!='-' ) error("Could not parse the region: %s\n", line);
-    *end = strtol(se+1, &tmp, 10) - 1;
-    if ( tmp==se+1 ) error("Could not parse the region:%s\n",line);
-    while ( se > ss && *se!=':' ) se--;
-    *beg = strtol(se+1, &tmp, 10) - 1;
-    if ( tmp==se+1 ) error("Could not parse the region:%s\n",line);
-
-    *chr_beg = ss;
-    *chr_end = se-1;
-
-    // skip region
-    while ( *ss && !isspace(*ss) ) ss++;
-    while ( *ss && isspace(*ss) ) ss++;
-
-    rule_t *rule = (rule_t*) payload;
-    memset(rule, 0, sizeof(rule_t));
-
-    // maternal ploidy
-    se = ss;
-    while ( *se && !isspace(*se) ) se++;
-    int err = 0;
-    if ( se - ss == 1 )
-    {
-        if ( *ss=='M' ) rule->mpl = 1;
-        else if ( *ss=='.' ) rule->mpl = 0;
-        else err = 1;
-    }
-    else if ( se - ss == 3 )
-    {
-        if ( !strncmp(ss,"M/M",3) ) rule->mpl = 2;
-        else err = 1;
-    }
-    else err = 1;
-    if ( err ) error("Could not parse the maternal ploidy, only \"M\", \"M/M\" and \".\" currently supported: %s\n",line);
-
-    // skip "+"
-    while ( *se && isspace(*se) ) se++;
-    if ( *se != '+' ) error("Could not parse the line: %s\n",line);
-    se++;
-    while ( *se && isspace(*se) ) se++;
-
-    // paternal ploidy
-    ss = se;
-    while ( *se && !isspace(*se) ) se++;
-    if ( se - ss == 1 )
-    {
-        if ( *ss=='F' ) rule->fpl = 1;
-        else err = 1;
-    }
-    else err = 1;
-    if ( err ) error("Could not parse the paternal ploidy, only \"F\" is currently supported: %s [%s]\n",line, ss);
-
-    // skip ">"
-    while ( *se && isspace(*se) ) se++;
-    if ( *se != '>' ) error("Could not parse the line: %s\n",line);
-    se++;
-    while ( *se && isspace(*se) ) se++;
-
-    // ploidy in offspring
-    ss = se;
-    while ( *se && !isspace(*se) ) se++;
-    if ( se - ss == 3 )
-    {
-        if ( !strncmp(ss,"M/F",3) ) { rule->cpl = 2; rule->fal = 1; rule->mal = 1; }
-        else err = 1;
-    }
-    else if ( se - ss == 1 )
-    {
-        if ( *ss=='F' ) { rule->cpl = 1; rule->fal = 1; }
-        else if ( *ss=='M' ) { rule->cpl = 1; rule->mal = 1; }
-        else err = 1;
-    }
-    else err = 1;
-    if ( err ) error("Could not parse the offspring's ploidy, only \"M\", \"F\" or \"M/F\" is currently supported: %s\n",line);
-
-    return 0;
-}
-
-void parse_ped(args_t *args, char *fname)
-{
-    htsFile *fp = hts_open(fname, "r");
-    if ( !fp ) error("Could not read: %s\n", fname);
-
-    kstring_t str = {0,0,0};
-    if ( hts_getline(fp, KS_SEP_LINE, &str) <= 0 ) error("Empty file: %s\n", fname);
-
-    int moff = 0, *off = NULL;
-    do
-    {
-        int ncols = ksplit_core(str.s,0,&moff,&off);
-        if ( ncols<4 ) error("Could not parse the ped file: %s\n", str.s);
-
-        int ifather = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[2]]);
-        int imother = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[3]]);
-        int ichild = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[1]]);
-
-        // The code in process() makes an attempt to work with partial families,
-        // the support is not complete though and can lead to core dumps. Therefore
-        // enforcing full trios for now.
-        // if ( ( ifather<0 && imother<0 ) || ichild<0 ) continue;
-        if ( ifather<0 || imother<0 || ichild<0 ) continue;
-
-        args->ntrios++;
-        hts_expand0(trio_t,args->ntrios,args->mtrios,args->trios);
-        trio_t *trios = &args->trios[args->ntrios-1];
-        trios->ifather = ifather;
-        trios->imother = imother;
-        trios->ichild  = ichild;
-
-    } while ( hts_getline(fp, KS_SEP_LINE, &str)>=0 );
-    if ( !args->ntrios ) error("No complete trios found in the PED and VCF\n");
-
-    free(str.s);
-    free(off);
-    hts_close(fp);
-}
-
-int run(int argc, char **argv)
-{
-    char *trio_samples = NULL, *trio_file = NULL, *ped_fname = NULL, *rules_fname = NULL, *rules_string = NULL;
-    memset(&args,0,sizeof(args_t));
-    args.mode = 0;
-    args.output_fname = "-";
-    args.clevel = -1;
-
-    static struct option loptions[] =
-    {
-        {"trio",1,0,'t'},
-        {"trio-file",1,0,'T'},
-        {"ped",1,0,'p'},
-        {"delete",0,0,'d'},
-        {"list",1,0,'l'},
-        {"mode",1,0,'m'},
-        {"count",0,0,'c'},
-        {"rules",1,0,'r'},
-        {"rules-file",1,0,'R'},
-        {"output",required_argument,NULL,'o'},
-        {"output-type",required_argument,NULL,'O'},
-        {0,0,0,0}
-    };
-    int c;
-    char *tmp;
-    while ((c = getopt_long(argc, argv, "?ht:T:p:l:m:cdr:R:o:O:",loptions,NULL)) >= 0)
-    {
-        switch (c)
-        {
-            case 'o': args.output_fname = optarg; break;
-            case 'O':
-                      switch (optarg[0]) {
-                          case 'b': args.output_type = FT_BCF_GZ; break;
-                          case 'u': args.output_type = FT_BCF; break;
-                          case 'z': args.output_type = FT_VCF_GZ; break;
-                          case 'v': args.output_type = FT_VCF; break;
-                          default:
-                          {
-                              args.clevel = strtol(optarg,&tmp,10);
-                              if ( *tmp || args.clevel<0 || args.clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
-                          }
-                      };
-                      if ( optarg[1] )
-                      {
-                          args.clevel = strtol(optarg+1,&tmp,10);
-                          if ( *tmp || args.clevel<0 || args.clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
-                      }
-                      break;
-            case 'R': rules_fname = optarg; break;
-            case 'r': rules_string = optarg; break;
-            case 'd':
-                args.mode |= MODE_DELETE;
-                fprintf(stderr,"Warning: -d will be deprecated, please use `-m d` instead.\n");
-                break;
-            case 'c':
-                args.mode |= MODE_COUNT;
-                fprintf(stderr,"Warning: -c will be deprecated, please use `-m c` instead.\n");
-                break;
-            case 'l':
-                if ( !strcmp("+",optarg) ) args.mode |= MODE_LIST_GOOD;
-                else if ( !strcmp("x",optarg) ) args.mode |= MODE_LIST_BAD;
-                else error("The argument not recognised: --list %s\n", optarg);
-                fprintf(stderr,"Warning: -l will be deprecated, please use -m instead.\n");
-                break;
-            case 'm':
-                if ( !strcmp("+",optarg) ) args.mode |= MODE_LIST_GOOD;
-                else if ( !strcmp("x",optarg) ) args.mode |= MODE_LIST_BAD;
-                else if ( !strcmp("a",optarg) ) args.mode |= MODE_ANNOTATE;
-                else if ( !strcmp("d",optarg) ) args.mode |= MODE_DELETE;
-                else if ( !strcmp("c",optarg) ) args.mode |= MODE_COUNT;
-                else if ( !strcmp("u",optarg) ) args.mode |= MODE_LIST_SKIP;
-                else error("The argument not recognised: --mode %s\n", optarg);
-                break;
-            case 't': trio_samples = optarg; break;
-            case 'T': trio_file = optarg; break;
-            case 'p': ped_fname = optarg; break;
-            case 'h':
-            case '?':
-            default: error("%s",usage()); break;
-        }
-    }
-    if ( rules_fname )
-        args.rules = regidx_init(rules_fname, parse_rules, NULL, sizeof(rule_t), &args);
-    else
-        args.rules = init_rules(&args, rules_string);
-    if ( !args.rules ) return -1;
-    args.itr     = regitr_init(args.rules);
-    args.itr_ori = regitr_init(args.rules);
-
-    char *fname = NULL;
-    if ( optind>=argc || argv[optind][0]=='-' )
-    {
-        if ( !isatty(fileno((FILE *)stdin)) ) fname = "-";  // reading from stdin
-        else error("%s",usage());
-    }
-    else
-        fname = argv[optind];
-
-    if ( !trio_samples && !trio_file && !ped_fname ) error("Expected the -t/T or -p option\n");
-    if ( !args.mode ) args.mode = MODE_COUNT;
-    if ( args.mode&MODE_DELETE && !(args.mode&(MODE_LIST_GOOD|MODE_LIST_BAD|MODE_LIST_SKIP)) ) args.mode |= MODE_LIST_GOOD|MODE_LIST_BAD|MODE_LIST_SKIP;
-    if ( args.mode&MODE_ANNOTATE && !(args.mode&(MODE_LIST_GOOD|MODE_LIST_BAD|MODE_LIST_SKIP)) ) args.mode |= MODE_LIST_GOOD|MODE_LIST_BAD|MODE_LIST_SKIP;
-
-    FILE *log_fh = stderr;
-    if ( args.mode==MODE_COUNT )
-    {
-        log_fh = strcmp("-",args.output_fname) ? fopen(args.output_fname,"w") : stdout;
-        if ( !log_fh ) error("Error: cannot write to %s\n", args.output_fname);
-    }
-
-    args.sr = bcf_sr_init();
-    if ( !bcf_sr_add_reader(args.sr, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args.sr->errnum));
-    args.hdr = bcf_sr_get_header(args.sr, 0);
-    if ( args.mode!=MODE_COUNT )
-    {
-        char wmode[8];
-        set_wmode(wmode,args.output_type,args.output_fname,args.clevel);
-        args.out_fh = hts_open(args.output_fname ? args.output_fname : "-", wmode);
-        if ( args.out_fh == NULL ) error("Can't write to \"%s\": %s\n", args.output_fname, strerror(errno));
-        if ( args.mode&MODE_ANNOTATE )
-            bcf_hdr_append(args.hdr, "##INFO=<ID=MERR,Number=1,Type=Integer,Description=\"Number of trios with Mendelian errors\">");
-        if ( bcf_hdr_write(args.out_fh, args.hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args.output_fname);
-    }
-
-    int i, n = 0;
-    char **list;
-    if ( trio_samples )
-    {
-        args.ntrios = 1;
-        args.trios = (trio_t*) calloc(1,sizeof(trio_t));
-        list = hts_readlist(trio_samples, 0, &n);
-        if ( n!=3 ) error("Expected three sample names with -t\n");
-        args.trios[0].imother = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[0]);
-        args.trios[0].ifather = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[1]);
-        args.trios[0].ichild  = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[2]);
-        if ( args.trios[0].imother<0 ) error("The sample is not present in the VCF: %s\n",list[0]);
-        if ( args.trios[0].ifather<0 ) error("The sample is not present in the VCF: %s\n",list[1]);
-        if ( args.trios[0].ichild<0 )  error("The sample is not present in the VCF: %s\n",list[2]);
-        for (i=0; i<n; i++) free(list[i]);
-        free(list);
-    }
-    if ( trio_file )
-    {
-        list = hts_readlist(trio_file, 1, &n);
-        if ( !list ) error("Error: could not read file %s\n",trio_file);
-        args.ntrios = n;
-        args.trios = (trio_t*) calloc(n,sizeof(trio_t));
-        for (i=0; i<n; i++)
-        {
-            char *ss = list[i], *se;
-            se = strchr(ss, ',');
-            if ( !se ) error("Could not parse %s: %s\n",trio_file, ss);
-            *se = 0;
-            args.trios[i].imother = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss);
-            if ( args.trios[i].imother<0 ) error("No such sample: \"%s\"\n", ss);
-            ss = ++se;
-            se = strchr(ss, ',');
-            if ( !se ) error("Could not parse %s\n",trio_file);
-            *se = 0;
-            args.trios[i].ifather = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss);
-            if ( args.trios[i].ifather<0 ) error("No such sample: \"%s\"\n", ss);
-            ss = ++se;
-            if ( *ss=='\0' ) error("Could not parse %s\n",trio_file);
-            args.trios[i].ichild = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss);
-            if ( args.trios[i].ichild<0 ) error("No such sample: \"%s\"\n", ss);
-            free(list[i]);
-        }
-        free(list);
-    }
-    if ( ped_fname ) parse_ped(&args, ped_fname);
-
-    while ( bcf_sr_next_line(args.sr) )
-    {
-        bcf1_t *line = bcf_sr_get_line(args.sr,0);
-        line = process(line);
-        if ( line )
-        {
-            if ( line->errcode ) error("TODO: Unchecked error (%d), exiting\n",line->errcode);
-            if ( args.out_fh && bcf_write1(args.out_fh, args.hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args.output_fname);
-        }
-    }
-    if ( args.out_fh && hts_close(args.out_fh)!=0 ) error("Error: close failed\n");
-
-    if ( args.mode & MODE_COUNT )
-    {
-        fprintf(log_fh,"# [1]nOK\t[2]nBad\t[3]nSkipped\t[4]Trio (mother,father,child)\n");
-        for (i=0; i<args.ntrios; i++)
-        {
-            trio_t *trio = &args.trios[i];
-            fprintf(log_fh,"%d\t%d\t%d\t%s,%s,%s\n",
-                    trio->nok,trio->nbad,args.nrec-(trio->nok+trio->nbad),
-                    bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->imother),
-                    bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ifather),
-                    bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ichild)
-                   );
-        }
-    }
-    if ( log_fh!=stderr && log_fh!=stdout && fclose(log_fh) ) error("Error: close failed for %s\n", args.output_fname);
-
-    free(args.gt_arr);
-    free(args.trios);
-    regitr_destroy(args.itr);
-    regitr_destroy(args.itr_ori);
-    regidx_destroy(args.rules);
-    bcf_sr_destroy(args.sr);
-    return 0;
-}
-
-static void warn_ploidy(bcf1_t *rec)
-{
-    static int warned = 0;
-    if ( warned ) return;
-    fprintf(stderr,"Incorrect ploidy at %s:%"PRId64", skipping the trio. (This warning is printed only once.)\n", bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1);
-    warned = 1;
-}
-
-bcf1_t *process(bcf1_t *rec)
-{
-    bcf1_t *dflt = args.mode&MODE_LIST_SKIP ? rec : NULL;
-    args.nrec++;
-
-    if ( rec->n_allele > 63 ) return dflt;      // we use 64bit bitmask below
-
-    int ngt = bcf_get_genotypes(args.hdr, rec, &args.gt_arr, &args.ngt_arr);
-    if ( ngt<0 ) return dflt;
-    if ( ngt!=2*bcf_hdr_nsamples(args.hdr) && ngt!=bcf_hdr_nsamples(args.hdr) ) return dflt;
-    ngt /= bcf_hdr_nsamples(args.hdr);
-
-    int itr_set = regidx_overlap(args.rules, bcf_seqname(args.hdr,rec),rec->pos,rec->pos, args.itr_ori);
-
-    int i, nbad = 0, ngood = 0, needs_update = 0;
-    for (i=0; i<args.ntrios; i++)
-    {
-        int32_t a,b,c,d,e,f;
-        trio_t *trio = &args.trios[i];
-
-        if ( trio->imother<0 )
-        {
-            a = bcf_gt_missing;
-            b = bcf_int32_vector_end;
-        }
-        else
-        {
-            a = args.gt_arr[ngt*trio->imother];
-            b = ngt==2 ? args.gt_arr[ngt*trio->imother+1] : bcf_int32_vector_end;
-        }
-        if ( trio->ifather<0 )
-        {
-            c = bcf_gt_missing;
-            d = bcf_int32_vector_end;
-        }
-        else
-        {
-          c = args.gt_arr[ngt*trio->ifather];
-          d = ngt==2 ? args.gt_arr[ngt*trio->ifather+1] : bcf_int32_vector_end;
-        }
-        e = args.gt_arr[ngt*trio->ichild];
-        f = ngt==2 ? args.gt_arr[ngt*trio->ichild+1] : bcf_int32_vector_end;
-
-        // skip sites with missing data in child
-        if ( bcf_gt_is_missing(e) || bcf_gt_is_missing(f) ) continue;
-
-        uint64_t mother = 0, father = 0,child1,child2;
-
-        int is_ok = 0;
-        if ( !itr_set )
-        {
-            if ( f==bcf_int32_vector_end ) { warn_ploidy(rec); continue; }
-
-            // All M,F,C genotypes are diploid. Missing data are considered consistent.
-            child1 = 1<<bcf_gt_allele(e);
-            child2 = 1<<bcf_gt_allele(f);
-            mother  = bcf_gt_is_missing(a) ? child1|child2 : 1<<bcf_gt_allele(a);
-            mother |= bcf_gt_is_missing(b) || b==bcf_int32_vector_end ? child1|child2 : 1<<bcf_gt_allele(b);
-            father  = bcf_gt_is_missing(c) ? child1|child2 : 1<<bcf_gt_allele(c);
-            father |= bcf_gt_is_missing(d) || d==bcf_int32_vector_end ? child1|child2 : 1<<bcf_gt_allele(d);
-
-            if ( (mother&child1 && father&child2) || (mother&child2 && father&child1) ) is_ok = 1;
-        }
-        else
-        {
-            child1  = 1<<bcf_gt_allele(e);
-            child2  = bcf_gt_is_missing(f) || f==bcf_int32_vector_end ? 0 : 1<<bcf_gt_allele(f);
-            mother |= bcf_gt_is_missing(a) ? 0 : 1<<bcf_gt_allele(a);
-            mother |= bcf_gt_is_missing(b) || b==bcf_int32_vector_end ? 0 : 1<<bcf_gt_allele(b);
-            father |= bcf_gt_is_missing(c) ? 0 : 1<<bcf_gt_allele(c);
-            father |= bcf_gt_is_missing(d) || d==bcf_int32_vector_end ? 0 : 1<<bcf_gt_allele(d);
-
-            regitr_copy(args.itr, args.itr_ori);
-            while ( !is_ok && regitr_overlap(args.itr) )
-            {
-                rule_t *rule = &regitr_payload(args.itr,rule_t);
-                if ( child1 && child2 )
-                {
-                    if ( !rule->mal || !rule->fal ) continue;   // wrong rule (haploid), but this is a diploid GT
-                    if ( !mother ) mother = child1|child2;
-                    if ( !father ) father = child1|child2;
-                    if ( (mother&child1 && father&child2) || (mother&child2 && father&child1) ) is_ok = 1;
-                    continue;
-                }
-                if ( rule->mal )
-                {
-                    if ( mother && !(child1&mother) ) continue;
-                }
-                if ( rule->fal )
-                {
-                    if ( father && !(child1&father) ) continue;
-                }
-                is_ok = 1;
-            }
-        }
-        if ( is_ok )
-        {
-            trio->nok++;
-            ngood++;
-        }
-        else
-        {
-            trio->nbad++;
-            nbad++;
-            if ( args.mode&MODE_DELETE )
-            {
-                args.gt_arr[ngt*trio->imother] = bcf_gt_missing;
-                if ( b!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->imother+1] = bcf_gt_missing; // should be always true
-                args.gt_arr[ngt*trio->ifather] = bcf_gt_missing;
-                if ( d!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->ifather+1] = bcf_gt_missing;
-                args.gt_arr[ngt*trio->ichild] = bcf_gt_missing;
-                if ( f!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->ichild+1]  = bcf_gt_missing;
-                needs_update = 1;
-            }
-        }
-    }
-
-    if ( needs_update && bcf_update_genotypes(args.hdr,rec,args.gt_arr,ngt*bcf_hdr_nsamples(args.hdr)) )
-        error("Could not update GT field at %s:%"PRId64"\n", bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1);
-
-    if ( args.mode&MODE_ANNOTATE ) bcf_update_info_int32(args.hdr, rec, "MERR", &nbad, 1);
-    if ( args.mode&MODE_LIST_GOOD && ngood ) return rec;
-    if ( args.mode&MODE_LIST_BAD && nbad ) return rec;
-    if ( args.mode&MODE_LIST_SKIP && !ngood && !nbad ) return rec;
-
-    return NULL;
-}
diff --git a/plugins/mendelian2.c b/plugins/mendelian2.c
index f1d5c7b02..30df1186b 100644
--- a/plugins/mendelian2.c
+++ b/plugins/mendelian2.c
@@ -1,6 +1,6 @@
 /* The MIT License
 
-   Copyright (c) 2015-2022 Genome Research Ltd.
+   Copyright (c) 2015-2023 Genome Research Ltd.
 
    Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -114,6 +114,8 @@ typedef struct _args_t
     int ngt_arr;
     stats_t stats;              // common per-site and per-sample stats
     int nref_only, nmany_als;   // per-site stats
+    char *index_fn;
+    int write_index;
 }
 args_t;
 
@@ -140,6 +142,7 @@ static const char *usage_text(void)
         "   -T, --targets-file FILE         Similar to -R but streams rather than index-jumps\n"
         "       --targets-overlap 0|1|2     Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"
         "       --no-version                Do not append version and command line to the header\n"
+        "       --write-index               Automatically index the output files [off]\n"
         "\n"
         "Options:\n"
         "   -m, --mode c|[adeEgmMS]         Output mode, the default is `-m c`. Multiple modes can be combined in VCF/BCF\n"
@@ -476,6 +479,7 @@ static void init_data(args_t *args)
         args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
         if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
         if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+        if ( args->write_index && init_index(args->out_fh,args->hdr_out,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
     }
 }
 
@@ -488,7 +492,19 @@ static void destroy_data(args_t *args)
     free(args->trio);
     free(args->gt_arr);
     free(args->rule);
-    if ( args->out_fh && hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
+    if ( args->out_fh )
+    {
+        if ( args->write_index )
+        {
+            if ( bcf_idx_save(args->out_fh)<0 )
+            {
+                if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+                error("Error: cannot write to index %s\n", args->index_fn);
+            }
+            free(args->index_fn);
+        }
+        if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
+    }
     bcf_hdr_destroy(args->hdr_out);
     bcf_sr_destroy(args->sr);
     free(args);
@@ -765,13 +781,9 @@ int run(int argc, char **argv)
 
     static struct option loptions[] =
     {
-        {"trio",1,0,'t'},
-        {"trio-file",1,0,'T'},
-        {"ped",1,0,'p'},
-        {"delete",0,0,'d'},
-        {"list",1,0,'l'},
+        {"pfm",1,0,'p'},
+        {"ped",1,0,'P'},
         {"mode",1,0,'m'},
-        {"count",0,0,'c'},
         {"rules",1,0,1},
         {"rules-file",1,0,2},
         {"output",required_argument,NULL,'o'},
@@ -784,11 +796,12 @@ int run(int argc, char **argv)
         {"targets-overlap",required_argument,NULL,15},
         {"include",required_argument,0,'i'},
         {"exclude",required_argument,0,'e'},
+        {"write-index",no_argument,NULL,3},
         {0,0,0,0}
     };
     int c;
     char *tmp;
-    while ((c = getopt_long(argc, argv, "?ht:T:p:m:o:O:i:e:t:T:r:R:",loptions,NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "?hp:P:m:o:O:i:e:t:T:r:R:",loptions,NULL)) >= 0)
     {
         switch (c)
         {
@@ -843,6 +856,7 @@ int run(int argc, char **argv)
             case 'p': args->pfm = optarg; break;
             case  1 : args->rules_str = optarg; break;
             case  2 : args->rules_fname = optarg; break;
+            case  3 : args->write_index = 1; break;
             case 'h':
             case '?':
             default: error("%s",usage_text()); break;
diff --git a/plugins/prune.c b/plugins/prune.c
index 57ae83a5a..1593e7306 100644
--- a/plugins/prune.c
+++ b/plugins/prune.c
@@ -1,5 +1,5 @@
-/* 
-    Copyright (C) 2017-2021 Genome Research Ltd.
+/*
+    Copyright (C) 2017-2023 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -9,10 +9,10 @@
     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     copies of the Software, and to permit persons to whom the Software is
     furnished to do so, subject to the following conditions:
-    
+
     The above copyright notice and this permission notice shall be included in
     all copies or substantial portions of the Software.
-    
+
     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -69,6 +69,8 @@ typedef struct
     htsFile *out_fh;
     bcf_hdr_t *hdr;
     bcf_srs_t *sr;
+    char *index_fn;
+    int write_index;
 }
 args_t;
 
@@ -79,7 +81,7 @@ const char *about(void)
 
 static const char *usage_text(void)
 {
-    return 
+    return
         "\n"
         "About: Prune sites by missingness or linkage disequilibrium.\n"
         "\n"
@@ -103,6 +105,7 @@ static const char *usage_text(void)
         "   -t, --targets REGION            Similar to -r but streams rather than index-jumps\n"
         "   -T, --targets-file FILE         Similar to -R but streams rather than index-jumps\n"
         "   -w, --window INT[bp|kb|Mb]      The window size of INT sites or INT bp/kb/Mb for the -n/-l options [100kb]\n"
+        "       --write-index               Automatically index the output files [off]\n"
         "Examples:\n"
         "   # Discard records with r2 bigger than 0.6 in a window of 1000 sites\n"
         "   bcftools +prune -m 0.6 -w 1000 input.bcf -Ob -o output.bcf\n"
@@ -183,6 +186,7 @@ static void init_data(args_t *args)
         }
     }
     if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+    if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
     args->ld_filter_id = -1;
     if ( args->ld_filter && strcmp(".",args->ld_filter) )
         args->ld_filter_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, args->ld_filter);
@@ -211,6 +215,15 @@ static void destroy_data(args_t *args)
 {
     if ( args->filter )
         filter_destroy(args->filter);
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(args->out_fh)<0 )
+        {
+            if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
     if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
     vcfbuf_destroy(args->vcfbuf);
     bcf_sr_destroy(args->sr);
@@ -303,20 +316,22 @@ int run(int argc, char **argv)
         {"nsites-per-win",required_argument,NULL,'n'},
         {"nsites-per-win-mode",required_argument,NULL,'N'},
         {"window",required_argument,NULL,'w'},
+        {"write-index",no_argument,NULL,4},
         {NULL,0,NULL,0}
     };
     int c;
     char *tmp;
     while ((c = getopt_long(argc, argv, "vr:R:t:T:m:o:O:a:f:i:e:n:N:w:k",loptions,NULL)) >= 0)
     {
-        switch (c) 
+        switch (c)
         {
             case  1 : args->rand_missing = 1; break;
             case  2 : args->af_tag = optarg; break;
-            case  3 : 
+            case  3 :
                 args->rseed = strtol(optarg,&tmp,10);
                 if ( tmp==optarg || *tmp ) error("Could not parse: --random-seed %s\n", optarg);
                 break;
+            case  4 : args->write_index = 1; break;
             case 'k': args->keep_sites = 1; break;
             case 'e':
                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
@@ -324,7 +339,7 @@ int run(int argc, char **argv)
             case 'i':
                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
                 args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
-            case 'a': 
+            case 'a':
                 {
                     int n, i;
                     char **tag = hts_readlist(optarg,0,&n);
@@ -352,9 +367,9 @@ int run(int argc, char **argv)
                     free(tag);
                     args->ld_mask |= LD_ANNOTATE;
                 }
-                break; 
+                break;
             case 'f': args->ld_filter = optarg; break;
-            case 'n': 
+            case 'n':
                 args->nsites = strtod(optarg,&tmp);
                 if ( tmp==optarg || *tmp ) error("Could not parse: --nsites-per-win %s\n", optarg);
                 break;
@@ -364,7 +379,7 @@ int run(int argc, char **argv)
                 else if ( !strcasecmp(optarg,"rand") ) args->nsites_mode = optarg;
                 else error("The mode \"%s\" is not recognised\n",optarg);
                 break;
-            case 'm': 
+            case 'm':
                 if ( !strncasecmp("R2=",optarg,3) )
                 {
                     args->ld_max_set[VCFBUF_LD_IDX_R2] = 1;
@@ -388,7 +403,7 @@ int run(int argc, char **argv)
                 if ( !tmp || *tmp ) error("Could not parse: --max %s\n", optarg);
                 args->ld_mask |= LD_SET_MAX;
                 break;
-            case 'w': 
+            case 'w':
                 args->ld_win = strtod(optarg,&tmp);
                 if ( !*tmp ) break;
                 if ( tmp==optarg ) error("Could not parse: --window %s\n", optarg);
@@ -398,9 +413,9 @@ int run(int argc, char **argv)
                 else error("Could not parse: --window %s\n", optarg);
                 break;
             case 'T': args->target_is_file = 1; // fall-through
-            case 't': args->target = optarg; break; 
+            case 't': args->target = optarg; break;
             case 'R': args->region_is_file = 1; // fall-through
-            case 'r': args->region = optarg; break; 
+            case 'r': args->region = optarg; break;
             case 'o': args->output_fname = optarg; break;
             case 'O':
                       switch (optarg[0]) {
@@ -439,7 +454,7 @@ int run(int argc, char **argv)
     else args->fname = argv[optind];
 
     init_data(args);
-    
+
     while ( bcf_sr_next_line(args->sr) ) process(args);
     flush(args,1);
 
diff --git a/plugins/remove-overlaps.c b/plugins/remove-overlaps.c
index 2e8e6b0dd..bd0304497 100644
--- a/plugins/remove-overlaps.c
+++ b/plugins/remove-overlaps.c
@@ -1,5 +1,5 @@
-/* 
-    Copyright (C) 2017-2021 Genome Research Ltd.
+/*
+    Copyright (C) 2017-2023 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -9,10 +9,10 @@
     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     copies of the Software, and to permit persons to whom the Software is
     furnished to do so, subject to the following conditions:
-    
+
     The above copyright notice and this permission notice shall be included in
     all copies or substantial portions of the Software.
-    
+
     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -52,6 +52,8 @@ typedef struct
     htsFile *out_fh;
     bcf_hdr_t *hdr;
     bcf_srs_t *sr;
+    char *index_fn;
+    int write_index;
 }
 args_t;
 
@@ -62,7 +64,7 @@ const char *about(void)
 
 static const char *usage_text(void)
 {
-    return 
+    return
         "\n"
         "About: Remove overlapping variants.\n"
         "\n"
@@ -80,6 +82,7 @@ static const char *usage_text(void)
         "   -R, --regions-file FILE         restrict to regions listed in a file\n"
         "   -t, --targets REGION            similar to -r but streams rather than index-jumps\n"
         "   -T, --targets-file FILE         similar to -R but streams rather than index-jumps\n"
+        "       --write-index               Automatically index the output files [off]\n"
         "\n";
 }
 
@@ -100,6 +103,7 @@ static void init_data(args_t *args)
     args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
     if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
     if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+    if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
 
     args->vcfbuf = vcfbuf_init(args->hdr, 0);
     if ( args->rmdup )
@@ -114,6 +118,15 @@ static void destroy_data(args_t *args)
 {
     if ( args->filter )
         filter_destroy(args->filter);
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(args->out_fh)<0 )
+        {
+            if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
     if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
     vcfbuf_destroy(args->vcfbuf);
     bcf_sr_destroy(args->sr);
@@ -168,13 +181,14 @@ int run(int argc, char **argv)
         {"output",required_argument,NULL,'o'},
         {"output-type",required_argument,NULL,'O'},
         {"verbose",no_argument,NULL,'v'},
+        {"write-index",no_argument,NULL,1},
         {NULL,0,NULL,0}
     };
     int c;
     char *tmp;
     while ((c = getopt_long(argc, argv, "r:R:t:T:o:O:i:e:vpd",loptions,NULL)) >= 0)
     {
-        switch (c) 
+        switch (c)
         {
             case 'd': args->rmdup = 1; break;
             case 'p': args->print_overlaps = 1; break;
@@ -186,9 +200,9 @@ int run(int argc, char **argv)
                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
                 args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
             case 'T': args->target_is_file = 1; // fall-through
-            case 't': args->target = optarg; break; 
+            case 't': args->target = optarg; break;
             case 'R': args->region_is_file = 1; // fall-through
-            case 'r': args->region = optarg; break; 
+            case 'r': args->region = optarg; break;
             case 'o': args->output_fname = optarg; break;
             case 'O':
                       switch (optarg[0]) {
@@ -208,6 +222,7 @@ int run(int argc, char **argv)
                           if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
                       }
                       break;
+            case  1 : args->write_index = 1; break;
             case 'h':
             case '?':
             default: error("%s", usage_text()); break;
@@ -223,7 +238,7 @@ int run(int argc, char **argv)
     else args->fname = argv[optind];
 
     init_data(args);
-    
+
     while ( bcf_sr_next_line(args->sr) ) process(args);
     flush(args,1);
 
diff --git a/plugins/scatter.c b/plugins/scatter.c
index af358fc4f..e42edd877 100644
--- a/plugins/scatter.c
+++ b/plugins/scatter.c
@@ -1,6 +1,6 @@
 /* The MIT License
 
-    Copyright (C) 2020-2021 Giulio Genovese
+    Copyright (C) 2020-2023 Giulio Genovese
 
     Author: Giulio Genovese <giulio.genovese@gmail.com>
 
@@ -39,6 +39,7 @@ typedef struct
 {
     htsFile *fh;        // output file handle
     char *fname;        // output file name
+    char *index_fn;
 }
 subset_t;
 
@@ -60,6 +61,7 @@ typedef struct
     char **hts_opts;
     int nhts_opts;
     bcf_hdr_t *hdr;
+    int write_index;
 }
 args_t;
 
@@ -95,6 +97,7 @@ static const char *usage_text(void)
         "   -x, --extra STRING              Output records not overlapping listed regions in separate file\n"
         "   -p, --prefix STRING             Prepend string to output VCF names\n"
         "       --hts-opts LIST             Low-level options to pass to HTSlib, e.g. block_size=32768\n"
+        "       --write-index               Automatically index the output files [off]\n"
         "\n"
         "Examples:\n"
         "   # Scatter a VCF file by shards with 10000 variants each\n"
@@ -200,6 +203,7 @@ static void open_set(subset_t *set, args_t *args)
         if ( args->record_cmd_line ) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_plugin");
     }
     if ( bcf_hdr_write(set->fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__, args->str.s);
+    if ( args->write_index && init_index(set->fh,args->hdr,args->str.s,&set->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->str.s);
 }
 
 static void init_data(args_t *args)
@@ -260,7 +264,17 @@ static void destroy_data(args_t *args)
     for (i=0; i<args->nsets; i++)
     {
         subset_t *set = &args->sets[i];
-        if (set->fname) {
+        if (set->fname)
+        {
+            if ( args->write_index )
+            {
+                if ( bcf_idx_save(set->fh)<0 )
+                {
+                    if ( hts_close(set->fh)!=0 ) error("Error: close failed .. %s\n", set->fname);
+                    error("Error: cannot write to index %s\n", set->index_fn);
+                }
+                free(set->index_fn);
+            }
             if ( hts_close(set->fh)!=0 ) error("Error: close failed .. %s\n", set->fname);
             free(set->fname);
         }
@@ -338,6 +352,7 @@ int run(int argc, char **argv)
         {"extra",required_argument,NULL,'x'},
         {"prefix",required_argument,NULL,'p'},
         {"hts-opts",required_argument,NULL,5},
+        {"write-index",no_argument,NULL,6},
         {NULL,0,NULL,0}
     };
     int c;
@@ -395,6 +410,7 @@ int run(int argc, char **argv)
             case 'x': args->extra = optarg;  break;
             case 'p': args->prefix = optarg;  break;
             case  5 : args->hts_opts = hts_readlist(optarg, 0, &args->nhts_opts); break;
+            case  6 : args->write_index = 1; break;
             case 'h':
             case '?':
             default: error("%s", usage_text()); break;
diff --git a/plugins/split-vep.c b/plugins/split-vep.c
index e5dfeb14a..82c1ff0bf 100644
--- a/plugins/split-vep.c
+++ b/plugins/split-vep.c
@@ -127,6 +127,8 @@ typedef struct
     int allow_undef_tags;
     int genes_mode;             // --gene-list +FILE, one of GENES_* mode, prioritize or restrict
     int print_header;
+    char *index_fn;
+    int write_index;
 }
 args_t;
 
@@ -208,8 +210,8 @@ static const char *usage_text(void)
         "   -d, --duplicate                 Output per transcript/allele consequences on a new line rather rather than\n"
         "                                     as comma-separated fields on a single line\n"
         "   -f, --format STR                Create non-VCF output; similar to `bcftools query -f` but drops lines w/o consequence\n"
-        "   -g, --gene-list [+]FILE         Consider only genes listed in FILE, or prioritize if FILE is prefixed with \"+\"\n"
-        "       --gene-list-fields LIST     Use these fields when matching genes from the -g list [SYMBOL,Gene,gene]\n"
+        "   -g, --gene-list [+]FILE         Consider only features listed in FILE, or prioritize if FILE is prefixed with \"+\"\n"
+        "       --gene-list-fields LIST     Fields to match against by the -g list, by default gene names [SYMBOL,Gene,gene]\n"
         "   -H, --print-header              Print header\n"
         "   -l, --list                      Parse the VCF header and list the annotation fields\n"
         "   -p, --annot-prefix STR          Before doing anything else, prepend STR to all CSQ fields to avoid tag name conflicts\n"
@@ -220,8 +222,8 @@ static const char *usage_text(void)
         "   -S, --severity -|FILE           Pass \"-\" to print the default severity scale or FILE to override\n"
         "                                     the default scale\n"
         "   -u, --allow-undef-tags          Print \".\" for undefined tags\n"
-        "   -x, --drop-sites                Drop sites with none of the consequences matching the severity specified by -s.\n"
-        "                                      This switch is intended for use with VCF/BCF output (i.e. -f not given).\n"
+        "   -x, --drop-sites                Drop sites without consequences (the default with -f)\n"
+        "   -X, --keep-sites                Do not drop sites without consequences (the default without -f)\n"
         "Common options:\n"
         "   -e, --exclude EXPR              Exclude sites and samples for which the expression is true\n"
         "   -i, --include EXPR              Include sites and samples for which the expression is true\n"
@@ -234,6 +236,7 @@ static const char *usage_text(void)
         "   -t, --targets REG               Similar to -r but streams rather than index-jumps\n"
         "   -T, --targets-file FILE         Similar to -R but streams rather than index-jumps\n"
         "       --targets-overlap 0|1|2     Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"
+        "       --write-index               Automatically index the output files [off]\n"
         "\n"
         "Examples:\n"
         "   # List available fields of the INFO/CSQ annotation\n"
@@ -546,6 +549,7 @@ static void parse_format_str(args_t *args)
 // The program was requested to extract one or more columns via -c. It can contain names,  0-based indexes or ranges of indexes
 static void parse_column_str(args_t *args)
 {
+    if ( args->nannot ) return; // already called from parse_filter_str
     int i,j;
     int *column = NULL;
     int *types  = NULL;
@@ -693,7 +697,6 @@ static void parse_column_str(args_t *args)
 // as if the user passed them via the -c option.
 static void parse_filter_str(args_t *args)
 {
-    int max_unpack = args->convert ? convert_max_unpack(args->convert) : 0;
     args->filter = filter_parse(args->hdr_out, args->filter_str);
     if ( !args->filter ) error(NULL);     // this type of error would have been reported
     int ret = filter_status(args->filter);
@@ -706,9 +709,7 @@ static void parse_filter_str(args_t *args)
         const char **tags = filter_list_undef_tags(args->filter, &ntags);
         kstring_t str;
         str.s = args->column_str;
-        str.l = str.m = strlen(str.s);
-        destroy_annot(args);
-        destroy_column2type(args);
+        str.l = str.m = str.s ? strlen(str.s) : 0;
         for (i=0; i<ntags; i++)
         {
             if ( khash_str2int_get(args->field2idx,tags[i],&j)!=0 )
@@ -721,11 +722,10 @@ static void parse_filter_str(args_t *args)
         filter_destroy(args->filter);
         args->filter = filter_init(args->hdr_out, args->filter_str);
     }
-    max_unpack |= filter_max_unpack(args->filter);
-    if ( !args->format_str ) max_unpack |= BCF_UN_FMT;      // don't drop FMT fields on VCF input when VCF/BCF is output
-    args->sr->max_unpack = max_unpack;
-    if ( args->convert && (max_unpack & BCF_UN_FMT) )
-        convert_set_option(args->convert, subset_samples, &args->smpl_pass);
+    int ntags, i;
+    const char **tags = filter_list_used_tags(args->filter, &ntags);
+    for (i=0; i<ntags; i++)
+        if ( !strncmp("INFO/",tags[i],5) && !strcmp(tags[i]+5,args->vep_tag) ) args->raw_vep_request = 1;
 }
 static void init_data(args_t *args)
 {
@@ -864,6 +864,7 @@ static void init_data(args_t *args)
     free(tmp);
 
     if ( args->format_str ) parse_format_str(args);    // Text output, e.g. bcftools +split-vep -f '%Consequence\n'
+    if ( args->filter_str ) parse_filter_str(args);
     if ( args->column_str ) parse_column_str(args);    // The --columns option was given, update the header
     if ( args->format_str )
     {
@@ -871,10 +872,18 @@ static void init_data(args_t *args)
         args->convert = convert_init(args->hdr_out, NULL, 0, args->format_str);
         if ( !args->convert ) error("Could not parse the expression: %s\n", args->format_str);
         if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1);
+        convert_set_option(args->convert, force_newline, 1);
     }
-    if ( args->filter_str ) parse_filter_str(args);
     if ( args->genes_fname ) init_gene_list(args);
 
+    int max_unpack = BCF_UN_SHR;
+    if ( args->convert ) max_unpack |= convert_max_unpack(args->convert);
+    if ( args->filter ) max_unpack |= filter_max_unpack(args->filter);
+    if ( !args->format_str ) max_unpack |= BCF_UN_FMT;      // don't drop FMT fields on VCF input when VCF/BCF is output
+    args->sr->max_unpack = max_unpack;
+    if ( args->convert && (max_unpack & BCF_UN_FMT) )
+        convert_set_option(args->convert, subset_samples, &args->smpl_pass);
+
     free(str.s);
 }
 static void destroy_data(args_t *args)
@@ -903,7 +912,19 @@ static void destroy_data(args_t *args)
     free(args->csq_str);
     if ( args->filter ) filter_destroy(args->filter);
     if ( args->convert ) convert_destroy(args->convert);
-    if ( args->fh_vcf && hts_close(args->fh_vcf)!=0 ) error("Error: close failed .. %s\n",args->output_fname);
+    if ( args->fh_vcf )
+    {
+        if ( args->write_index )
+        {
+            if ( bcf_idx_save(args->fh_vcf)<0 )
+            {
+                if ( hts_close(args->fh_vcf)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+                error("Error: cannot write to index %s\n", args->index_fn);
+            }
+            free(args->index_fn);
+        }
+        if ( hts_close(args->fh_vcf)!=0 ) error("Error: close failed .. %s\n",args->output_fname);
+    }
     if ( args->fh_bgzf && bgzf_close(args->fh_bgzf)!=0 ) error("Error: close failed .. %s\n",args->output_fname);
     free(args);
 }
@@ -1096,7 +1117,7 @@ static void filter_and_output(args_t *args, bcf1_t *rec, int severity_pass, int
     {
         if ( args->nannot )
         {
-            if ( !updated || all_missing ) return;         // the standard case: using -f to print the CSQ subfields, skipping if missing
+            if ( args->drop_sites && (!updated || all_missing) ) return;         // the standard case: using -f to print the CSQ subfields, skipping if missing
         }
         else
         {
@@ -1301,6 +1322,7 @@ int run(int argc, char **argv)
     static struct option loptions[] =
     {
         {"drop-sites",no_argument,0,'x'},
+        {"keep-sites",no_argument,0,'X'},
         {"all-fields",no_argument,0,'A'},
         {"duplicate",no_argument,0,'d'},
         {"format",required_argument,0,'f'},
@@ -1325,11 +1347,12 @@ int run(int argc, char **argv)
         {"targets-overlap",required_argument,NULL,4},
         {"no-version",no_argument,NULL,2},
         {"allow-undef-tags",no_argument,0,'u'},
+        {"write-index",no_argument,NULL,6},
         {NULL,0,NULL,0}
     };
-    int c;
+    int c, drop_sites = -1;
     char *tmp;
-    while ((c = getopt_long(argc, argv, "o:O:i:e:r:R:t:T:lS:s:c:p:a:f:dA:xuHg:",loptions,NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "o:O:i:e:r:R:t:T:lS:s:c:p:a:f:dA:xXuHg:",loptions,NULL)) >= 0)
     {
         switch (c)
         {
@@ -1341,7 +1364,8 @@ int run(int argc, char **argv)
                 else args->all_fields_delim = optarg;
                 break;
             case 'H': args->print_header = 1; break;
-            case 'x': args->drop_sites = 1; break;
+            case 'x': drop_sites = 1; break;
+            case 'X': drop_sites = 0; break;
             case 'd': args->duplicate = 1; break;
             case 'f': args->format_str = strdup(optarg); break;
             case 'g': args->genes_fname = optarg; break;
@@ -1390,12 +1414,14 @@ int run(int argc, char **argv)
                 if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg);
                 break;
             case  5 : args->gene_fields_str = optarg; break;
+            case  6 : args->write_index = 1; break;
             case 'h':
             case '?':
             default: error("%s", usage_text()); break;
         }
     }
-    if ( args->drop_sites && args->format_str ) error("Error: the -x behavior is the default (and only supported) with -f\n");
+    if ( drop_sites==-1 ) drop_sites = args->format_str ? 1 : 0;
+    args->drop_sites = drop_sites;
     if ( args->print_header && !args->format_str ) error("Error: the -H header printing is supported only with -f\n");
     if ( args->all_fields_delim && !args->format_str ) error("Error: the -A option must be used with -f\n");
     if ( args->severity && (!strcmp("?",args->severity) || !strcmp("-",args->severity)) ) error("%s", default_severity());
@@ -1440,6 +1466,7 @@ int run(int argc, char **argv)
             args->fh_vcf = hts_open(args->output_fname ? args->output_fname : "-", wmode);
             if ( args->record_cmd_line ) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_split-vep");
             if ( bcf_hdr_write(args->fh_vcf, args->hdr_out)!=0 ) error("Failed to write the header to %s\n", args->output_fname);
+            if ( args->write_index && init_index(args->fh_vcf,args->hdr_out,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname?args->output_fname:"stdout");  // index the header actually written (hdr_out); output_fname is NULL for stdout
         }
         while ( bcf_sr_next_line(args->sr) )
             process_record(args, bcf_sr_get_line(args->sr,0));
diff --git a/plugins/split.c b/plugins/split.c
index a362e0ed9..011981d42 100644
--- a/plugins/split.c
+++ b/plugins/split.c
@@ -1,5 +1,5 @@
-/* 
-    Copyright (C) 2017-2021 Genome Research Ltd.
+/*
+    Copyright (C) 2017-2023 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -9,10 +9,10 @@
     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     copies of the Software, and to permit persons to whom the Software is
     furnished to do so, subject to the following conditions:
-    
+
     The above copyright notice and this permission notice shall be included in
     all copies or substantial portions of the Software.
-    
+
     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -52,6 +52,7 @@ typedef struct
     char *fname;        // output file name
     filter_t *filter;
     bcf_hdr_t *hdr;
+    char *index_fn;
 }
 subset_t;
 
@@ -70,6 +71,7 @@ typedef struct
     subset_t *sets;
     int nsets, nhts_opts;
     char **hts_opts;
+    int write_index;
 }
 args_t;
 
@@ -80,7 +82,7 @@ const char *about(void)
 
 static const char *usage_text(void)
 {
-    return 
+    return
         "\n"
         "About: Split VCF by sample, creating single- or multi-sample VCFs. The output files are named\n"
         "       by sample names whenever possible, with the characters from the set [ \\t:/\\] replaced\n"
@@ -124,6 +126,7 @@ static const char *usage_text(void)
         "       --targets-overlap 0|1|2     Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"
         "   -T, --targets-file FILE         Similar to -R but streams rather than index-jumps\n"
         "       --hts-opts LIST             Low-level options to pass to HTSlib, e.g. block_size=32768\n"
+        "       --write-index               Automatically index the output files [off]\n"
         "\n"
         "Examples:\n"
         "   # Split a VCF file\n"
@@ -485,6 +488,7 @@ static void init_data(args_t *args)
         for (j=0; j<set->nsmpl; j++)
             set->hdr->samples[j] = set->rename ? set->rename[j] : args->hdr_in->samples[set->smpl[j]];
         if ( bcf_hdr_write(set->fh, set->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,str.s);
+        if ( args->write_index && init_index(set->fh,set->hdr,str.s,&set->index_fn)<0 ) error("Error: failed to initialise index for %s\n",str.s);
         if ( args->filter_str )
             set->filter = filter_init(set->hdr, args->filter_str);
     }
@@ -500,6 +504,15 @@ static void destroy_data(args_t *args)
     for (i=0; i<args->nsets; i++)
     {
         subset_t *set = &args->sets[i];
+        if ( args->write_index )
+        {
+            if ( bcf_idx_save(set->fh)<0 )
+            {
+                if ( hts_close(set->fh)!=0 ) error("Error: close failed .. %s\n", set->fname);
+                error("Error: cannot write to index %s\n", set->index_fn);
+            }
+            free(set->index_fn);
+        }
         if ( hts_close(set->fh)!=0 ) error("Error: close failed .. %s\n",set->fname);
         free(set->fname);
         free(set->smpl);
@@ -596,7 +609,7 @@ static void process(args_t *args)
     bcf_unpack(rec, BCF_UN_ALL);
 
     int i;
-    bcf1_t *out = NULL; 
+    bcf1_t *out = NULL;
     for (i=0; i<args->nsets; i++)
     {
         subset_t *set = &args->sets[i];
@@ -641,13 +654,14 @@ int run(int argc, char **argv)
         {"groups-file",required_argument,NULL,'G'},
         {"output",required_argument,NULL,'o'},
         {"output-type",required_argument,NULL,'O'},
+        {"write-index",no_argument,NULL,4},
         {NULL,0,NULL,0}
     };
     int c;
     char *tmp;
     while ((c = getopt_long(argc, argv, "vr:R:t:T:o:O:i:e:k:S:G:",loptions,NULL)) >= 0)
     {
-        switch (c) 
+        switch (c)
         {
             case  1 : args->hts_opts = hts_readlist(optarg,0,&args->nhts_opts); break;
             case 'k': args->keep_tags = optarg; break;
@@ -658,11 +672,11 @@ int run(int argc, char **argv)
                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
                 args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
             case 'T': args->target = optarg; args->target_is_file = 1; break;
-            case 't': args->target = optarg; break; 
+            case 't': args->target = optarg; break;
             case 'R': args->region = optarg; args->region_is_file = 1;  break;
             case 'S': args->samples_fname = optarg; break;
             case 'G': args->groups_fname = optarg; break;
-            case 'r': args->region = optarg; break; 
+            case 'r': args->region = optarg; break;
             case 'o': args->output_dir = optarg; break;
             case 'O':
                       switch (optarg[0]) {
@@ -690,6 +704,7 @@ int run(int argc, char **argv)
                 args->targets_overlap = parse_overlap_option(optarg);
                 if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg);
                 break;
+            case  4 : args->write_index = 1; break;
             case 'h':
             case '?':
             default: error("%s", usage_text()); break;
@@ -708,7 +723,7 @@ int run(int argc, char **argv)
     if ( args->filter_logic == (FLT_EXCLUDE|FLT_INCLUDE) ) error("Only one of -i or -e can be given.\n");
 
     init_data(args);
-    
+
     while ( bcf_sr_next_line(args->sr) ) process(args);
 
     destroy_data(args);
diff --git a/plugins/trio-dnm2.c b/plugins/trio-dnm2.c
index 4783458b2..7cbf7fbcd 100644
--- a/plugins/trio-dnm2.c
+++ b/plugins/trio-dnm2.c
@@ -125,6 +125,8 @@ typedef struct
     int need_QS;
     int strictly_novel;
     priors_t priors, priors_X, priors_XX;
+    char *index_fn;
+    int write_index;
 }
 args_t;
 
@@ -179,6 +181,7 @@ static const char *usage_text(void)
         "       --use-NAIVE                 A naive calling model which uses only FMT/GT to determine DNMs\n"
         "       --with-pAD                  Do not use FMT/QS but parental FMT/AD\n"
         "       --with-pPL                  Do not use FMT/QS but parental FMT/PL. Equals to DNG with bugs fixed (more FPs, fewer FNs)\n"
+        "       --write-index               Automatically index the output files [off]\n"
         "\n"
         "Example:\n"
         "   # Annotate VCF with FORMAT/DNM, run for a single trio\n"
@@ -767,6 +770,7 @@ static void init_data(args_t *args)
     args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
     if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
     if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+    if ( args->write_index && init_index(args->out_fh,args->hdr_out,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname?args->output_fname:"stdout");  // output_fname is NULL when writing to stdout
 
     if ( args->dnm_score_type & DNM_FLOAT )
         args->dnm_qual_float = (float*) malloc(sizeof(*args->dnm_qual_float)*bcf_hdr_nsamples(args->hdr));
@@ -796,6 +800,15 @@ static void destroy_data(args_t *args)
     free(args->ad);
     free(args->qs);
     free(args->qs3);
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(args->out_fh)<0 )
+        {
+            if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
     if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
     bcf_hdr_destroy(args->hdr_out);
     bcf_sr_destroy(args->sr);
@@ -1582,6 +1595,7 @@ int run(int argc, char **argv)
         {"targets",1,0,'t'},
         {"targets-file",1,0,'T'},
         {"targets-overlap",required_argument,NULL,15},
+        {"write-index",no_argument,NULL,16},
         {NULL,0,NULL,0}
     };
     int c;
@@ -1670,6 +1684,7 @@ int run(int argc, char **argv)
                 args->targets_overlap = parse_overlap_option(optarg);
                 if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg);
                 break;
+            case 16 : args->write_index = 1; break;
             case 'X': args->chrX_list_str = optarg; break;
             case 'u': set_option(args,optarg); break;
             case 'e':
diff --git a/plugins/variant-distance.c b/plugins/variant-distance.c
index a1aeb9aef..1d195c133 100644
--- a/plugins/variant-distance.c
+++ b/plugins/variant-distance.c
@@ -1,5 +1,5 @@
 /*
-    Copyright (C) 2022 Genome Research Ltd.
+    Copyright (C) 2022-2023 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -63,6 +63,8 @@ typedef struct
     bcf_hdr_t *hdr;
     bcf_srs_t *sr;
     vcfbuf_t *buf;
+    char *index_fn;
+    int write_index;
 }
 args_t;
 
@@ -91,6 +93,7 @@ static const char *usage_text(void)
         "   -t, --targets REGION             Similar to -r but streams rather than index-jumps\n"
         "   -T, --targets-file FILE          Similar to -R but streams rather than index-jumps\n"
         "       --targets-overlap 0|1|2      Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"
+        "       --write-index                Automatically index the output files [off]\n"
         "Examples:\n"
         "   bcftools +variant-distance input.bcf -Ob -o output.bcf\n"
         "\n";
@@ -126,6 +129,7 @@ static void init_data(args_t *args)
     if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
 
     if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+    if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname?args->output_fname:"stdout");  // output_fname may be NULL; never pass NULL to %s
 
     args->buf = vcfbuf_init(args->hdr, 0);
     vcfbuf_set_opt(args->buf,int,VCFBUF_DUMMY,1)
@@ -134,6 +138,15 @@ static void destroy_data(args_t *args)
 {
     if ( args->filter )
         filter_destroy(args->filter);
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(args->out_fh)<0 )
+        {
+            if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
     if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
     bcf_sr_destroy(args->sr);
     vcfbuf_destroy(args->buf);
@@ -233,6 +246,7 @@ int run(int argc, char **argv)
         {"targets-overlap",required_argument,NULL,2},
         {"output",required_argument,NULL,'o'},
         {"output-type",required_argument,NULL,'O'},
+        {"write-index",no_argument,NULL,4},
         {NULL,0,NULL,0}
     };
     int c;
@@ -286,6 +300,7 @@ int run(int argc, char **argv)
                           if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
                       }
                       break;
+            case  4 : args->write_index = 1; break;
             case 'h':
             case '?':
             default: error("%s", usage_text()); break;
diff --git a/plugins/variantkey-hex.c b/plugins/variantkey-hex.c
index c126babfc..bb07ac992 100644
--- a/plugins/variantkey-hex.c
+++ b/plugins/variantkey-hex.c
@@ -2,7 +2,7 @@
 
     Copyright (C) 2017-2018 GENOMICS plc.
 
-    Author: Nicola Asuni <nicola.asuni@genomicsplc.com>
+    Author: Nicola Asuni <nicola.asuni@tecnick.com>
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/reheader.c b/reheader.c
index 4458f27bc..ed852173c 100644
--- a/reheader.c
+++ b/reheader.c
@@ -68,7 +68,8 @@ static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_see
     kstring_t key = {0,0,0}, val = {0,0,0}, tmp = {0,0,0};
     char *chr_name = NULL, *p, *q = line + 9;   // skip ##contig=
     char *end = q;
-    int nopen = 1, chr_len = 0;
+    int nopen = 1;
+    hts_pos_t chr_len = 0;
     while ( *end && *end!='\n' ) end++;
     while ( *q && *q!='\n' && nopen>0 )
     {
@@ -118,7 +119,7 @@ static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_see
         if ( !strcmp("ID",key.s) )
         {
             if ( khash_str2int_has_key(chr_seen,val.s) ) continue;
-            chr_len = faidx_seq_len(fai, val.s);
+            chr_len = faidx_seq_len64(fai, val.s);
             if ( chr_len==-1 )
             {
                 free(val.s); free(key.s); free(tmp.s);
@@ -136,7 +137,7 @@ static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_see
         if ( quoted ) kputc('"',&tmp);
     }
     if ( !chr_name ) return end;
-    ksprintf(dst,"##contig=<ID=%s,length=%d%s>",chr_name,chr_len,tmp.l ? tmp.s : "");
+    ksprintf(dst,"##contig=<ID=%s,length=%"PRIhts_pos"%s>",chr_name,chr_len,tmp.l ? tmp.s : "");
     free(key.s); free(val.s); free(tmp.s);
     return q;
 }
@@ -211,7 +212,7 @@ static void update_from_fai(args_t *args)
     for (i=0; i<n; i++)
     {
         if ( khash_str2int_has_key(chr_seen,faidx_iseq(fai,i)) ) continue;
-        ksprintf(&hdr_txt_new,"##contig=<ID=%s,length=%d>\n",faidx_iseq(fai,i),faidx_seq_len(fai,faidx_iseq(fai,i)));
+        ksprintf(&hdr_txt_new,"##contig=<ID=%s,length=%"PRIhts_pos">\n",faidx_iseq(fai,i),faidx_seq_len64(fai,faidx_iseq(fai,i)));
     }
     kputs(tmp+1,&hdr_txt_new);
 
@@ -699,7 +700,7 @@ int main_reheader(int argc, char *argv[])
     int c;
     args_t *args  = (args_t*) calloc(1,sizeof(args_t));
     args->argc    = argc; args->argv = argv;
-    
+
     static struct option loptions[] =
     {
         {"temp-prefix",1,0,'T'},
diff --git a/test/annotate.33.out b/test/annotate.33.out
new file mode 100644
index 000000000..b651a0f9b
--- /dev/null
+++ b/test/annotate.33.out
@@ -0,0 +1,41 @@
+##fileformat=VCFv4.1
+##FILTER=<ID=PASS,Description="All filters passed">
+##INFO=<ID=TEST,Number=1,Type=Integer,Description="Testing Tag">
+##FORMAT=<ID=TT,Number=A,Type=Integer,Description="Testing Tag, with commas and \"escapes\" and escaped escapes combined with \\\"quotes\\\\\"">
+##INFO=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##FORMAT=<ID=GL,Number=G,Type=Float,Description="Genotype Likelihood">
+##FILTER=<ID=q10,Description="Quality below 10">
+##FILTER=<ID=test,Description="Testing filter">
+##contig=<ID=1,assembly=b37,length=249250621>
+##contig=<ID=2,assembly=b37,length=249250621>
+##contig=<ID=3,assembly=b37,length=198022430>
+##contig=<ID=4,assembly=b37,length=191154276>
+##test=<ID=4,IE=5>
+##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta
+##readme=AAAAAA
+##readme=BBBBBB
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes">
+##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
+##INFO=<ID=INDEL,Number=0,Type=Flag,Description="Indicates that the variant is an INDEL.">
+##INFO=<ID=STR,Number=1,Type=String,Description="Test string type">
+##INFO=<ID=XXX,Number=0,Type=Flag,Description="Sites marked with `bcftools annotate -m XXX`">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	A	B
+1	3000150	.	C	T	59.2	PASS	AN=4;AC=2;XXX	GT:GQ	0/1:245	0/1:245
+1	3000151	.	C	T	59.2	PASS	AN=4;AC=2;XXX	GT:DP:GQ	0/1:32:245	0/1:32:245
+1	3062915	id3D	GTTT	G	12.9	q10	DP4=1,2,3,4;AN=4;AC=2;INDEL;STR=test;XXX	GT:GQ:DP:GL	0/1:409:35:-20,-5,-20	0/1:409:35:-20,-5,-20
+1	3062915	idSNP	G	T,C	12.6	test	TEST=5;DP4=1,2,3,4;AN=3;AC=1,1;XXX	GT:TT:GQ:DP:GL	0/1:0,1:409:35:-20,-5,-20,-20,-5,-20	2:0,1:409:35:-20,-5,-20
+1	3106154	.	CAAA	C	342	PASS	AN=4;AC=2;XXX	GT:GQ:DP	0/1:245:32	0/1:245:32
+1	3106154	.	C	CT	59.2	PASS	AN=4;AC=2;XXX	GT:GQ:DP	0/1:245:32	0/1:245:32
+1	3157410	.	GA	G	90.6	q10	AN=4;AC=4;XXX	GT:GQ:DP	1/1:21:21	1/1:21:21
+1	3162006	.	GAA	G	60.2	PASS	AN=4;AC=2;XXX	GT:GQ:DP	0/1:212:22	0/1:212:22
+1	3177144	.	G	T	45	PASS	AN=4;AC=2;XXX	GT:GQ:DP	0/0:150:30	1/1:150:30
+1	3177144	.	G	.	45	PASS	AN=4;AC=0;XXX	GT:GQ:DP	0/0:150:30	0/0:150:30
+1	3184885	.	TAAAA	TA,T	61.5	PASS	AN=4;AC=2,2;XXX	GT:GQ:DP	1/2:12:10	1/2:12:10
+2	3199812	.	G	GTT,GT	82.7	PASS	AN=4;AC=2,2;XXX	GT:GQ:DP	1/2:322:26	1/2:322:26
+3	3212016	.	CTT	C,CT	79	PASS	AN=4;AC=2,2;XXX	GT:GQ:DP	1/2:91:26	1/2:91:26
+4	3258448	.	TACACACAC	T	59.9	PASS	AN=4;AC=2;XXX	GT:GQ:DP	0/1:325:31	0/1:325:31
+4	4000000	.	T	A,C	59.9	PASS	AN=4;AC=2,0;XXX	GT:GQ:DP	0/1:325:31	0/1:325:31
+4	4000001	.	T	A	59.9	PASS	AN=4;AC=2;XXX	GT:GQ:DP	0/1:325:31	0/1:325:31
diff --git a/test/annotate.olap.2.out b/test/annotate.olap.2.out
index eab0ef4fc..8453306df 100644
--- a/test/annotate.olap.2.out
+++ b/test/annotate.olap.2.out
@@ -5,6 +5,7 @@
 ##ALT=<ID=CNV,Description="Copy Number Variation">
 ##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
 ##INFO=<ID=DB,Number=1,Type=String,Description="CNV id from a database">
+##INFO=<ID=XXX,Number=0,Type=Flag,Description="Sites listed in XXX">
 #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
-1	10	.	C	<CNV>	.	.	END=19;DB=cnv10_15
+1	10	.	C	<CNV>	.	.	END=19;DB=cnv10_15;XXX
 1	20	.	C	<CNV>	.	.	END=30
diff --git a/test/annotate34.out b/test/annotate34.out
new file mode 100644
index 000000000..819aa0368
--- /dev/null
+++ b/test/annotate34.out
@@ -0,0 +1,14 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="All filters passed">
+##contig=<ID=chr21,length=45090682>
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
+##FILTER=<ID=HET1,Description="Heterozygous in the first haplotype">
+##FILTER=<ID=HET2,Description="Heterozygous in the second haplotype">
+##INFO=<ID=SVTYPE,Number=1,Type=String,Description="SVTYPE">
+##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="SVLEN">
+##INFO=<ID=END,Number=1,Type=Integer,Description="End coordinate in reference for SV">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	syndip
+chr21	8914240	.	gttccattccattccattcaattccattccattgcattccattccattccattcca	G	30	HET2	SVTYPE=DEL;SVLEN=55;END=8914295	GT:AD	0|.:3,1
+chr21	8914680	.	tattccattccattcc	TATTCCATTCCATTCCATTCCATTCCATTCCATTCCATTCCATTCTAGTTGATTCCATTCCATTCCATCCCGTTCCATTCCATTCCGTTACTTTCTATTCCATTCCATTCCATTCC	30	HET2	SVTYPE=INS;SVLEN=100;END=8914681	GT:AD	0|.:1,1
+chr21	8914690	.	c	CATTCCATTCCATTCCATTCCATTCTAGTTGATTCCATTCCATTCCATCCCGTTCCATTCCATTCCGTTACTTTCT	30	HET2	SVTYPE=INS;SVLEN=75;END=8914691	GT:AD	0|.:2,1
diff --git a/test/annotate34.vcf b/test/annotate34.vcf
new file mode 100644
index 000000000..4b60b89ba
--- /dev/null
+++ b/test/annotate34.vcf
@@ -0,0 +1,13 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="All filters passed">
+##contig=<ID=chr21,length=45090682>
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
+##FILTER=<ID=HET1,Description="Heterozygous in the first haplotype">
+##FILTER=<ID=HET2,Description="Heterozygous in the second haplotype">
+##INFO=<ID=SVTYPE,Number=1,Type=String,Description="SVTYPE">
+##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="SVLEN">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	syndip
+chr21	8914240	.	gttccattccattccattcaattccattccattgcattccattccattccattcca	G	30	HET2	SVTYPE=DEL;SVLEN=55	GT:AD	0|.:3,1
+chr21	8914680	.	tattccattccattcc	TATTCCATTCCATTCCATTCCATTCCATTCCATTCCATTCCATTCTAGTTGATTCCATTCCATTCCATCCCGTTCCATTCCATTCCGTTACTTTCTATTCCATTCCATTCCATTCC	30	HET2	SVTYPE=INS;SVLEN=100	GT:AD	0|.:1,1
+chr21	8914690	.	c	CATTCCATTCCATTCCATTCCATTCTAGTTGATTCCATTCCATTCCATCCCGTTCCATTCCATTCCGTTACTTTCT	30	HET2	SVTYPE=INS;SVLEN=75	GT:AD	0|.:2,1
diff --git a/test/annots34.tab b/test/annots34.tab
new file mode 100644
index 000000000..06b217627
--- /dev/null
+++ b/test/annots34.tab
@@ -0,0 +1,3 @@
+chr21	8914240	8914240	8914295
+chr21	8914680	8914680	8914681
+chr21	8914690	8914690	8914691
diff --git a/test/concat.5.3.out b/test/concat.5.3.out
new file mode 100644
index 000000000..a6eeb44e3
--- /dev/null
+++ b/test/concat.5.3.out
@@ -0,0 +1,8 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="All filters passed">
+##contig=<ID=chr1>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+chr1	1	.	A	C	.	.	.
+chr1	2	.	C	G	.	.	.
+chr1	3	.	G	T	.	.	.
+chr1	4	.	T	A	.	.	.
diff --git a/test/consensus.19.out b/test/consensus.19.out
new file mode 100644
index 000000000..84a241db0
--- /dev/null
+++ b/test/consensus.19.out
@@ -0,0 +1,20 @@
+>1:2-501
+TAC:A:AT:Tga::t+++AT:AaAAAGAACATAACCTACGTATCAACTAAAGTGGTTGTT
+TG:AGAAAAGGAAGACTTAAAAAGAGTCAGTACTAACCTACATAATATATACAATGTTCA
+TTAAATAATAAAATGAGCTCATCATACTTAGGTCATCATAAATATATCTGAAATTCACAA
+ATATTGATCAAATGGTAAAATAGACAAGTAGATTTTAATAGGTTAAACAATTACTGATTC
+TCTTGAAAGAATAAATTTAATATGAGACCTATTTCATTATAATGAACTCACAAATTAGAA
+ACTTCACACTGGGGGCTGGAGAGATGGCTCAGTAGTTAAGAACACTGACTGCTCTTCTGA
+AGGTCCTGAGTTCAAATCCCAGCAACCACATGGTGACTTACAACCATCTGTAATGACATC
+TGATGCCCTCTGGTGTGTCTGAAGACAGCTACAGTGTACTTACATAAAATAATAAATAAA
+TCTTTAAAAACAAAAAAAAAGAA
+>2
+gaagatcttttccttattaaggatctgaagctctgtagatttgtattctattaaacatgg
+A:::attagtgattttccatattctttaagtcattttagagtaatgtgttcttaagat::
+:tcagaaaaacaaaaacttgtgctttcctgtttgaaaaacaaacagctgtggggaatgG+
++++++++tgtcgggacagcctttttatA----------aaataatgttgaggctttgata
+cgtcaaagttatatttcaaatggaatcacttagacctcgtttctgagtgtcaatggccat
+attggggAtttgctgctgccaatgacaGcacaccctgggaatgccccaactacttactac
+aaagcagtgttacatggagaagatcttcaagagtctttttgctagatctttccttggctt
+ttgatgtgactcctctcaataaaatccacagtaatatagtgagtggtctcctgctccaaa
+ccagtatt:cagacacagttaatccagac
diff --git a/test/consensus.21.fa b/test/consensus.21.fa
new file mode 100644
index 000000000..e81503b5b
--- /dev/null
+++ b/test/consensus.21.fa
@@ -0,0 +1,6 @@
+>17
+ACGTACGT
+>18
+ACGTACGT
+>19
+ACGTACGT
diff --git a/test/consensus.21.vcf b/test/consensus.21.vcf
new file mode 100644
index 000000000..0c4dbb702
--- /dev/null
+++ b/test/consensus.21.vcf
@@ -0,0 +1,11 @@
+##fileformat=VCFv4.2
+##reference=file://some/path/human_g1k_v37.fasta
+##contig=<ID=17>
+##contig=<ID=18>
+##contig=<ID=19>
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	a
+19	2	.	C	A	.	.	.	GT	0/0
+19	3	.	G	C	.	.	.	GT	0/0
+19	4	.	T	C	.	.	.	GT	0/1
+19	5	.	A	C	.	.	.	GT	1/1
diff --git a/test/consensus21.1.out b/test/consensus21.1.out
new file mode 100644
index 000000000..7633e2ff2
--- /dev/null
+++ b/test/consensus21.1.out
@@ -0,0 +1,6 @@
+>17
+ACGTACGT
+>18
+ACGTACGT
+>19
+ACGYCCGT
diff --git a/test/consensus7.vcf b/test/consensus7.vcf
index 21d657291..bc27f1f69 100644
--- a/test/consensus7.vcf
+++ b/test/consensus7.vcf
@@ -4,9 +4,9 @@
 ##ALT=<ID=DEL,Description="Deletion">
 ##contig=<ID=1,assembly=b37,length=249250621>
 #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA001
-1	2	.	C	A	.	.	.	GT	0|1
-1	3	.	G	A	.	.	.	GT	0|1
-1	4	.	T	A	.	.	.	GT	0|1
-1	6	.	C	A	.	.	.	GT	0/1
-1	7	.	G	A	.	.	.	GT	0/1
-1	8	.	T	A	.	.	.	GT	0/1
+1	2	.	C	A	.	.	.	GT	0|1|0|1
+1	3	.	G	A	.	.	.	GT	0|1|0|1
+1	4	.	T	A	.	.	.	GT	0|1|0|1
+1	6	.	C	A	.	.	.	GT	0/1/0/1
+1	7	.	G	A	.	.	.	GT	0/1/0/1
+1	8	.	T	A	.	.	.	GT	0/1/0/1
diff --git a/test/convert.tsv b/test/convert.tsv
new file mode 100644
index 000000000..e9f7c7e24
--- /dev/null
+++ b/test/convert.tsv
@@ -0,0 +1,24 @@
+rs001	1	2	A   A
+rs002	1	10	A   G
+rs003	1	14	A   G
+rs004	1	24	T   C
+rs005	1	44	C   G
+rs006	1	53	G   G
+rs007	1	60	G   G
+rs008	1	62	C   C
+rs009	1	75	A   A
+rs010	1	80	G   G
+rs011	1	89	T   T
+rs013	1	99	C   C
+rs014	1	102	G   G
+rs015	1	112	T   T
+rs016	2	5	C   C
+rs017	2	11	C   T
+rs018	2	16	C   C
+rs019	2	20	G   G
+rs020	2	33	C   T
+rs021	2	39	A   A
+rs022	2	44	C   C
+rs023	2	48	C   C
+rs024	2	55	A   A
+rs025	2	59	C   T
diff --git a/test/convert.tsv.vcf b/test/convert.tsv.vcf
new file mode 100644
index 000000000..fec089b66
--- /dev/null
+++ b/test/convert.tsv.vcf
@@ -0,0 +1,31 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="All filters passed">
+##contig=<ID=1,length=150>
+##contig=<ID=2,length=77>
+##contig=<ID=Y,length=40>
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+1	2	.	A	.	.	.	.
+1	10	.	A	G	.	.	.
+1	14	.	A	G	.	.	.
+1	24	.	T	C	.	.	.
+1	44	.	C	G	.	.	.
+1	53	.	G	.	.	.	.
+1	60	.	G	.	.	.	.
+1	62	.	C	.	.	.	.
+1	75	.	A	.	.	.	.
+1	80	.	G	.	.	.	.
+1	89	.	T	.	.	.	.
+1	99	.	C	.	.	.	.
+1	102	.	G	.	.	.	.
+1	112	.	T	.	.	.	.
+2	5	.	C	.	.	.	.
+2	11	.	C	T	.	.	.
+2	16	.	C	.	.	.	.
+2	20	.	G	.	.	.	.
+2	33	.	C	T	.	.	.
+2	39	.	A	.	.	.	.
+2	44	.	C	.	.	.	.
+2	48	.	C	.	.	.	.
+2	55	.	A	.	.	.	.
+2	59	.	C	T	.	.	.
diff --git a/test/csq.chr.out b/test/csq.chr.out
new file mode 100644
index 000000000..81d3f79be
--- /dev/null
+++ b/test/csq.chr.out
@@ -0,0 +1,2 @@
+.
+missense|gtrV|gtrV|protein_coding|+|1L>1I|1T>A
diff --git a/test/csq.nchr.fa b/test/csq.nchr.fa
new file mode 100644
index 000000000..f6f29f3ee
--- /dev/null
+++ b/test/csq.nchr.fa
@@ -0,0 +1,2 @@
+>1
+TTAAGGCTGTTTTTTTATTAATGTCATCGTCCATCCTGCAGGGTTGAACTTGAAAGAATA
diff --git a/test/csq.nchr.gff b/test/csq.nchr.gff
new file mode 100644
index 000000000..0a70b0749
--- /dev/null
+++ b/test/csq.nchr.gff
@@ -0,0 +1,3 @@
+1	.	gene	1	60	.	+	.	ID=gene:gtrV;Name=gtrV;gene_biotype=protein_coding
+1	.	transcript	1	60	.	+	.	ID=transcript:gtrV;Parent=gene:gtrV;gene_biotype=protein_coding
+1	.	CDS	1	60	.	+	.	Parent=transcript:gtrV
diff --git a/test/csq.nchr.vcf b/test/csq.nchr.vcf
new file mode 100644
index 000000000..d29f2f6dd
--- /dev/null
+++ b/test/csq.nchr.vcf
@@ -0,0 +1,6 @@
+##fileformat=VCFv4.2
+##reference=dummy.fa
+##contig=<ID=1,length=1254>
+##FORMAT=<ID=GT,Type=String,Number=1,Description="Genotype">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	smpl
+1	1	.	T	A	.	.	.	GT	0|1
diff --git a/test/csq.ychr.fa b/test/csq.ychr.fa
new file mode 100644
index 000000000..98aea0857
--- /dev/null
+++ b/test/csq.ychr.fa
@@ -0,0 +1,2 @@
+>chr1
+TTAAGGCTGTTTTTTTATTAATGTCATCGTCCATCCTGCAGGGTTGAACTTGAAAGAATA
diff --git a/test/csq.ychr.gff b/test/csq.ychr.gff
new file mode 100644
index 000000000..99326c5b8
--- /dev/null
+++ b/test/csq.ychr.gff
@@ -0,0 +1,3 @@
+chr1	.	gene	1	60	.	+	.	ID=gene:gtrV;Name=gtrV;gene_biotype=protein_coding
+chr1	.	transcript	1	60	.	+	.	ID=transcript:gtrV;Parent=gene:gtrV;gene_biotype=protein_coding
+chr1	.	CDS	1	60	.	+	.	Parent=transcript:gtrV
diff --git a/test/csq.ychr.vcf b/test/csq.ychr.vcf
new file mode 100644
index 000000000..0f1139e2a
--- /dev/null
+++ b/test/csq.ychr.vcf
@@ -0,0 +1,6 @@
+##fileformat=VCFv4.2
+##reference=dummy.fa
+##contig=<ID=chr1,length=1254>
+##FORMAT=<ID=GT,Type=String,Number=1,Description="Genotype">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	smpl
+chr1	1	.	T	A	.	.	.	GT	0|1
diff --git a/test/csq/EDUMMY0003/long-del.txt b/test/csq/EDUMMY0003/long-del.txt
index 3a8fd3fcc..2e3222073 100644
--- a/test/csq/EDUMMY0003/long-del.txt
+++ b/test/csq/EDUMMY0003/long-del.txt
@@ -1,3 +1,3 @@
 28503	TTCACACCTGATGTGCGTCC	T	3_prime_utr|PCGF3|ENST00000430644|NMD,3_prime_utr|PCGF3|ENST00000440452|NMD,5_prime_utr|PCGF3|ENST00000521023|protein_coding,frameshift|PCGF3|ENST00000400151|protein_coding|+|157SSHLMCVLTPDESVLWC*>157SSHLMSLCFGV|28503TTCACACCTGATGTGCGTCC>T
-28503	TTCACACCTGATGTGCGTCC	T	3_prime_utr|PCGF3|ENST00000430644|NMD,3_prime_utr|PCGF3|ENST00000440452|NMD,5_prime_utr|PCGF3|ENST00000521023|protein_coding,frameshift|PCGF3|ENST00000400151|protein_coding|+|157SSHLMCVLTPDESVLWC*>157SSHLMSLCFGV|28503TTCACACCTGATGTGCGTCC>T
+28503	TTCACACCTGATGTGCGTCC	T	3_prime_utr&NMD_transcript|PCGF3|ENST00000430644|NMD,3_prime_utr&NMD_transcript|PCGF3|ENST00000440452|NMD,5_prime_utr|PCGF3|ENST00000521023|protein_coding,frameshift|PCGF3|ENST00000400151|protein_coding|+|157SSHLMCVLTPDESVLWC*>157SSHLMSLCFGV|28503TTCACACCTGATGTGCGTCC>T
 
diff --git a/test/csq/ENSMUST00000121418/filter-problem.2.cmd.out b/test/csq/ENSMUST00000121418/filter-problem.2.cmd.out
index ce3b6f10c..32d88a799 100644
--- a/test/csq/ENSMUST00000121418/filter-problem.2.cmd.out
+++ b/test/csq/ENSMUST00000121418/filter-problem.2.cmd.out
@@ -1,3 +1,3 @@
-25105	G	A	5_prime_utr|Rab3il1|ENSMUST00000121418|protein_coding,5_prime_utr|Rab3il1|ENSMUST00000131407|protein_coding,5_prime_utr|Rab3il1|ENSMUST00000137637|protein_coding,5_prime_utr|Rab3il1|ENSMUST00000149967|protein_coding,synonymous|Rab3il1|ENSMUST00000113161|protein_coding|+|52E|25105G>A,synonymous|Rab3il1|ENSMUST00000117641|protein_coding|+|52E|25105G>A,synonymous|Rab3il1|ENSMUST00000144788|NMD|+|52E|25105G>A
+25105	G	A	5_prime_utr|Rab3il1|ENSMUST00000121418|protein_coding,5_prime_utr|Rab3il1|ENSMUST00000131407|protein_coding,5_prime_utr|Rab3il1|ENSMUST00000137637|protein_coding,5_prime_utr|Rab3il1|ENSMUST00000149967|protein_coding,synonymous&NMD_transcript|Rab3il1|ENSMUST00000144788|NMD|+|52E|25105G>A,synonymous|Rab3il1|ENSMUST00000113161|protein_coding|+|52E|25105G>A,synonymous|Rab3il1|ENSMUST00000117641|protein_coding|+|52E|25105G>A
 25496	ACAAA	ACAAAGCAAA	.
 25500	AACAA	AACAAGACAA	intron|Rab3il1||NMD,intron|Rab3il1||protein_coding
diff --git a/test/csq/ENSMUST00000121418/filter-problem.3.cmd.out b/test/csq/ENSMUST00000121418/filter-problem.3.cmd.out
index 328acbb64..a93bcbd79 100644
--- a/test/csq/ENSMUST00000121418/filter-problem.3.cmd.out
+++ b/test/csq/ENSMUST00000121418/filter-problem.3.cmd.out
@@ -1,3 +1,3 @@
-25105	G	A	5_prime_utr|Rab3il1|ENSMUST00000121418|protein_coding,5_prime_utr|Rab3il1|ENSMUST00000131407|protein_coding,5_prime_utr|Rab3il1|ENSMUST00000137637|protein_coding,5_prime_utr|Rab3il1|ENSMUST00000149967|protein_coding,synonymous|Rab3il1|ENSMUST00000113161|protein_coding|+|52E|25105G>A,synonymous|Rab3il1|ENSMUST00000117641|protein_coding|+|52E|25105G>A,synonymous|Rab3il1|ENSMUST00000144788|NMD|+|52E|25105G>A
+25105	G	A	5_prime_utr|Rab3il1|ENSMUST00000121418|protein_coding,5_prime_utr|Rab3il1|ENSMUST00000131407|protein_coding,5_prime_utr|Rab3il1|ENSMUST00000137637|protein_coding,5_prime_utr|Rab3il1|ENSMUST00000149967|protein_coding,synonymous&NMD_transcript|Rab3il1|ENSMUST00000144788|NMD|+|52E|25105G>A,synonymous|Rab3il1|ENSMUST00000113161|protein_coding|+|52E|25105G>A,synonymous|Rab3il1|ENSMUST00000117641|protein_coding|+|52E|25105G>A
 25496	ACAAA	ACAAAGCAAA	intron|Rab3il1||NMD,intron|Rab3il1||protein_coding
 25500	AACAA	AACAAGACAA	.
diff --git a/test/csq/ENST00000360372/ENST00000360372.fa.fai b/test/csq/ENST00000360372/ENST00000360372.fa.fai
new file mode 100644
index 000000000..0bd0f8f06
--- /dev/null
+++ b/test/csq/ENST00000360372/ENST00000360372.fa.fai
@@ -0,0 +1 @@
+chr1	14286	31	60	61
diff --git a/test/csq/ENST00000479739/short-cds-start-lost.txt b/test/csq/ENST00000479739/short-cds-start-lost.txt
index e1ae5a880..5c5bc9902 100644
--- a/test/csq/ENST00000479739/short-cds-start-lost.txt
+++ b/test/csq/ENST00000479739/short-cds-start-lost.txt
@@ -1,3 +1,3 @@
 25091	T	C	start_lost&splice_region|SH3YL1|ENST00000479739|NMD|-|1M>1V|25091T>C
-25091	T	C	start_lost&splice_region|SH3YL1|ENST00000479739|NMD|-|1M>1V|25091T>C
+25091	T	C	start_lost&splice_region&NMD_transcript|SH3YL1|ENST00000479739|NMD|-|1M>1V|25091T>C
 
diff --git a/test/csq/ENST00000479739/short-cds-start-lost.txt-l b/test/csq/ENST00000479739/short-cds-start-lost.txt-l
index e1ae5a880..5c5bc9902 100644
--- a/test/csq/ENST00000479739/short-cds-start-lost.txt-l
+++ b/test/csq/ENST00000479739/short-cds-start-lost.txt-l
@@ -1,3 +1,3 @@
 25091	T	C	start_lost&splice_region|SH3YL1|ENST00000479739|NMD|-|1M>1V|25091T>C
-25091	T	C	start_lost&splice_region|SH3YL1|ENST00000479739|NMD|-|1M>1V|25091T>C
+25091	T	C	start_lost&splice_region&NMD_transcript|SH3YL1|ENST00000479739|NMD|-|1M>1V|25091T>C
 
diff --git a/test/csq/ENST00000571540/ENST00000571540.fa b/test/csq/ENST00000571540/ENST00000571540.fa
new file mode 100644
index 000000000..9ea3e30e9
--- /dev/null
+++ b/test/csq/ENST00000571540/ENST00000571540.fa
@@ -0,0 +1,84 @@
+>17
+CAGCCCCAGTTCACCCCTCGGGGCGGAGGATCTCCTGAGTGATTCATCAGAACCCCCTGGGCTCAACCAAGTGTCGTCTG
+AAGTGACCTCCCAGCTCTATGCTTCTTTGCGCCTCAGCCGGCAGGCGGAGGCCACGGCCCGAGCCCAGCTGTATTTACCC
+TCCACCTCCCCGCCTCATGAAGGGTTAGACGGCTTCGCCCAAGAATTGAGTCGAAGCTTGTCAGTCGGATTGGAAAAGAA
+CTTGAAGAAAAAGGTGAGGGAAGTGTGTCTTGGAGACCACTGTGGCACTAGACACCAGAGAGTCTTGGGATTGGGGTTGG
+TAAAAATAAAAAGCTTTGATGAGATTTGAACTCTTCCTGTTGGATTTCATATTCCTTTTAACTGCATAGGCAGCCATGCT
+TATAAGGGAGGGAGTGACCTGGGACACCATAATTTGAAAATTATGAAACTTCCCAGTGTTTTTTTGTGAGAGACATCTCT
+GCTCCGAGTAGATAGAGCAAACCTATGGGGTAGGTTGGTGAGCTATTTCCCGTGCACTGGGAATGGGTAAGGTTTCTTGA
+TCCCAAGAGGGAGCTAGGGACTTAGGATAGCAGCTCCGATCCTTCCAGCTCAACACTATGTTGATAGTATGGTTCCAACT
+TTGGCATGTACATCATTAAGACATAGCTTAGTCAACTACAACATGTTACAGAGAAGATAGTTATTAGTATTGTATAGAAA
+AGGTGTCAGAGTCAATTTGCAGACTGGTTAGGTCTTCCAGAGTTTGAAAATGATAGCCATAAGCCATACAGTCCTTACAT
+TTGTTCTTGAATCGGCAAGAATCTCCGCAACTCTTGTCTTATACCTCCTCGATTATTTTATTTACACTCTCTTTCTGGCT
+TCTGTAGTATTTGAATTTCTGATTCAAGTTTAGGACATCCCTTATTGGATCCGTTTGGTTTTTCTTGTTTTCCAAAGTGC
+TGGAGTGAAAATTCTACCCTGGCAATAGGTAGGAGATAGATAACACAGACTGCATCTGATTTCCATGTGGCTTTTTTCCA
+GGATGGTTCTAAGCATATCTTTGAGATGGAAAGTGTTCGGGGTCAGCTCCAGACCATGCTCCAAACCTCACGTGATACAG
+CCTATCGTGAGTAAGCCCCTTCCCTAGAACTATGAAGGAGAACCTAGATGTAAGGGGTGGGAGAAAGTGGACAGAGGAAG
+CAGGCAGGAAAACCTCTGAGCAAGAGATTTCAGGAGAAAAGTCTCCCGTTCTTTGGGAGAGGGGGTGAAGGAGTGAGGGA
+ACTGGAAAGGAAGGCCTTTTCTCTTCAGGTGGCCCCTGTTTTTGTCATTACCTGACTCTGGCTTTGGGGTCCCTCTGGCT
+GCCTGCAGGGGATCCTCTCATTCCTGGCGCTGGCTCAGAGAGACGGGAAGAGGACTCCTTTGACAGTGATAGCACAGCCA
+CCTTGCTCAAGTGAGTTCTCCTTGTGGTTCTCCTAGCTTGTTTGCTTTCTTGGAAATCCAGGTGTTTTGTTCCTTATTTC
+TCCCTTTATTGCTTTTGTACCTCTTAAGAACCCAAGAGGCTTATCACTTGCCTTTCAGTTTTCATCTTTTTCTACTGTCT
+CCTGCCCCCTTTACTCCTTCAGGAACACAGGTATCTGTTGCCTAGTACCTGGCTTTCTCTTAACCTGCAGTCATTCTTCC
+CCTCTAAAGAGTTAGCTTGTTTCTGCTGAAACCTTTAAGTGAATCACAGACCTGTGTGTTATTTATAGCACTATGGTTAT
+ATAGCTTTACGACTTTGGACAAGTCTCTTGCCCTCTCTGGGTCTCAGTTTCCTCATTTGTAGAATATCACAGTTGGACAA
+GTGGATTCCAAAAGCTTTCCCAGCCCTAACATTCTCTACCTGATTTGCTCAGCACCCGGCCCCTGCAAGACTTGTCTCCA
+TCTAGCTCAGCCCAAGCCCTGGAGGAGCTGTTTCCCCGCTACACCAGCCTTCGGCCAGGGCCTCCACTCAATCCCCCAGA
+TTTTCAGGGGCTGAGAGATGCATTGGATTCAGAGCATACCCGCCGCAAGGTAAGATGCAAACGCTTCCTTTCGAAAGCAG
+CAAAGATTAGAAAGAGGGGACCCAAGTGTTGAAAAGGGCTGAGGGGGCCTGGCGTGGTGGCTCACGCCTGTAATCACAGC
+ACTTTGGGAGGCTGAGGCAGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTGGCTAACACGGTGAAACCCGTCTCTA
+CTAAAAATAGAAAAAATTAGCCGGGCATGGTGGTGGGTGCCTGTAGTCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATG
+GCGTGAACCTGGGAGGCGGAGCTTGCAGTGAGCCGAGATCATGCCGCTGCACTCCAGCCTGGGCGACAGAGCGAGACTCC
+GTCTCAAAAAAAAAAAACTAAAAAAGAAAAGGGCTGAGGGTTAAGAGCCTGAAAGCTGGGACTTAAATGTTTTTCTGAGG
+AGGGCTTTTTTGACCATTCCATCTAATATGGCACCACCTCCTTTCCAAGTAGTCAGATCGCCTTCTTATTTCTTTCATGT
+AGCTTATCGTTCTGATATATCTTGTTTTTTTCTCCTCTCTAAAATTTGAGGGTCCGTATCTTGTTTGTCTTAGTACTGGC
+TGCCTAAGGTCTAGAATATACATAACACAGAAACACTCTGTGTTTGTCGAATGGACACAAACTCTGCTAGCTTGTGCACA
+GGATTTACAGCTACTGGTGATATACTGAGCTTTATCTCTGTGAGCCTCTGCCTTAGTGTCTGAAGGGGACAGAGAAATAG
+ACAAAGGAGAGGAAAAGTCAAAATCACTGGTTGCCATTTTTGGAAGCCTCTCTTTCTCCATGTTAAACCCCTTGGCATGA
+ATGAAAGGCTTTCCCCTTTATCCTTGCTGTGTTTATTCATTGTGTCCAATGAGCTTCTGAAGGTAGGAGTAACTTTGCAC
+ATTTTGCTGTGTGGGCTGCTGCATCTCTGCCAAAAGGTTTATGGGATTTTTACTCAGGAGATCCAGGAAAAGGAGGAGCC
+TGGAATCTAAGCACTTATCTCCTGAAAAGGTACAGAAACATGTTGGAGATCTTGAGCCTCAAGGTGCTTGCTTCAGAATT
+TTCCTGATTCTCCCTCTCCAGACCTATTCTGTTCTTGGGACCCAAGCTTCTTGGCTCCAGCCCACTCCCCCACTAGAGGA
+GCTGGAAAGTTTGGTGCTGTGGTCATACCAAAGATGGGCAACACCCAGACTCCTCACCCTTTTCCCCAGCATTGTGAGCG
+CCATATTCAGAGCCTGCAGACCCGAGTGTTAGAGCTACAGCAACAATTAGCCGTGGCTGTGGCTGCCGACCGCAAGAAAG
+ATACCATGATTGAACAACTGGACAAGGTACCAGGGTAGCAAAATGTGGGTGGGTCTCTCCATGAAGAGCATTAAGGAATA
+ATAAATAAGTGGGTGGCCAACCAATGTTTCTTGGTACATGCTGAGAGCTGGGCAAGGGGTTGGTTTGCTGACTGTTGGGA
+GAAGATGGCTGTTGACCCTGCCCCTGTGGGTAGAAAGAGGCAAAAAAGTTATTTTGAAATTTCATCTTACTTGCCCTACC
+TAAGACCCTGGCCCGTGTGGTGGAGGGCTGGAACCGGCATGAGGCTGAGCGGACAGAGGTTCTCAGGGGACTTCAAGAGG
+AACACCAGGCAGCAGAGCTCACCAGAAGCAAGCAGCAGGAGGTGAGCGCCCTGGAGCATATGGCATTAGAACCTGAGTCA
+CAGATCTCAAGATGGAAAGGGGCAGAAATAGCTCTGGAATTAGGGTTTCCAACAGATTGACCTCAGTCTTGTAGTGGCTG
+TCCTGACATTTCTTCTCATCCCTCCTTTCTTCTCCTTTAACCGTTTCTGCTGTCCAACTTGAATCATCAATGTCTTAATT
+TTTCCCACACCATGTAGTCTTCTATCTGGCCCCTTAGTCATTTAGTTAACTGTGAAGTTTCAAGTTTACATTGTCAATGC
+TTTATAAAATATAGCACACACAGATTTCCCACAGTTCCTCTGCCCTGTATTCCTCTCTTCCTGAAAGACAGTAACCCGCC
+TGGAACAAAGCCTTTCTGAGGCCATGGAGGCCCTGAATCGTGAGCAGGAAAGTGCCAGACTGCAGCAACGGGAAAGAGAG
+ACACTGGTGAGAAGATTGGACTGGGTTAATTCCACTGGAAGCTGTTAATTACTTCTAGAGAGCTGTGGGCTATTGGTGGA
+TTGTGGGAGATTATAATTTGAGTTGCACCAGAGCACTGTTTCCCAAAGTGTGTTCCTTAGAACACTAATTCAGCTAGATA
+TTCTATTAAAAAAAAAGGCTCTGCTACCAAGTCGGTTTGAGAAACTCTGCAAATTGTATCATCATTAGAATATTAGTGTC
+ATCTGAAGAATTATTTTAAAATATAGGCTAGGCACGGTGGCTCACCCCTGTAATCCCAGCACTTTGGGAGGCCAAGGCAG
+GCAGATCATGAGGTCAGGAGTTTGAGACCAGCCTGGCCAGCATGGTGAAACCCCGTCTCTACTAAAAATACAAAAAACTA
+GCTGGACATGGTGGTGCATGCCTGTAGTCCCAGCGACTTGGGAGGCTGAGGCAGGAGAATTGCTTGAACCCAGCAGGCAG
+AGGTTGCAGTGAGCTGAGATCAAGCCACTGCACTCCAGCCTGGGTGACAGAGTGAGACTCCATCTCAAAAAATAAAAATA
+AAAAATAAAATAAAATAAAGATTTCTGGGACTCACCTGCAGAGGTTTTGATTCAGTAGATATATGGTGGAACTCAAATCT
+TTTCAGAAATTTCCTGGGTGATTTTTTTCAATCTGGTTTGGGACCTCTGGTGTAGGGCATGGCCAAAAAAGGTATGAAAT
+TGACTAACTCAAGTTTCTTTTCGTGTTTTTTTTTTTTTTTTTTTGAGACAGGGTCGCGCTCTGTTGTCCAGGCTGGAGTG
+CAGTGGCACAATCTCAGCTCACTGCAGCCTCTGCCCCCTGGATTCTAGCGATTCTCCCACCTTAGCCTCCCAAGTAGCTG
+GGACTACAGGCGTATACCACCACGCCTAGCTAATTTTTGTATTTTTTTGGTAGAGACGGGGTTTCGCCATGTTGGCCAGG
+CTGGTCTCAAACTCCTGACCTCAAGTGATCTGCCCGCCTTGGCCTCCCAAAGTGCTGGGATTACAGGCATAAGCCACCAT
+ACCCAGCCACTTCACTCAAGTCTCTTAGAGCTGAAATGATAAGGTGATTAGCCATATGAATTAGTAGCTGGGATTGGAAC
+TTGTTGCATGCATAATGCATGCTTGCTACATGCATGCTGGTACTTGTAGTCTTAACAGGCTATATCTGACAGTCCTGATG
+ATGATGGGTGATGGTTCACGATTATAGGCTGGATATTCTAGTCTTAGTGGAATTTCTTTTGACGATGATGGTGGTAACTA
+TTGTGTATTGAGTACTTACTATGTGCCTGATGCCGTGCTAAATGCCTTACATATGCTTTTTACTTTAATGCTCAGAAGGC
+AGACCCTGTTATGATCTCCAGTTTACAGATAAGAATGTATGCCTAAGAACACATAGTAAATTCCAGAACCAGGATTCGAA
+CCTGAGATTTGAACCCAGGTCTACCTGGCTCTGGAACAGGAACTCTTATCTACTATATTATAATCCTATGAGCTTGTCAA
+CTATTGTGTTTGTTTGTTTTGAGATGGAGTCTCGCTCTGTTGCCCAGGCTAGGGTTCAGTGGCACAATCTTGGCTCACTG
+CAACCTCCGCCTCCCAGGTTCAAGCAATTCTCCTGCCTCAGCCTCCTGAGTAGCTGGGAATACAGGTGTGCACCACCATG
+CCCAGCTAATTTTTATATTTTTAGTAGAGACATGGTTTCACCATGTTGGCCAGGCTGGTCCCAAACTCCTGACCTCAGAT
+GATCTGCCTGCCTCAGCCTCCCAAAGTGCTGGGATTACAGGCATGAGCCACTGCGCCTGTCAACTATTGTGGGGTTTCTT
+TTGGTTTTATTTGAGATGGAGTCTTGCTCGATGTCGCCTAGGCTGGAGTGCAGTTGTGTGATCCCAGCTCACTGCAACCT
+CCACCTCCCAGGTTAGAGCGATTCTCCTGCCTTTGCCTCCCAAGTAGCTGGAATTACAGGCACCCACCACCACGCCTGGC
+TAATTTTTGTATTTTTAGTAGAGATGGGGTTTCACCATGTTGGACCAGGCTGGTCTCAAACTCCTGACCTCAGGTAATCT
+GCCTGCCTTGGCCTCCCAAAATGCTGGGATTACAGGTGTGAGCCACCGCACCCGGCCTAATTATTGTGTTTTTAATGAGC
+ATAATTCTCAGCAGGCACCTTGTATGAGGGTCAGAGGGCAGATCGCTGGTCTCTGAAGAGCCTACTAAGGAGCTTGGTTT
+TCTTCTGCGGCTGTAGGAGGAGGAAAGGCAAGCTCTGACTCTGAGGTTGGAGGCAGAACAGCAGCGGTGCTGTGTCCTGC
+AGGAAGAGCGGGATGCAGCTCGGGCTGGGCAACTGAGTGAGCATCGAGAGTTGGAGACTCTTCGGGCTGCCCTAGAAGAA
+GAACGGCAGACCTGGGCCCAGCAAGAGCACCAGCTTAAGGAACACTACCAGGCGCTGCAGGAGGAGA
diff --git a/test/csq/ENST00000571540/ENST00000571540.fa.fai b/test/csq/ENST00000571540/ENST00000571540.fa.fai
new file mode 100644
index 000000000..c2608fa04
--- /dev/null
+++ b/test/csq/ENST00000571540/ENST00000571540.fa.fai
@@ -0,0 +1 @@
+17	6627	4	80	81
diff --git a/test/csq/ENST00000571540/ENST00000571540.gff b/test/csq/ENST00000571540/ENST00000571540.gff
new file mode 100644
index 000000000..856cccb66
--- /dev/null
+++ b/test/csq/ENST00000571540/ENST00000571540.gff
@@ -0,0 +1,15 @@
+17	ensembl_havana	gene	-995	16824	.	+	.	ID=gene:ENSG00000170037;Name=CNTROB;biotype=protein_coding;description=centrobin%2C centriole duplication and spindle assembly protein [Source:HGNC Symbol%3BAcc:HGNC:29616];gene_id=ENSG00000170037;logic_name=ensembl_havana_gene_homo_sapiens;version=14
+17	havana	mRNA	21	6607	.	+	.	ID=transcript:ENST00000571540;Parent=gene:ENSG00000170037;Name=CNTROB-206;biotype=nonsense_mediated_decay;transcript_id=ENST00000571540;transcript_support_level=5;version=5
+17	havana	exon	21	253	.	+	.	Parent=transcript:ENST00000571540;Name=ENSE00002655061;constitutive=0;ensembl_end_phase=0;ensembl_phase=1;exon_id=ENSE00002655061;rank=1;version=1
+17	havana	CDS	21	253	.	+	2	ID=CDS:ENSP00000458688;Parent=transcript:ENST00000571540;protein_id=ENSP00000458688
+17	havana	exon	1042	1126	.	+	.	Parent=transcript:ENST00000571540;Name=ENSE00003522521;constitutive=0;ensembl_end_phase=1;ensembl_phase=0;exon_id=ENSE00003522521;rank=2;version=1
+17	havana	CDS	1042	1126	.	+	0	ID=CDS:ENSP00000458688;Parent=transcript:ENST00000571540;protein_id=ENSP00000458688
+17	havana	exon	1369	1450	.	+	.	Parent=transcript:ENST00000571540;Name=ENSE00001609771;constitutive=0;ensembl_end_phase=2;ensembl_phase=1;exon_id=ENSE00001609771;rank=3;version=1
+17	havana	CDS	1369	1450	.	+	2	ID=CDS:ENSP00000458688;Parent=transcript:ENST00000571540;protein_id=ENSP00000458688
+17	havana	exon	1893	2049	.	+	.	Parent=transcript:ENST00000571540;Name=ENSE00003784956;constitutive=0;ensembl_end_phase=0;ensembl_phase=2;exon_id=ENSE00003784956;rank=4;version=1
+17	havana	CDS	1893	2049	.	+	1	ID=CDS:ENSP00000458688;Parent=transcript:ENST00000571540;protein_id=ENSP00000458688
+17	havana	CDS	3142	3312	.	+	0	ID=CDS:ENSP00000458688;Parent=transcript:ENST00000571540;protein_id=ENSP00000458688
+17	havana	exon	3142	3386	.	+	.	Parent=transcript:ENST00000571540;Name=ENSE00002678822;constitutive=0;ensembl_end_phase=-1;ensembl_phase=0;exon_id=ENSE00002678822;rank=5;version=1
+17	havana	exon	3605	3721	.	+	.	Parent=transcript:ENST00000571540;Name=ENSE00003688389;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003688389;rank=6;version=1
+17	havana	exon	4068	4166	.	+	.	Parent=transcript:ENST00000571540;Name=ENSE00003562163;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003562163;rank=7;version=1
+17	havana	exon	6417	6607	.	+	.	Parent=transcript:ENST00000571540;Name=ENSE00002649879;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00002649879;rank=8;version=1
diff --git a/test/csq/ENST00000571540/nmd.txt b/test/csq/ENST00000571540/nmd.txt
new file mode 100644
index 000000000..f7cfad3c5
--- /dev/null
+++ b/test/csq/ENST00000571540/nmd.txt
@@ -0,0 +1,3 @@
+3188	C	T	missense&NMD_transcript|CNTROB|ENST00000571540|NMD|+|201P>201L|3188C>T
+3188	C	T	missense&NMD_transcript|CNTROB|ENST00000571540|NMD|+|201P>201L|3188C>T
+
diff --git a/test/csq/ENST00000571540/nmd.vcf b/test/csq/ENST00000571540/nmd.vcf
new file mode 100644
index 000000000..eb4a9e4e9
--- /dev/null
+++ b/test/csq/ENST00000571540/nmd.vcf
@@ -0,0 +1,7 @@
+##fileformat=VCFv4.2
+##contig=<ID=17,length=249250621>
+##INFO=<ID=type,Number=.,Type=String,Description="">
+##INFO=<ID=EXP,Number=1,Type=String,Description="Expected consequence">
+##INFO=<ID=EXPL,Number=1,Type=String,Description="Expected consequence with bt/csq -l">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+17	3188	.	C	T	.	.	type=ENST00000571540:7936284-C-T;EXP=missense&NMD_transcript|CNTROB|ENST00000571540|NMD|+|201P>201L|3188C>T
diff --git a/test/csq/ENST00000573314/incorrect-insertion-overlap.txt b/test/csq/ENST00000573314/incorrect-insertion-overlap.txt
index 12dd09ada..3b6618d5e 100644
--- a/test/csq/ENST00000573314/incorrect-insertion-overlap.txt
+++ b/test/csq/ENST00000573314/incorrect-insertion-overlap.txt
@@ -1,3 +1,3 @@
 32388	GTCCGTCGGCATAAACTT	GTCCGTCGGCATAAACTTTCCGTCGGCATAAACTT	frameshift|VWA9|ENST00000573314|NMD|-|4DCVRRWARGEWGGRGPGTAFRCLLGLQVKGLWVKNNPGPALGLPATVAAAPRCL*>4ESLCRRTVSGDGHGVSGEAEGPGLPFAACWGYR*|32388GTCCGTCGGCATAAACTT>GTCCGTCGGCATAAACTTTCCGTCGGCATAAACTT
-32388	GTCCGTCGGCATAAACTT	GTCCGTCGGCATAAACTTTCCGTCGGCATAAACTT	frameshift|VWA9|ENST00000573314|NMD|-|4DCVRRWARGEWGGRGPGTAFRCLLGLQVKGLWVKNNPGPALGLPATVAAAPRCL*>4ESLCRRTVSGDGHGVSGEAEGPGLPFAACWGYR*|32388GTCCGTCGGCATAAACTT>GTCCGTCGGCATAAACTTTCCGTCGGCATAAACTT
+32388	GTCCGTCGGCATAAACTT	GTCCGTCGGCATAAACTTTCCGTCGGCATAAACTT	frameshift&NMD_transcript|VWA9|ENST00000573314|NMD|-|4DCVRRWARGEWGGRGPGTAFRCLLGLQVKGLWVKNNPGPALGLPATVAAAPRCL*>4ESLCRRTVSGDGHGVSGEAEGPGLPFAACWGYR*|32388GTCCGTCGGCATAAACTT>GTCCGTCGGCATAAACTTTCCGTCGGCATAAACTT
 
diff --git a/test/csq/ENST00000573314/incorrect-insertion-overlap.txt-l b/test/csq/ENST00000573314/incorrect-insertion-overlap.txt-l
index 12dd09ada..3b6618d5e 100644
--- a/test/csq/ENST00000573314/incorrect-insertion-overlap.txt-l
+++ b/test/csq/ENST00000573314/incorrect-insertion-overlap.txt-l
@@ -1,3 +1,3 @@
 32388	GTCCGTCGGCATAAACTT	GTCCGTCGGCATAAACTTTCCGTCGGCATAAACTT	frameshift|VWA9|ENST00000573314|NMD|-|4DCVRRWARGEWGGRGPGTAFRCLLGLQVKGLWVKNNPGPALGLPATVAAAPRCL*>4ESLCRRTVSGDGHGVSGEAEGPGLPFAACWGYR*|32388GTCCGTCGGCATAAACTT>GTCCGTCGGCATAAACTTTCCGTCGGCATAAACTT
-32388	GTCCGTCGGCATAAACTT	GTCCGTCGGCATAAACTTTCCGTCGGCATAAACTT	frameshift|VWA9|ENST00000573314|NMD|-|4DCVRRWARGEWGGRGPGTAFRCLLGLQVKGLWVKNNPGPALGLPATVAAAPRCL*>4ESLCRRTVSGDGHGVSGEAEGPGLPFAACWGYR*|32388GTCCGTCGGCATAAACTT>GTCCGTCGGCATAAACTTTCCGTCGGCATAAACTT
+32388	GTCCGTCGGCATAAACTT	GTCCGTCGGCATAAACTTTCCGTCGGCATAAACTT	frameshift&NMD_transcript|VWA9|ENST00000573314|NMD|-|4DCVRRWARGEWGGRGPGTAFRCLLGLQVKGLWVKNNPGPALGLPATVAAAPRCL*>4ESLCRRTVSGDGHGVSGEAEGPGLPFAACWGYR*|32388GTCCGTCGGCATAAACTT>GTCCGTCGGCATAAACTTTCCGTCGGCATAAACTT
 
diff --git a/test/csq/ENST00000580206/test.cmd.out b/test/csq/ENST00000580206/test.cmd.out
index 09b9d0d95..39ae1c3e0 100644
--- a/test/csq/ENST00000580206/test.cmd.out
+++ b/test/csq/ENST00000580206/test.cmd.out
@@ -1,16 +1,16 @@
-35750	C	CAAAAGA	inframe_insertion|ANKRD30B|ENST00000358984|protein_coding|+|353P>353QKK|35750C>CAAAAGA+35751C>A,inframe_insertion|ANKRD30B|ENST00000580206|NMD|+|353P>353QKK|35750C>CAAAAGA+35751C>A,inframe_insertion|ANKRD30B|ENST00000665241|protein_coding|+|353P>353QKK|35750C>CAAAAGA+35751C>A
+35750	C	CAAAAGA	inframe_insertion&NMD_transcript|ANKRD30B|ENST00000580206|NMD|+|353P>353QKK|35750C>CAAAAGA+35751C>A,inframe_insertion|ANKRD30B|ENST00000358984|protein_coding|+|353P>353QKK|35750C>CAAAAGA+35751C>A,inframe_insertion|ANKRD30B|ENST00000665241|protein_coding|+|353P>353QKK|35750C>CAAAAGA+35751C>A
 35751	C	A	@35750
 
 35750	C	CAAAAGA	inframe_insertion|ANKRD30B|ENST00000358984|protein_coding|+|353P>353QKK|35750C>CAAAAGA+35751C>A
 35751	C	A	@35750
 
-35750	CC	CAAAAGAA	inframe_insertion|ANKRD30B|ENST00000358984|protein_coding|+|353P>353QKK|35750CC>CAAAAGAA,inframe_insertion|ANKRD30B|ENST00000580206|NMD|+|353P>353QKK|35750CC>CAAAAGAA,inframe_insertion|ANKRD30B|ENST00000665241|protein_coding|+|353P>353QKK|35750CC>CAAAAGAA
+35750	CC	CAAAAGAA	inframe_insertion&NMD_transcript|ANKRD30B|ENST00000580206|NMD|+|353P>353QKK|35750CC>CAAAAGAA,inframe_insertion|ANKRD30B|ENST00000358984|protein_coding|+|353P>353QKK|35750CC>CAAAAGAA,inframe_insertion|ANKRD30B|ENST00000665241|protein_coding|+|353P>353QKK|35750CC>CAAAAGAA
 
 35750	CC	CAAAAGAA	inframe_insertion|ANKRD30B|ENST00000358984|protein_coding|+|353P>353QKK|35750CC>CAAAAGAA
 
-35750	CCAG	C	inframe_deletion|ANKRD30B|ENST00000358984|protein_coding|+|353PA>353P|35750CCAG>C+35755A>T,inframe_deletion|ANKRD30B|ENST00000580206|NMD|+|353PA>353P|35750CCAG>C+35755A>T,inframe_deletion|ANKRD30B|ENST00000665241|protein_coding|+|353PA>353P|35750CCAG>C+35755A>T
+35750	CCAG	C	inframe_deletion&NMD_transcript|ANKRD30B|ENST00000580206|NMD|+|353PA>353P|35750CCAG>C+35755A>T,inframe_deletion|ANKRD30B|ENST00000358984|protein_coding|+|353PA>353P|35750CCAG>C+35755A>T,inframe_deletion|ANKRD30B|ENST00000665241|protein_coding|+|353PA>353P|35750CCAG>C+35755A>T
 35755	A	T	@35750
-35756	A	C	missense|ANKRD30B|ENST00000358984|protein_coding|+|355K>354Q|35756A>C,missense|ANKRD30B|ENST00000580206|NMD|+|355K>354Q|35756A>C,missense|ANKRD30B|ENST00000665241|protein_coding|+|355K>354Q|35756A>C
+35756	A	C	missense&NMD_transcript|ANKRD30B|ENST00000580206|NMD|+|355K>354Q|35756A>C,missense|ANKRD30B|ENST00000358984|protein_coding|+|355K>354Q|35756A>C,missense|ANKRD30B|ENST00000665241|protein_coding|+|355K>354Q|35756A>C
 
 35750	CCAG	C	inframe_deletion|ANKRD30B|ENST00000358984|protein_coding|+|353PA>353P|35750CCAG>C+35755A>T
 35755	A	T	@35750
@@ -19,7 +19,7 @@
 35750	CCAGCA	CCT	inframe_deletion|ANKRD30B|ENST00000358984|protein_coding|+|353PA>353P|35750CCAGCA>CCT
 35756	A	C	missense|ANKRD30B|ENST00000358984|protein_coding|+|355K>354Q|35756A>C
 
-35750	CCAG	C	missense&inframe_altering|ANKRD30B|ENST00000358984|protein_coding|+|353PA>353PP|35750CCAG>C+35755A>ACCT,missense&inframe_altering|ANKRD30B|ENST00000580206|NMD|+|353PA>353PP|35750CCAG>C+35755A>ACCT,missense&inframe_altering|ANKRD30B|ENST00000665241|protein_coding|+|353PA>353PP|35750CCAG>C+35755A>ACCT
+35750	CCAG	C	missense&inframe_altering&NMD_transcript|ANKRD30B|ENST00000580206|NMD|+|353PA>353PP|35750CCAG>C+35755A>ACCT,missense&inframe_altering|ANKRD30B|ENST00000358984|protein_coding|+|353PA>353PP|35750CCAG>C+35755A>ACCT,missense&inframe_altering|ANKRD30B|ENST00000665241|protein_coding|+|353PA>353PP|35750CCAG>C+35755A>ACCT
 35755	A	ACCT	@35750
 
 35750	CCAG	C	missense&inframe_altering|ANKRD30B|ENST00000358984|protein_coding|+|353PA>353PP|35750CCAG>C+35755A>ACCT
diff --git a/test/fill-tags-hemi.1.out b/test/fill-tags-hemi.1.out
index 10d4ddca3..634012187 100644
--- a/test/fill-tags-hemi.1.out
+++ b/test/fill-tags-hemi.1.out
@@ -3,6 +3,7 @@
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 ##contig=<ID=1,assembly=b37,length=249250621>
 ##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta
+##INFO=<ID=F_MISSING,Number=.,Type=Float,Description="Added by +fill-tags expression F_MISSING=F_MISSING">
 ##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
 ##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes">
 ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
@@ -16,7 +17,7 @@
 ##FORMAT=<ID=VAF,Number=A,Type=Float,Description="The fraction of reads with alternate allele (nALT/nSumAll)">
 ##FORMAT=<ID=VAF1,Number=1,Type=Float,Description="The fraction of reads with alternate alleles (nSumALT/nSumAll)">
 #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	A	B
-1	3177144	.	G	T,A	45	PASS	NS=2;AN=2;AF=0.5,0.5;MAF=0.5;AC=1,1;AC_Het=0,0;AC_Hom=0,0;AC_Hemi=1,1;HWE=1,1;ExcHet=1,1	GT	1	2
-1	3177144	.	G	T	45	PASS	NS=2;AN=2;AF=0.5;MAF=0.5;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=1;HWE=1;ExcHet=1	GT	0/.	1/.
-1	3177144	.	G	T	45	PASS	NS=2;AN=2;AF=0.5;MAF=0.5;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=1;HWE=1;ExcHet=1	GT	./0	./1
-1	3177144	.	G	T	45	PASS	NS=1;AN=1;AF=1;MAF=0;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=1;HWE=1;ExcHet=1	GT	./.	./1
+1	3177144	.	G	T,A	45	PASS	F_MISSING=0;NS=2;AN=2;AF=0.5,0.5;MAF=0.5;AC=1,1;AC_Het=0,0;AC_Hom=0,0;AC_Hemi=1,1;HWE=1,1;ExcHet=1,1	GT	1	2
+1	3177144	.	G	T	45	PASS	F_MISSING=1;NS=2;AN=2;AF=0.5;MAF=0.5;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=1;HWE=1;ExcHet=1	GT	0/.	1/.
+1	3177144	.	G	T	45	PASS	F_MISSING=1;NS=2;AN=2;AF=0.5;MAF=0.5;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=1;HWE=1;ExcHet=1	GT	./0	./1
+1	3177144	.	G	T	45	PASS	F_MISSING=1;NS=1;AN=1;AF=1;MAF=0;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=1;HWE=1;ExcHet=1	GT	./.	./1
diff --git a/test/fill-tags-hemi.2.out b/test/fill-tags-hemi.2.out
index e746d6c6a..1bc2e60a5 100644
--- a/test/fill-tags-hemi.2.out
+++ b/test/fill-tags-hemi.2.out
@@ -3,6 +3,7 @@
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 ##contig=<ID=1,assembly=b37,length=249250621>
 ##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta
+##INFO=<ID=F_MISSING,Number=.,Type=Float,Description="Added by +fill-tags expression F_MISSING=F_MISSING">
 ##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
 ##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes">
 ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
@@ -16,7 +17,7 @@
 ##FORMAT=<ID=VAF,Number=A,Type=Float,Description="The fraction of reads with alternate allele (nALT/nSumAll)">
 ##FORMAT=<ID=VAF1,Number=1,Type=Float,Description="The fraction of reads with alternate alleles (nSumALT/nSumAll)">
 #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	A	B
-1	3177144	.	G	T,A	45	PASS	NS=2;AN=2;AF=0.5,0.5;MAF=0.5;AC=1,1;AC_Het=0,0;AC_Hom=0,0;AC_Hemi=1,1;HWE=1,1;ExcHet=1,1	GT	1	2
-1	3177144	.	G	T	45	PASS	NS=2;AN=2;AF=0.5;MAF=0.5;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=1	GT	0/.	1/.
-1	3177144	.	G	T	45	PASS	NS=2;AN=2;AF=0.5;MAF=0.5;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=1	GT	./0	./1
-1	3177144	.	G	T	45	PASS	NS=1;AN=1;AF=1;MAF=0;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=1	GT	./.	./1
+1	3177144	.	G	T,A	45	PASS	F_MISSING=0;NS=2;AN=2;AF=0.5,0.5;MAF=0.5;AC=1,1;AC_Het=0,0;AC_Hom=0,0;AC_Hemi=1,1;HWE=1,1;ExcHet=1,1	GT	1	2
+1	3177144	.	G	T	45	PASS	F_MISSING=1;NS=2;AN=2;AF=0.5;MAF=0.5;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=1	GT	0/.	1/.
+1	3177144	.	G	T	45	PASS	F_MISSING=1;NS=2;AN=2;AF=0.5;MAF=0.5;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=1	GT	./0	./1
+1	3177144	.	G	T	45	PASS	F_MISSING=1;NS=1;AN=1;AF=1;MAF=0;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=1	GT	./.	./1
diff --git a/test/fill-tags-hwe.out b/test/fill-tags-hwe.out
index 45f0ae0e8..8837eb167 100644
--- a/test/fill-tags-hwe.out
+++ b/test/fill-tags-hwe.out
@@ -3,6 +3,7 @@
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 ##contig=<ID=1,assembly=b37,length=249250621>
 ##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta
+##INFO=<ID=F_MISSING,Number=.,Type=Float,Description="Added by +fill-tags expression F_MISSING=F_MISSING">
 ##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
 ##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes">
 ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
@@ -16,7 +17,7 @@
 ##FORMAT=<ID=VAF,Number=A,Type=Float,Description="The fraction of reads with alternate allele (nALT/nSumAll)">
 ##FORMAT=<ID=VAF1,Number=1,Type=Float,Description="The fraction of reads with alternate alleles (nSumALT/nSumAll)">
 #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	S1	S2	S3	S4	S5	S6	S7	S8	S9	S10
-1	3177144	.	G	T	45	PASS	NS=10;AN=20;AF=0;MAF=0;AC=0;AC_Het=0;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=1	GT	0/0	0/0	0/0	0/0	0/0	0/0	0/0	0/0	0/0	0/0
-1	3177144	.	G	T	45	PASS	NS=10;AN=20;AF=0.1;MAF=0.1;AC=2;AC_Het=2;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=0.947368	GT	1/0	1/0	0/0	0/0	0/0	0/0	0/0	0/0	0/0	0/0
-1	3177144	.	G	T	45	PASS	NS=10;AN=20;AF=0.1;MAF=0.1;AC=2;AC_Het=0;AC_Hom=2;AC_Hemi=0;HWE=0.0526316;ExcHet=1	GT	1/1	0/0	0/0	0/0	0/0	0/0	0/0	0/0	0/0	0/0
-1	3177144	.	G	T,C	45	PASS	NS=10;AN=20;AF=0.5,0;MAF=0.5;AC=10,0;AC_Het=10,0;AC_Hom=0,0;AC_Hemi=0,0;HWE=0.00690641,1;ExcHet=0.00554244,1	GT	1/0	1/0	1/0	1/0	1/0	1/0	1/0	1/0	1/0	1/0
+1	3177144	.	G	T	45	PASS	F_MISSING=0;NS=10;AN=20;AF=0;MAF=0;AC=0;AC_Het=0;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=1	GT	0/0	0/0	0/0	0/0	0/0	0/0	0/0	0/0	0/0	0/0
+1	3177144	.	G	T	45	PASS	F_MISSING=0;NS=10;AN=20;AF=0.1;MAF=0.1;AC=2;AC_Het=2;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=0.947368	GT	1/0	1/0	0/0	0/0	0/0	0/0	0/0	0/0	0/0	0/0
+1	3177144	.	G	T	45	PASS	F_MISSING=0;NS=10;AN=20;AF=0.1;MAF=0.1;AC=2;AC_Het=0;AC_Hom=2;AC_Hemi=0;HWE=0.0526316;ExcHet=1	GT	1/1	0/0	0/0	0/0	0/0	0/0	0/0	0/0	0/0	0/0
+1	3177144	.	G	T,C	45	PASS	F_MISSING=0;NS=10;AN=20;AF=0.5,0;MAF=0.5;AC=10,0;AC_Het=10,0;AC_Hom=0,0;AC_Hemi=0,0;HWE=0.00690641,1;ExcHet=0.00554244,1	GT	1/0	1/0	1/0	1/0	1/0	1/0	1/0	1/0	1/0	1/0
diff --git a/test/gtcheck.5.1.out b/test/gtcheck.5.1.out
index 4e7fbd625..639730377 100644
--- a/test/gtcheck.5.1.out
+++ b/test/gtcheck.5.1.out
@@ -5,4 +5,8 @@ INFO	sites-skipped-monoallelic	1
 INFO	sites-skipped-no-data	1
 INFO	sites-skipped-GT-not-diploid	1
 INFO	sites-skipped-PL-not-diploid	1
+INFO	sites-used-PL-vs-PL	0
+INFO	sites-used-PL-vs-GT	1
+INFO	sites-used-GT-vs-PL	0
+INFO	sites-used-GT-vs-GT	1
 DC	A	A	3.000150e-04	4.605170e+01	2
diff --git a/test/gvcf.merge.1.out b/test/gvcf.merge.1.out
index d8a9b497b..f9b679d1a 100644
--- a/test/gvcf.merge.1.out
+++ b/test/gvcf.merge.1.out
@@ -24,6 +24,6 @@
 ##INFO=<ID=BLOCKAVG_min30p3a,Number=0,Type=Flag,Description="Non-variant site block. All sites in a block are constrained to be non-variant, have the same filter value, and have all sample values in range [x,y], y <= max(x+3,(x*1.3)). All printed site block sample values are the minimum observed in the region spanned by the block">
 #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	G06	D05	H09
 chr1	10106	.	C	.	0	LowGQX	BLOCKAVG_min30p3a;AN=2	GT:GQX:DP:DPF	./.:.:.:.	0/0:12:5:0	./.:.:.:.
-chr1	10107	.	C	.	0	LowGQX;HighDPFRatio	BLOCKAVG_min30p3a;AN=2	GT:GQX:DP:DPF	.:.:0:1	./.:.:.:.	0/0:5:2:0
+chr1	10107	.	C	.	0	LowGQX;HighDPFRatio	BLOCKAVG_min30p3a;AN=4	GT:GQX:DP:DPF	.:.:0:1	0/0:12:5:0	0/0:5:2:0
 chr1	10108	.	N	.	0	LowGQX;HighDPFRatio	END=10110;BLOCKAVG_min30p3a;AN=2	GT:GQX:DP:DPF	.:.:0:1	./.:.:.:.	0/0:5:2:0
 chr1	10111	.	N	.	0	LowGQX	END=10120;BLOCKAVG_min30p3a;AN=2	GT:GQX:DP:DPF	./.:.:.:.	./.:.:.:.	0/0:5:2:0
diff --git a/test/merge.10.1.out b/test/merge.10.1.out
index c34673867..2325b4ba6 100644
--- a/test/merge.10.1.out
+++ b/test/merge.10.1.out
@@ -1,13 +1,14 @@
 ##fileformat=VCFv4.3
 ##FILTER=<ID=PASS,Description="All filters passed">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 ##contig=<ID=1,assembly=b37,length=249250621>
 ##reference=file:ref.fa
-#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
-1	3000000	.	C	CCG	.	.	.
-1	3000000	.	C	CAA	.	.	.
-1	3000150	.	CC	C	.	.	.
-1	3000150	.	C	CTT	.	.	.
-1	3000152	.	C	A	.	.	.
-1	3000152	.	C	CA	.	.	.
-1	3000154	.	C	A	.	.	.
-1	3000154	.	C	T	.	.	.
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	a	b
+1	3000000	.	C	CCG	.	.	.	GT	0/1	./.
+1	3000000	.	C	CAA	.	.	.	GT	./.	0/1
+1	3000150	.	CC	C	.	.	.	GT	0/1	./.
+1	3000150	.	C	CTT	.	.	.	GT	./.	0/1
+1	3000152	.	C	A	.	.	.	GT	0/1	./.
+1	3000152	.	C	CA	.	.	.	GT	./.	0/1
+1	3000154	.	C	A	.	.	.	GT	0/1	./.
+1	3000154	.	C	T	.	.	.	GT	./.	0/1
diff --git a/test/merge.10.2.out b/test/merge.10.2.out
index 2b345cf56..f38e9d02c 100644
--- a/test/merge.10.2.out
+++ b/test/merge.10.2.out
@@ -1,10 +1,11 @@
 ##fileformat=VCFv4.3
 ##FILTER=<ID=PASS,Description="All filters passed">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 ##contig=<ID=1,assembly=b37,length=249250621>
 ##reference=file:ref.fa
-#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
-1	3000000	.	C	CCG,CAA	.	.	.
-1	3000150	.	CC	C,CTTC	.	.	.
-1	3000152	.	C	A	.	.	.
-1	3000152	.	C	CA	.	.	.
-1	3000154	.	C	A,T	.	.	.
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	a	b
+1	3000000	.	C	CCG,CAA	.	.	.	GT	0/1	0/2
+1	3000150	.	CC	C,CTTC	.	.	.	GT	0/1	0/2
+1	3000152	.	C	A	.	.	.	GT	0/1	./.
+1	3000152	.	C	CA	.	.	.	GT	./.	0/1
+1	3000154	.	C	A,T	.	.	.	GT	0/1	0/2
diff --git a/test/merge.10.3.out b/test/merge.10.3.out
index cec954427..53f1a51c5 100644
--- a/test/merge.10.3.out
+++ b/test/merge.10.3.out
@@ -1,11 +1,12 @@
 ##fileformat=VCFv4.3
 ##FILTER=<ID=PASS,Description="All filters passed">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 ##contig=<ID=1,assembly=b37,length=249250621>
 ##reference=file:ref.fa
-#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
-1	3000000	.	C	CCG,CAA	.	.	.
-1	3000150	.	CC	C,CTTC	.	.	.
-1	3000152	.	C	A	.	.	.
-1	3000152	.	C	CA	.	.	.
-1	3000154	.	C	A	.	.	.
-1	3000154	.	C	T	.	.	.
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	a	b
+1	3000000	.	C	CCG,CAA	.	.	.	GT	0/1	0/2
+1	3000150	.	CC	C	.	.	.	GT	0/1	./.
+1	3000150	.	C	CTT	.	.	.	GT	./.	0/1
+1	3000152	.	C	A	.	.	.	GT	0/1	./.
+1	3000152	.	C	CA	.	.	.	GT	./.	0/1
+1	3000154	.	C	A,T	.	.	.	GT	0/1	0/2
diff --git a/test/merge.10.a.vcf b/test/merge.10.a.vcf
index f2da6e185..c3672980c 100644
--- a/test/merge.10.a.vcf
+++ b/test/merge.10.a.vcf
@@ -1,8 +1,9 @@
 ##fileformat=VCFv4.3
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 ##contig=<ID=1,assembly=b37,length=249250621>
 ##reference=file:ref.fa
-#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
-1	3000000	.	C	CCG	.	.	.
-1	3000150	.	CC	C	.	.	.
-1	3000152	.	C	A	.	.	.
-1	3000154	.	C	A	.	.	.
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	a
+1	3000000	.	C	CCG	.	.	.	GT	0/1
+1	3000150	.	CC	C	.	.	.	GT	0/1
+1	3000152	.	C	A	.	.	.	GT	0/1
+1	3000154	.	C	A	.	.	.	GT	0/1
diff --git a/test/merge.10.b.vcf b/test/merge.10.b.vcf
index 15c96e9b7..eff5262fb 100644
--- a/test/merge.10.b.vcf
+++ b/test/merge.10.b.vcf
@@ -1,8 +1,9 @@
 ##fileformat=VCFv4.3
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 ##contig=<ID=1,assembly=b37,length=249250621>
 ##reference=file:ref.fa
-#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
-1	3000000	.	C	CAA	.	.	.
-1	3000150	.	C	CTT	.	.	.
-1	3000152	.	C	CA	.	.	.
-1	3000154	.	C	T	.	.	.
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	b
+1	3000000	.	C	CAA	.	.	.	GT	0/1
+1	3000150	.	C	CTT	.	.	.	GT	0/1
+1	3000152	.	C	CA	.	.	.	GT	0/1
+1	3000154	.	C	T	.	.	.	GT	0/1
diff --git a/test/merge.gvcf.10.1.out b/test/merge.gvcf.10.1.out
index 285548169..3a51f253d 100644
--- a/test/merge.gvcf.10.1.out
+++ b/test/merge.gvcf.10.1.out
@@ -1,6 +1,11 @@
 ##fileformat=VCFv4.2
 ##FILTER=<ID=PASS,Description="All filters passed">
 ##contig=<ID=chr1,length=248956422>
+##contig=<ID=chr2,length=248956422>
 ##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
-#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
-chr1	1	.	A	<*>,C	.	.	END=2
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	a	b
+chr1	1	.	A	<*>,C	.	.	END=2	GT	0/0	0/2
+chr2	1	.	A	<*>	.	.	END=2	GT	0/0	./.
+chr2	3	.	G	C,<*>,A	.	.	.	GT	0/1	0/3
+chr2	4	.	T	<*>	.	.	END=6	GT	0/0	0/0
diff --git a/test/merge.gvcf.10.2.out b/test/merge.gvcf.10.2.out
index ddaf00e52..8feceaef3 100644
--- a/test/merge.gvcf.10.2.out
+++ b/test/merge.gvcf.10.2.out
@@ -1,7 +1,12 @@
 ##fileformat=VCFv4.2
 ##FILTER=<ID=PASS,Description="All filters passed">
 ##contig=<ID=chr1,length=248956422>
+##contig=<ID=chr2,length=248956422>
 ##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
-#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
-chr1	1	.	A	<*>	.	.	END=2
-chr1	1	.	A	C	.	.	.
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	a	b
+chr1	1	.	A	<*>,C	.	.	END=2	GT	0/0	0/2
+chr2	1	.	A	<*>	.	.	END=2	GT	0/0	./.
+chr2	3	.	G	C,<*>	.	.	.	GT	0/1	./.
+chr2	3	.	G	A,<*>	.	.	.	GT	./.	0/1
+chr2	4	.	T	<*>	.	.	END=6	GT	0/0	0/0
diff --git a/test/merge.gvcf.10.3.out b/test/merge.gvcf.10.3.out
index 7129995b2..ca9bfccb5 100644
--- a/test/merge.gvcf.10.3.out
+++ b/test/merge.gvcf.10.3.out
@@ -1,7 +1,13 @@
 ##fileformat=VCFv4.2
 ##FILTER=<ID=PASS,Description="All filters passed">
 ##contig=<ID=chr1,length=248956422>
+##contig=<ID=chr2,length=248956422>
 ##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
-#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
-chr1	1	.	A	<*>,C	.	.	.
-chr1	2	.	C	<*>	.	.	.
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	a	b
+chr1	1	.	A	<*>,C	.	.	.	GT	0/0	0/2
+chr1	2	.	C	<*>	.	.	.	GT	0/0	./.
+chr2	1	.	A	<*>	.	.	END=2	GT	0/0	./.
+chr2	3	.	G	C,<*>,A	.	.	.	GT	0/1	0/3
+chr2	4	.	T	<*>	.	.	END=6	GT	0/0	0/0
+chr2	7	.	G	<*>	.	.	END=8	GT	./.	0/0
diff --git a/test/merge.gvcf.10.4.out b/test/merge.gvcf.10.4.out
index 38c43ad4c..8be54a9c2 100644
--- a/test/merge.gvcf.10.4.out
+++ b/test/merge.gvcf.10.4.out
@@ -1,7 +1,14 @@
 ##fileformat=VCFv4.2
 ##FILTER=<ID=PASS,Description="All filters passed">
 ##contig=<ID=chr1,length=248956422>
+##contig=<ID=chr2,length=248956422>
 ##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
-#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
-chr1	1	.	A	C	.	.	.
-chr1	2	.	C	<*>	.	.	.
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	a	b
+chr1	1	.	A	<*>,C	.	.	.	GT	0/0	0/2
+chr1	2	.	C	<*>	.	.	.	GT	0/0	./.
+chr2	1	.	A	<*>	.	.	END=2	GT	0/0	./.
+chr2	3	.	G	C,<*>	.	.	.	GT	0/1	./.
+chr2	3	.	G	A,<*>	.	.	.	GT	./.	0/1
+chr2	4	.	T	<*>	.	.	END=6	GT	0/0	0/0
+chr2	7	.	G	<*>	.	.	END=8	GT	./.	0/0
diff --git a/test/merge.gvcf.10.a.vcf b/test/merge.gvcf.10.a.vcf
index da7568d60..6d8b498dd 100644
--- a/test/merge.gvcf.10.a.vcf
+++ b/test/merge.gvcf.10.a.vcf
@@ -1,6 +1,11 @@
 ##fileformat=VCFv4.2
 ##FILTER=<ID=PASS,Description="All filters passed">
 ##contig=<ID=chr1,length=248956422>
+##contig=<ID=chr2,length=248956422>
 ##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
-#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
-chr1	1	.	A	<*>	.	.	END=2
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	a
+chr1	1	.	A	<*>	.	.	END=2	GT	0/0
+chr2	1	.	A	<*>	.	.	END=2	GT	0/0
+chr2	3	.	G	C,<*>	.	.	.	GT	0/1
+chr2	4	.	T	<*>	.	.	END=6	GT	0/0
diff --git a/test/merge.gvcf.10.b.vcf b/test/merge.gvcf.10.b.vcf
index 9e2840fab..0062a2ed2 100644
--- a/test/merge.gvcf.10.b.vcf
+++ b/test/merge.gvcf.10.b.vcf
@@ -1,5 +1,10 @@
 ##fileformat=VCFv4.2
 ##FILTER=<ID=PASS,Description="All filters passed">
 ##contig=<ID=chr1,length=248956422>
-#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
-chr1	1	.	A	C	.	.	.
+##contig=<ID=chr2,length=248956422>
+##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	b
+chr1	1	.	A	C	.	.	.	GT	0/1
+chr2	3	.	G	A,<*>	.	.	.	GT	0/1
+chr2	4	.	T	<*>	.	.	END=8	GT	0/0
diff --git a/test/merge.gvcf.10.fa b/test/merge.gvcf.10.fa
index 9cea0be52..86b1e1550 100644
--- a/test/merge.gvcf.10.fa
+++ b/test/merge.gvcf.10.fa
@@ -1,2 +1,4 @@
 >chr1
 ACGT
+>chr2
+ACGTACGT
diff --git a/test/merge.gvcf.11.1.out b/test/merge.gvcf.11.1.out
new file mode 100644
index 000000000..ada937137
--- /dev/null
+++ b/test/merge.gvcf.11.1.out
@@ -0,0 +1,12 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="All filters passed">
+##reference=file.fa
+##contig=<ID=chr20,length=59373566,assembly=B37,md5=1e86411d73e6f00a10590f976be01623,species="Homo sapiens">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the region described in this record">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	s1	s2	s3
+chr20	64249835	.	T	<NON_REF>	0	.	END=64249836	GT	0/0	0/0	0/0
+chr20	64249837	.	T	<NON_REF>	0	.	.	GT	0/0	0/0	0/0
+chr20	64249838	.	T	<NON_REF>	0	.	END=64250066	GT	0/0	0/0	0/0
+chr20	64250067	.	T	<NON_REF>	0	.	.	GT	0/0	0/0	./.
+chr20	64250068	.	N	<NON_REF>	0	.	END=64251648	GT	0/0	./.	./.
diff --git a/test/merge.gvcf.11.a.vcf b/test/merge.gvcf.11.a.vcf
new file mode 100644
index 000000000..0c5cf70d2
--- /dev/null
+++ b/test/merge.gvcf.11.a.vcf
@@ -0,0 +1,8 @@
+##fileformat=VCFv4.2
+##reference=file.fa
+##contig=<ID=chr20,length=59373566,assembly=B37,md5=1e86411d73e6f00a10590f976be01623,species="Homo sapiens">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the region described in this record">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	s1
+chr20	64249835	.	T	<NON_REF>	0	.	END=64249837	GT	0/0
+chr20	64249838	.	T	<NON_REF>	0	.	END=64251648	GT	0/0
diff --git a/test/merge.gvcf.11.b.vcf b/test/merge.gvcf.11.b.vcf
new file mode 100644
index 000000000..3a73a061c
--- /dev/null
+++ b/test/merge.gvcf.11.b.vcf
@@ -0,0 +1,8 @@
+##fileformat=VCFv4.2
+##reference=file.fa
+##contig=<ID=chr20,length=59373566,assembly=B37,md5=1e86411d73e6f00a10590f976be01623,species="Homo sapiens">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the region described in this record">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	s2
+chr20	64249835	.	T	<NON_REF>	0	.	END=64250066	GT	0/0
+chr20	64250067	.	T	<NON_REF>	0	.	END=64250067	GT	0/0
diff --git a/test/merge.gvcf.11.c.vcf b/test/merge.gvcf.11.c.vcf
new file mode 100644
index 000000000..b8b0a083f
--- /dev/null
+++ b/test/merge.gvcf.11.c.vcf
@@ -0,0 +1,8 @@
+##fileformat=VCFv4.2
+##reference=file.fa
+##contig=<ID=chr20,length=59373566,assembly=B37,md5=1e86411d73e6f00a10590f976be01623,species="Homo sapiens">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the region described in this record">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	s3
+chr20	64249835	.	T	<NON_REF>	0	.	END=64249836	GT	0/0
+chr20	64249837	.	T	<NON_REF>	0	.	END=64250066	GT	0/0
diff --git a/test/merge.gvcf.2.out b/test/merge.gvcf.2.out
index a939338b0..4bb0c75d2 100644
--- a/test/merge.gvcf.2.out
+++ b/test/merge.gvcf.2.out
@@ -18,16 +18,16 @@
 #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	AAA	BBB	CCC
 2	21444416	.	G	<*>	.	.	END=21444427;MinDP=5;QS=1,0	PL:DP	0,15,125:5	.:.	.:.
 2	21444428	.	C	<*>	.	.	END=21444429;MinDP=2;QS=2,0	PL:DP	0,15,125:5	0,6,51:2	.:.
-2	21444430	.	TCAA	T,TAA,<*>	0	.	MinDP=2;QS=1.60366,0.304878,0.0914634,0	PL:DP:DV	37,0,79,35,73,113,.,.,.,.:5:2	0,.,.,.,.,.,6,.,.,51:2:.	.:.:.
+2	21444430	.	TCAA	T,TAA,<*>	0	.	MinDP=2;QS=1.60366,0.304878,0.0914634,0	PL:DP:DV	37,0,79,35,73,113,113,113,113,113:5:2	0,51,51,51,51,51,6,51,51,51:2:.	.:.:.
+2	21444431	.	CA	C,CAAACAAAAAA	0	.	QS=0.75,0.25,1	PL:DP:DV	0,4,10,10,10,10:4:1	28,28,28,3,28,0:1:1	.:.:.
 2	21444431	.	C	<*>	.	.	MinDP=4;QS=1,0	PL:DP	0,12,110:4	.:.	.:.
-2	21444431	.	CA	C,CAAACAAAAAA	0	.	QS=0.75,0.25,1	PL:DP:DV	0,4,10,.,.,.:4:1	28,.,.,3,.,0:1:1	.:.:.
 2	21444433	.	C	<*>	0	.	END=21444444;QS=0.75,0.25	PL:DP:DV	0,4,10:4:1	.:.:.	.:.:.
 3	1	.	C	<*>	0	.	END=4;MinDP=33;QS=0.75,0.25	PL:DP:DV	0,4,10:4:1	.:.:.	.:.:.
-3	5	.	C	<*>,T	0	.	MinDP=33;QS=1.5,0.25,0.25	PL:DP:DV	0,4,10,.,.,.:4:1	0,.,.,4,.,10:4:1	.:.:.
+3	5	.	C	<*>,T	0	.	MinDP=33;QS=1.5,0.25,0.25	PL:DP:DV	0,4,10,10,10,10:4:1	0,10,10,4,10,10:4:1	.:.:.
 3	6	.	N	<*>	0	.	END=10;MinDP=33;QS=0.75,0.25	PL:DP:DV	0,4,10:4:1	.:.:.	.:.:.
 1	1619670	.	C	<*>	0	.	END=1619782;MinDP=33;QS=0.75,0.25	PL:DP:DV	0,4,10:4:1	.:.:.	.:.:.
 1	1619783	.	C	<*>	0	.	END=1619787;MinDP=33;QS=0.75,1.25	PL:DP:DV	0,4,10:4:1	28,3,0:1:1	.:.:.
-1	1619788	.	G	<*>,GAAAAAAA	0	.	MinDP=33;QS=0.75,0.25,1	PL:DP:DV	0,4,10,.,.,.:4:1	28,.,.,3,.,0:1:1	.:.:.
+1	1619788	.	G	<*>,GAAAAAAA	0	.	MinDP=33;QS=0.75,0.25,1	PL:DP:DV	0,4,10,10,10,10:4:1	28,28,28,3,28,0:1:1	.:.:.
 1	1619789	.	N	<*>	0	.	END=1619877;MinDP=33;QS=0.75,0.25	PL:DP:DV	0,4,10:4:1	.:.:.	.:.:.
 4	20000975	.	C	<*>	0	.	END=20001021;MinDP=33;QS=0.75,0.25	PL:DP:DV	0,4,10:4:1	.:.:.	.:.:.
 4	20001022	.	C	<*>	0	.	END=20001070;MinDP=33;QS=1.5,0.5	PL:DP:DV	0,4,10:4:1	0,4,10:4:1	.:.:.
@@ -44,9 +44,9 @@
 6	630	.	T	<*>	.	.	.	PL	66,1,1	66,2,3	.
 6	631	.	N	<*>	.	.	END=666	PL	66,1,1	.	.
 7	701	.	T	<*>	.	.	.	PL	77,1,1	77,2,1	.
-7	702	.	T	<*>	.	.	.	PL	.	77,2,2	.
-7	703	.	T	<*>	.	.	.	PL	77,1,2	.	.
+7	702	.	T	<*>	.	.	.	PL	77,1,1	77,2,2	.
+7	703	.	T	<*>	.	.	.	PL	77,1,2	77,2,2	.
 7	704	.	N	<*>	.	.	END=777	PL	77,1,2	.	.
 8	1	.	T	<*>	.	.	END=2	PL	88,1,1	.	.
-8	3	.	T	<*>,A	.	.	.	PL	88,1,1,.,.,.	88,.,.,2,.,1	88,.,.,3,.,1
+8	3	.	T	<*>,A	.	.	.	PL	88,1,1,1,1,1	88,88,88,2,88,1	88,88,88,3,88,1
 8	4	.	N	<*>	.	.	END=10	PL	88,1,1	.	.
diff --git a/test/merge.gvcf.5.1.out b/test/merge.gvcf.5.1.out
new file mode 100644
index 000000000..74f6efbd1
--- /dev/null
+++ b/test/merge.gvcf.5.1.out
@@ -0,0 +1,10 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="All filters passed">
+##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
+##contig=<ID=chr22,length=10514891>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	11	13
+chr22	10510106	.	T	<NON_REF>	.	.	END=10510112	GT	0/0	./.
+chr22	10510113	.	C	<NON_REF>,T	.	.	.	GT	0/0	2/2
+chr22	10510114	.	N	<NON_REF>	.	.	END=10510117	GT	0/0	./.
diff --git a/test/merge.gvcf.5.a.vcf b/test/merge.gvcf.5.a.vcf
new file mode 100644
index 000000000..1a77662e3
--- /dev/null
+++ b/test/merge.gvcf.5.a.vcf
@@ -0,0 +1,7 @@
+##fileformat=VCFv4.2
+##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
+##contig=<ID=chr22,length=10514891>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	11
+chr22	10510106	.	T	<NON_REF>	.	.	END=10510117	GT	0/0
diff --git a/test/merge.gvcf.5.b.vcf b/test/merge.gvcf.5.b.vcf
new file mode 100644
index 000000000..9dda0326b
--- /dev/null
+++ b/test/merge.gvcf.5.b.vcf
@@ -0,0 +1,7 @@
+##fileformat=VCFv4.2
+##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
+##contig=<ID=chr22,length=10514891>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	13
+chr22	10510113	.	C	T,<NON_REF>	.	.	.	GT	1/1
diff --git a/test/merge.mrules.1.1.out b/test/merge.mrules.1.1.out
new file mode 100644
index 000000000..b22420e09
--- /dev/null
+++ b/test/merge.mrules.1.1.out
@@ -0,0 +1,19 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="All filters passed">
+##reference=file://hs38DH.fa
+##contig=<ID=chr1>
+##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
+##FORMAT=<ID=PL,Number=G,Type=Integer,Description="List of Phred-scaled genotype likelihoods">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Number of high-quality bases">
+##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum per-sample depth in this gVCF block">
+##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Minimum per-sample depth in this gVCF block">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Minimum per-sample depth in this gVCF block">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	SampleA	SampleB
+chr1	1769963	.	A	<NON_REF>	.	.	END=1769967	GT:PL	0/0:0,3,45	./.:.
+chr1	1769968	.	T	<NON_REF>	.	.	.	GT:PL	0/0:0,3,45	0/0:0,18,270
+chr1	1769969	.	CAAAACAAAAACA	CAAAACA,<NON_REF>,C	.	.	.	GT:AD:PL	1/1:0,9,0,0:405,27,0,405,27,405,405,405,405,405	3/3:0,0,0,4:181,181,181,181,181,181,12,181,12,0
+chr1	1769976	.	A	<NON_REF>	.	.	.	GT:PL	0/0:0,0,0	./.:.
+chr1	1769982	.	A	<NON_REF>	.	.	.	GT:PL	./.:.	0/0:0,0,0
+chr1	1769983	.	C	T,A	.	.	.	GT:AD:PL	1/1:0,9,0:405,27,0,405,405,405	2/2:0,0,4:181,181,181,12,181,0
+chr1	1769990	.	CAAAACAAAAACA	CAAAACA,<NON_REF>,C	.	.	.	GT:AD:PL	1:0,9,0,0:405,27,0,0	3:0,0,0,4:181,0,0,12
+chr1	1769991	.	C	T,A	.	.	.	GT:AD:PL	1:0,9,0:405,0,405	2:0,0,4:181,181,0
diff --git a/test/merge.mrules.1.2.out b/test/merge.mrules.1.2.out
new file mode 100644
index 000000000..c935f2b7c
--- /dev/null
+++ b/test/merge.mrules.1.2.out
@@ -0,0 +1,19 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="All filters passed">
+##reference=file://hs38DH.fa
+##contig=<ID=chr1>
+##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
+##FORMAT=<ID=PL,Number=G,Type=Integer,Description="List of Phred-scaled genotype likelihoods">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Number of high-quality bases">
+##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum per-sample depth in this gVCF block">
+##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Minimum per-sample depth in this gVCF block">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Minimum per-sample depth in this gVCF block">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	SampleA	SampleB
+chr1	1769963	.	A	<NON_REF>	.	.	END=1769967	GT:PL	0/0:0,3,45	./.:.
+chr1	1769968	.	T	<NON_REF>	.	.	.	GT:PL	0/0:0,3,45	0/0:0,18,270
+chr1	1769969	.	CAAAACAAAAACA	CAAAACA,<NON_REF>,C	.	.	.	GT:AD:PL	1/1:0,9,0,0:405,27,0,405,27,405,405,405,405,405	3/3:0,0,0,4:181,181,181,181,181,181,12,181,12,0
+chr1	1769976	.	A	<NON_REF>	.	.	.	GT:PL	0/0:0,0,0	./.:.
+chr1	1769982	.	A	<NON_REF>	.	.	.	GT:PL	./.:.	0/0:0,0,0
+chr1	1769983	.	C	T,A	.	.	.	GT:AD:PL	1/1:0,9,.:405,27,0,.,.,.	2/2:0,.,4:181,.,.,12,.,0
+chr1	1769990	.	CAAAACAAAAACA	CAAAACA,<NON_REF>,C	.	.	.	GT:AD:PL	1:0,9,0,0:405,27,0,0	3:0,0,0,4:181,0,0,12
+chr1	1769991	.	C	T,A	.	.	.	GT:AD:PL	1:0,9,.:405,0,.	2:0,.,4:181,.,0
diff --git a/test/merge.mrules.1.a.vcf b/test/merge.mrules.1.a.vcf
new file mode 100644
index 000000000..cd09132fe
--- /dev/null
+++ b/test/merge.mrules.1.a.vcf
@@ -0,0 +1,16 @@
+##fileformat=VCFv4.2
+##reference=file://hs38DH.fa
+##contig=<ID=chr1>
+##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
+##FORMAT=<ID=PL,Number=G,Type=Integer,Description="List of Phred-scaled genotype likelihoods">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Number of high-quality bases">
+##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum per-sample depth in this gVCF block">
+##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Minimum per-sample depth in this gVCF block">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Minimum per-sample depth in this gVCF block">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	SampleA
+chr1	1769963	.	A	<NON_REF>	.	.	END=1769968	GT:PL	0/0:0,3,45
+chr1	1769969	.	CAAAACA	C,<NON_REF>	.	.	.	GT:AD:PL	1/1:0,9,0:405,27,0,405,27,405
+chr1	1769976	.	A	<NON_REF>	.	.	END=1769976	GT:PL	0/0:0,0,0
+chr1	1769983	.	C	T	.	.	.	GT:AD:PL	1/1:0,9:405,27,0
+chr1	1769990	.	CAAAACA	C,<NON_REF>	.	.	.	GT:AD:PL	1:0,9,0:405,27,0
+chr1	1769991	.	C	T	.	.	.	GT:AD:PL	1:0,9:405,0
diff --git a/test/merge.mrules.1.b.vcf b/test/merge.mrules.1.b.vcf
new file mode 100644
index 000000000..8345153a7
--- /dev/null
+++ b/test/merge.mrules.1.b.vcf
@@ -0,0 +1,16 @@
+##fileformat=VCFv4.2
+##reference=file://hs38DH.fa
+##contig=<ID=chr1>
+##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
+##FORMAT=<ID=PL,Number=G,Type=Integer,Description="List of Phred-scaled genotype likelihoods">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Number of high-quality bases">
+##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum per-sample depth in this gVCF block">
+##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Minimum per-sample depth in this gVCF block">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Minimum per-sample depth in this gVCF block">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	SampleB
+chr1	1769968	.	T	<NON_REF>	.	.	END=1769968	GT:PL	0/0:0,18,270
+chr1	1769969	.	CAAAACAAAAACA	C,<NON_REF>	.	.	.	GT:AD:PL	1/1:0,4,0:181,12,0,181,12,181
+chr1	1769982	.	A	<NON_REF>	.	.	END=1769982	GT:PL	0/0:0,0,0
+chr1	1769983	.	C	A	.	.	.	GT:AD:PL	1/1:0,4:181,12,0
+chr1	1769990	.	CAAAACAAAAACA	C,<NON_REF>	.	.	.	GT:AD:PL	1:0,4,0:181,12,0
+chr1	1769991	.	C	A	.	.	.	GT:AD:PL	1:0,4:181,0
diff --git a/test/norm.4.1.out b/test/norm.4.1.out
index 6df7de93d..a786de4c1 100644
--- a/test/norm.4.1.out
+++ b/test/norm.4.1.out
@@ -4,4 +4,4 @@
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sample
 1	10	.	T	C	.	.	.	GT	0/1
-1	10	.	T	TAC,TATAC,TATATAC,TATATATAC	.	.	.	GT	2/0
+1	10	.	T	TAC,TATAC,TATATAC,TATATATAC	.	.	.	GT	0/2
diff --git a/test/norm.merge.2.out b/test/norm.merge.2.out
index a81c76bc7..d3c14147c 100644
--- a/test/norm.merge.2.out
+++ b/test/norm.merge.2.out
@@ -33,6 +33,6 @@
 ##FILTER=<ID=FAIL1,Description="Failed filter 1">
 ##FILTER=<ID=FAIL2,Description="Failed filter 2">
 #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	XY00001	XY00002	XY00003
-1	105	.	T	C	999	PASS	.	GT:FGI:FRI	1:1,2:3,4	1/0:5,6,7:8,9	1/0:.:.,.
+1	105	.	T	C	999	PASS	.	GT:FGI:FRI	1:1,2:3,4	0/1:5,6,7:8,9	0/1:.:.,.
 1	110	.	C	A	999	PASS	.	GT:FGI	1:1,2	0:3,4	0:.
 1	150	.	A	C	999	PASS	.	GT:FGI	1:1,2	0:.	0:3,4
diff --git a/test/norm.phased-join.1.out b/test/norm.phased-join.1.out
new file mode 100644
index 000000000..6f5c01718
--- /dev/null
+++ b/test/norm.phased-join.1.out
@@ -0,0 +1,6 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="All filters passed">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##contig=<ID=20,length=2147483647>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sample
+20	1	.	A	C,G,T	.	.	.	GT	3|2|1|0
diff --git a/test/norm.phased-join.vcf b/test/norm.phased-join.vcf
new file mode 100644
index 000000000..9afdad96f
--- /dev/null
+++ b/test/norm.phased-join.vcf
@@ -0,0 +1,8 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="All filters passed">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##contig=<ID=20,length=2147483647>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sample
+20	1	.	A	C	.	.	.	GT	0|0|1|0
+20	1	.	A	G	.	.	.	GT	0|1|0|0
+20	1	.	A	T	.	.	.	GT	1|0|0|0
diff --git a/test/norm.phased-split.1.out b/test/norm.phased-split.1.out
new file mode 100644
index 000000000..9afdad96f
--- /dev/null
+++ b/test/norm.phased-split.1.out
@@ -0,0 +1,8 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="All filters passed">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##contig=<ID=20,length=2147483647>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sample
+20	1	.	A	C	.	.	.	GT	0|0|1|0
+20	1	.	A	G	.	.	.	GT	0|1|0|0
+20	1	.	A	T	.	.	.	GT	1|0|0|0
diff --git a/test/norm.phased-split.vcf b/test/norm.phased-split.vcf
new file mode 100644
index 000000000..2b44d34be
--- /dev/null
+++ b/test/norm.phased-split.vcf
@@ -0,0 +1,5 @@
+##fileformat=VCFv4.2
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##contig=<ID=20,length=2147483647>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sample
+20	1	.	A	C,G,T	.	.	.	GT	3|2|1|0
diff --git a/test/norm.right-align.1.out b/test/norm.right-align.1.out
new file mode 100644
index 000000000..f72145e5a
--- /dev/null
+++ b/test/norm.right-align.1.out
@@ -0,0 +1,10 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="All filters passed">
+##contig=<ID=7,length=249250621>
+##INFO=<ID=type,Number=.,Type=String,Description="">
+##INFO=<ID=EXP,Number=1,Type=String,Description="Expected consequence">
+##INFO=<ID=EXPL,Number=1,Type=String,Description="Expected consequence with bt/csq -l">
+##INFO=<ID=ORI,Number=1,Type=String,Description="Original variant. Format: CHR|POS|REF|ALT|USED_ALT_IDX">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+7	897	.	GGAATTAAGA	G	.	.	.
+7	910	.	G	C	.	.	.
diff --git a/test/norm.right-align.2.out b/test/norm.right-align.2.out
new file mode 100644
index 000000000..0f5e30ff8
--- /dev/null
+++ b/test/norm.right-align.2.out
@@ -0,0 +1,10 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="All filters passed">
+##contig=<ID=7,length=249250621>
+##INFO=<ID=type,Number=.,Type=String,Description="">
+##INFO=<ID=EXP,Number=1,Type=String,Description="Expected consequence">
+##INFO=<ID=EXPL,Number=1,Type=String,Description="Expected consequence with bt/csq -l">
+##INFO=<ID=ORI,Number=1,Type=String,Description="Original variant. Format: CHR|POS|REF|ALT|USED_ALT_IDX">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+7	900	.	ATTAAGAGAA	A	.	.	ORI=7|897|GGAATTAAGA|G
+7	910	.	G	C	.	.	.
diff --git a/test/norm.right-align.fa b/test/norm.right-align.fa
new file mode 100644
index 000000000..31ea7c86c
--- /dev/null
+++ b/test/norm.right-align.fa
@@ -0,0 +1,488 @@
+>7	7:154955-184155
+TGAGGGCTGAGGTGACCCTTGTCTCTGTGTTCTTGTCCCCCCCAGCTTGTGGAGCCTCTT
+ACACCCAGTGGAGAAGCTCCCAACCAAGCTCTCTTGAGGATCTTGAAGGAAACTGAATTC
+AAAAAGATCAAAGTGCTGGGCTCCGGTGCGTTCGGCACGGTGTATAAGGTAAGGTCCCTG
+GCACAGGCCTCTGGGCTGGGCCGCAGGGCCTCTCATGGTCTGGTGGGGAGCCCAGAGTCC
+TTGCAAGCTGTATATTTCCATCATCTACTTTACTCTTTGTTTCACTGAGTGTTTGGGAAA
+CTCCAGTGTTTTTCCCAAGTTATTGAGAGGAAATCTTTTATAACCACAGTAATCAGTGGT
+CCTGTGAGACCAATTCACAGACCAAAGGCATTTTTATGAAAGGGGCCATTGACCTTGCCA
+TGGGGTGCAGCACAGGGCGGGAGGAGGGCCGCCTCTCACCGCACGGCATCAGAATGCAGC
+CCAGCTGAAATGGGCTCATCTTCGTTTGCTTCTTCTAGATCCTCTTTGCATGAAATCTGA
+TTTCAGTTAGGCCTAGACGCAGCATCATTAAATTCTGGATGAAATGATCCACACGGACTT
+TATAACAGGCTTTACAAGCTTGAGATTCTTTTATCTAAATAATCAGTGTGATTCGTGGAG
+CCCAACAGCTGCAGGGCTGCGGGGGCGTCACAGCCCCCAGCAATATCAGCCTTAGGTGCG
+GCTCCACAGCCCCAGTGTCCCTCACCTTCGGGGTGCATCGCTGGTAACATCCACCCAGAT
+CACTGGGCAGCATGTGGCACCATCTCACAATTGCCAGTTAACGTCTTCCTTCTCTCTCTG
+TCATAGGGACTCTGGATCCCAGAAGGTGAGAAAGTTAAAATTCCCGTCGCTATCAAGGAA
+TTAAGAGAAGCAACATCTCCGAAAGCCAACAAGGAAATCCTCGATGTGAGTTTCTGCTTT
+GCTGTGTGGGGGTCCATGGCTCTGAACCTCAGGCCCACCTTTTCTCATGTCTGGCAGCTG
+CTCTGCTCTAGACCCTGCTCATCTCCACATCCTAAATGTTCACTTTCTATGTCTTTCCCT
+TTCTAGCTCTAGTGGGTATAACTCCCTCCCCTTAGAGACAGCACTGGCCTCTCCCATGCT
+GGTATCCACCCCAAAAGGCTGGAAACAGGCAATTACTGGCATCTACCCAGCACTAGTTTC
+TTGACACGCATGATGAGTGAGTGCTCTTGGTGAGCCTGGAGCATGGGTATTGTTTTTGGT
+ATTTTTTGGATGAAGAAATGGAGGCATAAAGAAATTGGCTGACCCTTATATGGCTGGGAT
+AGGGTTTAAGCCCCTTGTTATTTCTGACTCTGAAACTTGCATTCAATTCACTCCACCAAG
+TTATCTCATCTTTGAAATGGCTTTTTTTAAAGGTGCCTAGAATATGATGGCGTGCAGTCT
+ATAAACTGTTGCCCACCTTCTGTACTTTCTCTCAGAATAATTCACATTCTTCTCCAGTGT
+CTGTTGATTGTTACTTTGTGGAATAAGTTCTTGGAAAATTCCACAAGATTATTGTTATCT
+TCTTACTACCAATTCTATTGAACTTTCTCCACCTTCTCTGGGCCTTCCCCAGCCAGTGGT
+GGGAAGATGCTGGCTGGAGTCTGACAGAGCCTCTTCTACACTGGCCTGGGCTTGCTGTGA
+GTTGGTGGAAACCTTTGCTCTTGTCCCAACACAGAGCAAGTGAAAGAGGAGGTCAAGGGG
+CTCAGGCAGCGGACTAGGGAAGCAGAATCGAGGAAAAGGAAAAATGGCTGACTTATTACC
+TCAAAACTCTAGAGAATTTAGTTGATCTTACAGCCAAGAAGGACAAAAGCCAGAGAGTAA
+TATCCTCCGCCTCATGTCTAACCCACAGAATACATAGCAAGTAAAGAGAACATGGGCCTT
+TATAAAAATGTCTTAAGATACAATTTTTTAATTGGAGGAAATCTACAGTTTAATTTTCTC
+TGGGCAGCTTTTCTTCCTTTTATTATAGTAGGGGAAATCCCATGTTGATATACTTCTAAA
+TGAAAGATGATGAATTGATATAATACAATAAAAAATCTGTAAAATTGATGATATACTTAT
+CAAGAAAAATTAGCTTTCATTTTAACGGTTTACAAATTGAGTCAAGTCCTAGTAACAAAA
+TGTTAAGTCTATTAACATAACCACAAGAAATACAGGAAGACGGGCAATCTGTGAAGCCTT
+TCACTTACAATCTCTGGCCCCTCACCTGTGCTGTGTAGGAAAATCTTTGTGCACAATTTG
+CTTCCTTAATTCATTTTTTATTCATTCAACACATTCTAATAAATTATACAAAATCATGTT
+GAAATGTGAATTTCAGTGGTATTTATAAATGCAGTGTGAGGAGGGTTTGGATGTATTCTA
+AGACAATAGTTGTGCTTTGGGAAGGAAGCAGTGTTCACTGAAAAGTGCCCCCAGGACCTT
+TTAATTGGAGGAAATATGCTTCTGTGGAGTTGGAAATGGGGTAGAAGATAGATAAGGTCA
+AGGCTTAAAAGTTAAGTGCACCCAACATCTGAAGCGTCCATGGGCCTGGCATGGTGGCTT
+TCGCCTGTAATCCCAGCACTTTGGGAGGCTGAGGCAGGAGGATCCCTTGAGCTTAGGAGT
+TTGAGACCAGCCTGGGCAACATACTGAGACCCAGTCTCTACAAAAAATAAAAAATTAGCT
+GGGTGTGGTGTCTCATGCCTGTAGTCCCAGCCACTCAGGAGATGGGAAGATGGCTTGAGT
+CCAGGAGATCTAGGCTGCAGTGAGCTAAAATCTCACCACTGCACTCCAGCCTGGGTGACA
+AAGCAAGACCCTGCTCAAAAAAATAGTTAGATATAAATATTAATATAGATACCTATATAT
+ATCTGAATATAGATATCTATATATACTCTGTATATAGTTATTTAGATATATAAATATATA
+TGATATATATTTAGAGAGATATATATTTAGAGAGATATATATTTAGAGATTTATATATAT
+TTTATATATATTTAGAGATATATATCTCTAAATATATATCTCTCTCTAAATATATATATA
+TCTCTCTCTAAATATATATATATCCCTAAATATATTAAATAAATAAAAGAAATAAAAGAA
+AGCTCAGTTTGGCCTCCTGCTTGTCCTGTCTCCTCATCCCCTCTTCCCCCTCCATCATTT
+TATTTCCTTGCCCCATGTTTCTTCACTGCGGCCATGTCCCCCCTCCTCTCCAATGATGGA
+TGTCATGTCTGCTGCAGTCAGAGGGCGACAAGCCTGGAGTGTTCCCTGAAGCCTGTGGTT
+TGTGGTTTGTCCTGCAGCTCAGGCTGCCCAGGCCTCACCAGCAATCCTGGCGGGCAGGGC
+ACCACACTGGGATGGAGAGGGGGAAGCTGGAGGAGGCACTTTCTGGTAAAGAAAGCAAAA
+GCCAGCAGTGCCCAGGCCAATTTCAACAGGGAGTTAAATAGCACCTTAATCCTGTGGCAG
+GACAGCTCATGGGGCCATGTGTGCTCTTAGAAAGACTCACATGCACGCATGCACGGCAGC
+AATGACTCCATACTCACGTTCCCCTGCAGACACCAGGCCCCCACAGCCGGCACACACACT
+GCAGCCCCAGTTCCATGTTGCTAGCAGTGGCTTAGTGAATGAGTAAAGTTCTTAAAATGC
+AGGGGACACCTGCCCTTCATTCATAAGGCTGGACGTACACCTCTCCTTAAGGAGTTCAAG
+AGCTAGTGGAATCCCAATTCATACGGTAGAGCCATTCACAGATGAGAGAGACAAGCCAGA
+AGGAAGGAACCAAAAGTCATGTCAGCAGTTAGGACAAAATAACAGGCTTTCAAGGTCACA
+AAGCCTCAGGGACACTCCTGCGGTGGGACTGGGCTAGGAGCCATGGGGGCTCCAACTGTG
+CGCTCTGCCTGCCAGCCTGTGGGTGCTGGGGCTCCACGAAGATTGTTGTGGAATACCAAG
+CATGCTTGCTGTAGGTCACGGTGCACGTTTACTACTTCCAAGACAAACAGCCGAGAACAA
+AGCTCGCTTTAGCTTCTGCGTACACCGAACGGGACACACGACTGAACAGCGTTCCCATTG
+TGCCTGCTGGGTGGGGAGGAAGTGATGGCCCAGTGGGTCTATCAGATGTTAGTAGGATGG
+GGCCTGGCGGGGCTCCAGGCTCTGTGTGGCCGACACCCACGCCCCCCGCTCTGCTCCCCA
+TTCCCAGCCCCAGGTCAGCCCTGCGAGGCCCTGCAGCAGATGGGCTGCTCAAACTGCTCT
+GGTTTGCAGATTTTTCTTCCCTCTCAAATGAATACAATATGTTTTCAAGTCTCAACCAGA
+TCTTGAGAAAATAGGAAGAGCCAGAGGGTTTCTTTGGTGTTATGGTTGTACAGCTTCCCA
+GACTCCGGGGGAGAGATGTGATTTGTGCTTTCTGGCAATCCCATGGCGTATTAAATTTTC
+ATAGGCTTTCCAGTTTAAATTTAGGGTAGGCAATGGAAGGGAACGCAAAACAGATTTCTA
+GGTGTACTGTGTGTGTGTCTCCCACGTCTAAAGTCTGTTAACTGGAGCACCCAACAGGCC
+CCACAGGCTGCCTTCACACAGAGGACCTGGGGCGCCTCCGACCCATTGGGGTGAGCAGTG
+GGCCATGGAGGGAGCCAGGGTCAGGAGACCTGGTTGTGGGCCTGACCTGACCCTGCTCAG
+GGTGGCCTCAGGTGGGCCGTTCACCTCGTCAGCCTCAGCTTACCCTCTGACTACAGTGAC
+CTCAGACAAAATACGCTTCCTGGCCCTGTCCAGTTCTGACTTTTTATAAACAAGCACTTA
+TCCAAGTTAAAGGGATATTTTCAATATCTACTGAGTCCACAGATATTAAATATCTCCTCT
+CTTCTTTAAAATTGTGGCATTATCTTTAGAATATAAAAGGAAAATAACACACACTCTCCT
+TGAAAATAGAGAGCCTAAACACTCTGCAGGAAATATTTAAAGCTATAGTTTTTGTTTGTT
+TGTCTTGAATGCAAGTGGCCTGGACTTTGACTTGCTTTGAGTCTTTGACCTTCATGACTT
+CAGTACAGTTCAACCCTGACAGTTTTGAAGTAGGTATGTGCCTAGATCTGCCCTAGTCCC
+TGCTGGAATGTTGAAGAAGCAAAGGTCCAGGCCCTCAGAGCACTTGCCACGTACTTGCCA
+ACAGATACGGGGCGGAGACTTGAGTCAACGTAAGAGCAAGTGTGTGCCGGGTGATCCGAC
+ACTGCAGAGCGCCAGCTAGACCCTAAGCGTGTGCTAGGGGCTGACCAAGCCGTTCTTTCC
+TCAAAAACTTGGTGGGGAGGGTATTTTTAAAATCACACAAATATTTAAGTACAGATTATG
+ATGACTGCCTCAAAGCAGTGGCTCTTCAGCTTCATCAAGCTTCAGAGTCCAGAGGGTTTG
+TTCATATGGAAGGCTAGGCCTGTCTCCTGCATTTCACCCTCTTGGCCTGGGGGCGGGACC
+CAAGAATGTGTGGCTCTAAAAGGTTCCCAGGCAATGCTGAGGCTGCTTTCTGAAGGAAAA
+ACTGCAAGATACCAGGAGAGTTTCATTTAGATTGAAGAGTCGAGGAAGGCTCCTCTGAGA
+AAGAGTCTGCTAAGGAAGGAGGAGGTGGGTTCTGGGGACAGAGGTTCTCCCGTGGGTAAG
+GGTGGAGGGAAGCTCTCCTGGGGAGAAGGTGGGCAGGAGGACCAGAGGCTGGAGGGAGGA
+GGGCAGTCAGCCTCGGGGCTTCCCAGGAACAGGGACGGCCAGGGCAGGGTTTAGGGCAAG
+GAAAGCGTGTGAGCATATTTGTATTTTAGTAAATATTTACAGTTTGCCCTCCATGTCTGC
+AGTTTCATATCCATGGATTCAATCAACCACAATGAAAAACGTTGGGGAAAAAAATTGCAT
+CGGTACTGAACATATACGGACTTTTTTTCTTGTCATTATTCCCTAAACAATACAGCATAA
+CAATTATTCACATAGCATTTGCACTGTATTAGGTACTATAGGTAATCAGGAGATGCTGTA
+GATGGGAGGATGTCTGTAGGTTACACACAAATGCTGTGCCACTTTATATCAGGGGCTTGA
+GCATCCTCACATTTTGATATTTAAGGGAGGTCCTGGAACCAATTCCCCAGATACTGAGGG
+TCCACTGTCTGTGTCCCCTCGCCCCACCTTGCCTTTGTCTCCTGTCTCCTATCTCCACCC
+TGCCTCCCGCCAGCCTGTTGCTCCTGACCTGCCCGGGCACCCTGGAGCAGCACCCTATCT
+CAGAGCCTGGCTCAGTGTGTTCACTTCTGCAGAGAAACTAACTTGCCCAAGTCCACACTC
+AAAACATAGGCATTGCTGAGATGTGAAAAGCAGCTGTGGATGCTTTCTGCTACAGTCTGT
+GTGTTCTTTTCCATATCTGAATAAAAGGTCACCACCATTTGTATTTTAAAGAGAAAGAGA
+ATTTATGGGTGGAAATTGGGGATTCCCTCATTCTCAGTCAGACAGAAAAGAGGGCCCCAT
+TGTGTGCCTGATTGCAAATAAATTTAGCTTCCTCAGCCCAAGAATAGCAGAAGGGTTAAA
+ATAAAGTCTGTATTTATGGCTCTGTCAAAGGAAGGCCCCTGCCTTGGCAGCCAGCCGGAA
+TTAGCAGGGCAGCAGATGCCTGACTCAGTGCAGCATGGATTTCCCATAGGGAGCCTGGGG
+GCACAGCACAGAGAGACCACTTCTCTTTAGAAATGGGTCCCGGGCAGCCAGGCAGCCTTT
+AGTCACTGTAGATTGAATGCTCTGTCCATTTCAAAACCTGGGACTGGTCTATTGAAAGAG
+CTTATCCAGCTACTCTTTGCAGAGGTGCTGTGGGCAGGGTCCCCAGCCCAAATGCCCACC
+CATTTCCCAGAGCACAGTCAGGGCCAAGCCTGGCCTGTGGGGAAGGGAGGCCTTTCTCCC
+TGCTGGCTCGGTGCTCCCCGGATGCCTTCTCCATCGCTTGTCCTCTGCAGCACCCACAGC
+CAGCGTTCCTGATGTGCAGGGTCAGTCATTACCCAGGGTGTTCCGGACCCCACACAGATT
+CCTACAGGCCCTCATGATATTTTAAAACACAGCATCCTCAACCTTGAGGCGGAGGTCTTC
+ATAACAAAGATACTATCAGTTCCCAAACTCAGAGATCAGGTGACTCCGACTCCTCCTTTA
+TCCAATGTGCTCCTCATGGCCACTGTTGCCTGGGCCTCTCTGTCATGGGGAATCCCCAGA
+TGCACCCAGGAGGGGCCCTCTCCCACTGCATCTGTCACTTCACAGCCCTGCGTAAACGTC
+CCTGTGCTAGGTCTTTTGCAGGCACAGCTTTTCCTCCATGAGTACGTATTTTGAAACTCA
+AGATCGCATTCATGCGTCTTCACCTGGAAGGGGTCCATGTGCCCCTCCTTCTGGCCACCA
+TGCGAAGCCACACTGACGTGCCTCTCCCTCCCTCCAGGAAGCCTACGTGATGGCCAGCGT
+GGACAACCCCCACGTGTGCCGCCTGCTGGGCATCTGCCTCACCTCCACCGTGCAGCTCAT
+CACGCAGCTCATGCCCTTCGGCTGCCTCCTGGACTATGTCCGGGAACACAAAGACAATAT
+TGGCTCCCAGTACCTGCTCAACTGGTGTGTGCAGATCGCAAAGGTAATCAGGGAAGGGAG
+ATACGGGGAGGGGAGATAAGGAGCCAGGATCCTCACATGCGGTCTGCGCTCCTGGGATAG
+CAAGAGTTTGCCATGGGGATATGTGTGTGCGTGCATGCAGCACACACACATTCCTTTATT
+TTGGATTCAATCAAGTTGATCTTCTTGTGCACAAATCAGTGCCTGTCCCATCTGCATGTG
+GAAACTCTCATCAATCAGCTACCTTTGAAGAATTTTCTCTTTATTGAGTGCTCAGTGTGG
+TCTGATGTCTCTGTTCTTATTTCTCTGGAATTCTTTGTGAATACTGTGGTGATTTGTAGT
+GGAGAAGGAATATTGCTTCCCCCATTCAGGACTTGATAACAAGGTAAGCAAGCCAGGCCA
+AGGCCAGGAGGACCCAGGTGATAGTGGTGGAGTGGAGCAGGTGCCTTGCAGGAGGCCCAG
+TGAGGAGGTGCAAGGAGCTGACAGAGGGCGCAGCTGCTGCTGCTATGTGGCTGGGGCCTT
+GGCTAAGTGTCCCCCTTTCCACAGGCTCGCTCCAGAGCCAGGGCGGGGCTGAGAGAGCAG
+AGTGGTCAGGTAGCCCTGCCTGGGTGCTGGAGACAGGCACAGAACAACAAGCCAGGTATT
+TCACAGCTGGTGCGGACCCAGAAAGACTTCTGCTTTTGCCCCAAACCCCTCCCATCTCCA
+TCCCAGTCTTGCATCAGTTATTTGCACTCAACTTGCTAAGTCCTATTTTTTTCTAACAAT
+GGGTATACATTTCATCCCATTGACTTTAAAGGATTTGCAGGCAGGCCCTGTCTCTGAGAA
+TACGCCGTTGCCCGTCATCTCTCTCCGACAGCAGGGCAGGGGGTCCAGAGATGTGCCAGG
+GACCAGAGGGAGGGAGCAGACACCCACCCGGCCTGGGCAGGTCCTCCTCATTGCTTGCAT
+CCGCCTGGTTAGCAGTGGCAGTCAGTCCTGCCGAGTCATTCGTGAGGCGCTCACCCAACT
+CCAGGCAGATGTAAAAGGTGACCTACAAGAAGACAAACAAAAACATCTGGAGCGCTCTTA
+TGCCAGCATCTGCCCTTGACACCACCAGGCAGGCTGTTGCTGGGAGCCGTGGTGCTTGGG
+TAAGCTCCTTCCCATGGCAGAGCTCCTGGGACGCATTGTAGAAGCAGGGACCACCTCCCA
+GGATAACCAGATAGCAGCACACCCTGCACAGCCCCTTTTACTCCAGCATCATCGGGCATT
+GATATCTCAGCTGCAGCCACAGGCGGCCCCCAGCACCCCAGGAAGTGGGGAGCGCTCATG
+CTTCTCTGAGCACAAAAATCACTGAATATTTTTGCCATTCTCATGGTCATAACCCGGGCC
+ACAGAGTAGAACACTCCTATCACTGTTGTTAGACAGTGGTCCTGGGAGAGGGTCTTGTGT
+GCCTCGGATGCCAGGGCCTCTTTTTATTGGGAGGTGCTTGTTATTTCTGTGTGTGGCTGC
+ATTTGTTTCCCAAGACTGCCACAACAAATCATCACCAACTTGGTAGCTCAACATAGCACA
+GCTTTATTCCCTCCTGGCTCTGGAGGCCAGGTGTCTAAAAGGCCATGCTCCCACAATGGT
+TCTGAGGAGGATCCTTCCTGCCTCTCTGGCTTCTGGTGGCTCCAGCATCCCTGGGCTGTG
+GCTGCACCTCCCCATGTCAACCTCCGTCTTCACAAGGCCTTTTCCTGTGTCTCTGCAACC
+ACAGGCCCCTCTCCTTTCTCTTAATAAAGATACCAGTCATTGAGTTTGAAAATTGCTAAG
+AGAGTCTGTTGTAAATCTTCTTAGCACAAAAAAAAATGACAGATATGTGAAGTGGTAGAT
+ATATTAATTAGTTTGATTTGATCACTCCGCTATGTGTATAAATGTCAAAACAAACATTGC
+ACTCCATAAATATATATATTAAAAAAGATCCCAGTCATTGCATTTAGGACCCACCCTAAA
+TCCAGGATGATTTCATTTCAAGACTTTTAACTAGATTTGCAAAACCCCATTTCCAAATAA
+GGTCACATTCTGCAGTTTTGGGTAGACGTGAAATGTGGAGACACTGTGCAACCCACTGTC
+TTGGGGAGGGGGTGGTCAGCCTGGGGCAGATGTTGCTGGGTGTGGAGCTACATCCACTCA
+TGCCCTGACCTGGAACCCAGACCTGCTTCCCCAGCTCTCCTCCTGGTTATCTGAAGCAGG
+GAATGGAGAGCACTGCCCTCCTTGCCCAGGCAGTCTCTATCACCTGGTTTTAGTTTCTTC
+TTAGCACATATTGCCCCAGAATATCTGGTTGGTTTATGGCTTACTTGAGTTTGTGCCTAC
+CTGTCCCAACCGGGAGGTGAGCCCTGGCTATTCCCCAAACCCGGCCCTGCATGTGGGAGC
+TGCCCTTCCTCCGTTCATCAGAGGGGGCCAACAGTCCACAGCTGTTCTTAATCATCTCCC
+AGTAACCCCCAGCTCCACAAAGGTGACTCCTTACATGGTGGAGAGGTGGTCGGGCCATCC
+GTGTGAAATGTGTATGTGACCGTTTTCCTTAAGGGGCACGTAGTCTTGGCAGGTTTCGCT
+CAATATAGGATGAGCTCAGGACTCCAGTGGACTGTGGATTCAGATCTGGATTCTGGCGCA
+TTCGCCGTGTGAACGGGGGCACGTTGCTGGCCTGTCTGCGCCTCGTCTCCCGACTGTGGA
+GTGTGTTCTGCCCCTTGTCTTTCTGGGAGGTAGGGAGGGCAGTGAGCCCCTTCGCATCGC
+CCACCACAGGCCCAGCACATGGCTGATCCCCACTGAGTGTTCTTTTCCTCCTTTGATCCC
+CTTTGGCTGACCTAGGTTGGAGCAGCCACTAAAATATACCCAGAAACATCTTCCTAATCT
+ACATCTGTGCCAACCCTCATTCCCTGGCGCAGCATGACCATCACATGCCCGCCATTGTTC
+CTGATCTCTGCTGCTCATGACCTGCTCTCCAGCGCTCCTTCTCATGCTCACATTCCAGTT
+GGCCTGACCTAGATAAGTGGAGGTTTATTTGACCCCAAAAATTAGCCTTCTACAAACGAA
+TATAATAGTGTCCATTACAGAGAATAAACTTAGTGCGTGTCCCATTTAAGCAGAAGTTAC
+TGAAAGCCTGAGTTTAAGTTTCCAGGGCCTGAAAGTTTTCCATGACAGTTTTCTGCATAA
+TATTACCTACAATTTCAATCTGTTATTTAAAGCCATTCTTGTGTTTGTTGTACTTTGATT
+AGCTTTATTTTGATTTGAAGTCCTTTTACATTACGGGCAGTTAACGCTTTGTCTCTGTTA
+GATTTGCTTTTTAGTTCACAAGAGAAACCTCATTCCTCTGTATTTGAATAGTTGCAATGA
+TGGAACAGCTGTCCCTGGAGGGAAATGAAAACAGTGATTCCCCAAATTGTGACAATAGAA
+ATTTGCTCTTGGGTTACTTACAATGTATCTGAGTATTAAAAAATTTTCTTTTTAAACGTT
+TGAAGTAAAACTACCCAGAAACACTTAGTGGCTGACCAGAAACTAAACTCCTGGCATCCT
+CAAAATGGGATTTATTGGCTTATAAATGTCCTGTGTTGACTCACAAAGGCACAAACTATC
+TAGGTAAGTTTTCTTCTAAATGTTGATGGGAGAGCTGGCCACTGTTATGCAAGTTTCATT
+GTCCTGACTAAACTGCCAAAGAGATTACATAAAATTATATCAACTAGACAAAAGGAAAAA
+GGAAAAAAAACAGAGGTGTCTTGGGAGGAATCCATATGAGACCAGTAGACCATGAGAGAG
+ACATCCCTTGCCATCTACAAGGAAAATGGATTTTGTTCTCCATATGCAAAACCATCTCAG
+GAGCTTGCGGAGACACCACTTGCTTACTAGCCAGAAAGAGCAGGTGCCTCCTAAATTCCC
+CACACAGGAGCTCACAGTGGCTTTCATGCACTGGGATTAAGTTAGACTTAAGAAAGCCTG
+TCTACTCTTCCTGGGATTTACAAGCCAGCTAGTAAATCCCAGAATAAATCACACGGCACA
+GTCATCCAAAGATCCCGTCATCCGTGCCGTTTGGAAAGCCCTGCTCCTGTGCCACCCTCT
+CCCCGTGGAGCCTCCCATGCCCAGGACTGCAGAGTCCTGCCATTCAGACTGCAACTCATC
+TCACATTCTTCCAAACTATTTGGACAACAGAGCTTTCTCATCACCTAATGCAGATTACAG
+TCTCACAGAATTGAGTGTTCAGGCAGACACTGATGTGGTTCTGTAGTACAGCAAACAATA
+TCAGTTTACAGTCCTGAGGCCAGGCCTGGTGAACAACGCACGGTAGCGGTGGGGCAGGGT
+TCTCAGAATGAAACTGGCTTACACATGGCACTCTCTGACCACAACTGTATAAGCACCAAA
+CTACACTTAGTTCCATCTATGAGGTAAAATTTAATGCAGATGAACATCAAAGAAAACGTC
+AAAGGCTCCTTTTTACAAGTACGTGGGCTACTTAATTTGGTCCAAGTCCATTTTAAAAAG
+CCCTAGGTGCTTTCACGGCTCTGCTACTGACAAGAAGCCCCAGTGCCTGTGAGCTGCTAA
+TGGGAGGGAGAGGAAGATGAGCTGAGTGGGCCGGGCTATCCCGTCCACACCGGGAGACAG
+GGAAGGAGACTCCAAGCTGGTGGTGCCAGCACATTCCAGGCCACTCAGGCCTATTCCTAG
+GTGCCAGGTCACGAAAACCACGCTGACAGATCGTGCTGTGTGCGTGTCATAGCACACAAG
+CAGGACTGTGAGAGAGTGAAAGTGACACTGGGTGGAGCACTGAGGAAGGGCCACAGTGTG
+TTGGTGGAGATAGGCTGTCATGGAGAAGAGACCCTGGCTTGCTCTACATTGCTTCCAATG
+CAACTGCAAGGCAGGTCCCAGAGGGCTCCGGCCTTCGTCATCCAGGTTTGCTCCCTCCCC
+TCATGGCTTTCCCATCCTCAGATGAGGACTCGGCAGAGCCTACCCCTGCTGACTAACTGT
+GGCCCCAGGGTGGTGACTCAGCCCTGCACCTCCTGATCCCGTCTGCACTGGGCCAGAGAG
+GATGACTTACCCAGCACGTTCACATCACACAGCTTTGTGGATTCCTAGGTCCAAGGACCA
+GAGATTTCAGTTATGTGAGTTATTTTTTTTATTTGTTCTTGCGTATTCCACAAAGGGTCG
+CAGCTAAACTTAACCTAATGATCACTTTAGTATATCACTAAAAAGACAAAGCTCACAGTG
+CTGTTGAAGCACATTCATCATCTTTAGACATTTTGACTAGTTATTTCTTAAGCATTTACC
+TGCTAGTGTTAAGCATCACATGAAATACATATAGAAGTAAGACAAAATTTCTTATCTCCC
+CAAGTTTGCCAACAAATACAGAGCAGGAAGGGAAGCAGGTCAGAGCAGGAGGCGCAGCTA
+TAGTGAGGCCACCATGCAAGGCACAGGGAGGGTGAGCTCCAAGTTTGAATGGAATGGGTC
+TGTCAGCCAAGCCCCCTGGCTCTGGGAAGATAGCAGTGAACAAGCCAGATGGCCCCTCAC
+CCTCCAGAGCCGTGAGTCCTGCAGACCAAACAGCGTGACAGGTCCTTTCCCTGTCCAGGA
+GGCCTCTGTGGGTGAGAGTTGGCTGCGGACAGGGCGTGAAGGCACTTGAGGGTGGGGAAG
+TGACTCTGACTGGGAGATGCTGAGGACAGGGAGGAAACCACCAGATAAGGGACACTGGGG
+AGGAGGGGTGGACCCCTCAGGGCCAAGCACATGGAGCCTCATCACAAAGGCAAGATGGTG
+GCCAAATTCAAGGTCGCTGCAAAAGGAATGGAGAAGAGAGAATAGATTTGGCATTTGGAG
+GAAATGGTGACAATCATGAGCACCTACCCGGGACTCTCCATGGGTGCTATCTCTACATAA
+ACTCATTCCACCCTCTGATTAATCCATTCTACATATGGGGAAACAAAGGCATGCGGTGTT
+TACGTCACTTGCCAAGATCTCAGGATTTGATCCAGGTGGCCTGGTTCCATGGTGCAGCCT
+CTCAGCCTGCATGGATGCCCCAGCTCAGAGCATGACTCTCAGGACAGGGGTCCCAGCAGC
+CCTCCCTCCCTGAGCAGCAGGGTGCCCGTGCTGCACCACTTCTGTCTAGGAATAGGACAT
+TCTGACACTTTCCTGCCTCTTCCGAGGTCTAGCACTTACTCTATGCCTGCCTGGGAAGGT
+GGCAAGCTGGCCTGAGGAACAGACTCTTCCATTTTTTAGGGAGCTCAAGGCCACAGATGC
+TCTGAGATCTGGAGTCCAGAGACAGGAGCGGAGGCTTCTCCTGGTGACCACTCTGCTTAA
+AAACTTCATCAGATCCGTAGTTTCAGAGCCCCCCTGAACCCCATCCCTTACCTCTACCAG
+TTGCAGGTGGGTCTCTGGGGTGGGGCTGCCCTCCCCACCAGCACCCCAAGGGCTAAAAGG
+TTGAGGGGAGAACACCATCATTTGTACAGGGGGATCCTGGAAGATGAGGCCTGAGAAAGC
+CCTGCGGGGCCCCTCACCTTCTCCCTAGCTGTGGCCAAGAGTGTCTGGCCTTGCCTGCCT
+CAGGACCAGCCCAAAGTGGAGGTGAGAGGTGAGCCCCAGCCCCCAGGGGAAGGGTGATGG
+TGGTCTTGGTCTCAGCATGGTTCTGGTAGAGGTGGGTTATTTTGAAGATGATGAACCTTA
+AGCCTCTTTCTGATCTTGCTTTAAATAAATACTTCTGAACAACAGCAACAACAGAATAGT
+GTTGATAGGAAAGCCCTCCACTCCACCAGAACCACGCGGCCTTCTCGTCCTCCCCTCCTC
+CACTTCCTTCCTAAGTCACTGCTCCATGAGCTCTTCCACAGGAGATTTACAAAATAGAAC
+ACAAACAATCCAGTTCCCGCCTCTCACTCTGAACTCCTCCCAAGACTCGTGGGGTGCGGC
+AGCCCCTGGGAACACCCAGCCCTTCAAGGTCAAACACAGCCCCCGCCCCTCACTCTGGGG
+TACCCTGCCAGAATAAGCCCCGACAGCCATGTGGAGCAGAGCCTTCTTTTTTGTAAGTGG
+AAGTTCCAGGCTGGCTTTTCAAATCCCCTTTTAACCTCAGTGCTGTATTTCAAAATTCAT
+TCCAGTTTTCCTGTAGTAATTAACAAAAATAAATATTTTAATTTCAATTAAAGTGAGGGT
+CTCGGAGAAGAAGCAGGAACTGAGTTTCCTGAGAGGCCCCGCTGAGGCTTTGTTGATATT
+TCTTCCTGCGACCTCTGCTCGGACCCTGGGAGCTCACAGGCCGTATCGCAGCTCTTATCT
+TTGGGGACCAGTTAAAGCATAACTGCGCCAGGCACAGAGTTGTCCTTTCAAATGTGCCGG
+CAGTGGGACGGAGACCCATGCGTCAAGTCTCCTCTAAGTTCACATGGGATTCTCTCCTTG
+TCCCAAAGCTGTCTCTGACTTAAAACCCTCCAACTGATTACCTGAATTCCAGAATATGTC
+CTGTGCTCTCTGCCCTTTCCCACGCCTTTGGTGAAGACCGGTGTTCTGAGGAAACAGACA
+CTGTGTAGAAATGGCTCAGGTCCTTTAAAGCCCTGGTGTGAGGAGTGGGGAAGGGCTGGG
+CCAGAGGTCAGCTGGATTTGTTAGATTGACAGAGTGACGCGGACTTCCCCAGAGGCACGG
+GACCAAGGTGCATGCTCACGCTGTCTCATGCTCTCACACATAATGTGTGTGTGTGTGTGT
+GTGTATATATATATACACATATACATATATATATATACACACATATGCATATATATAAAA
+CCCCAAGCAGCCTCTGGCTTAGCAGGTGCATTTCCCAGCAGGGCAATTAAAGCCATGGTC
+CCAGTAGTGGTCTTGGGGTCTCAGGGTATTTGGTCTGTGCAGCCACATGCTTCAGTCTCT
+GGACCCCAGGTCATCTAACGAGGTGGTCGTGTGGGGACTGGGATAGAAAAGGTGTCTGCA
+CGGACGTGTGTGAAAGGGCTGGCACATCGCCAGTGCTCAGCACTGTCAGCTGCTATCACC
+AGTCATTCAATCATTCATTCATTCAGTTGTTCATTCTTCAACAGGCCGTTTTAAAAATGT
+GCCCAGTATACCAAAATCTCCGCTAAGCATTTAAAGAGGCAGAATGAAAGTTAGCAGTGG
+TGGTGAAACGAAGCTGGGAATGTGCTCTGAGGGCCTCCTTGTGGGCTTAATGAATATGTA
+GAAACCACGCATTTTAAATAGAGAGGGAGAAAGGGAGAGGTTCCTGGTCCTCTGCATGGG
+GACTTGTGTGTGGCTCTTTACTGTAGGCCTGTGCCACTCCTGCTCAACAGCTACCACAGA
+GGACGCCTTCAACAAATGTGAAGAACGAACAAAAGGTACAAATGTGAAGAACGAACAGGG
+TAGAAAGAAAGGAGAAAGCAAGGGTGAGGGTGAGAAATCAAGGGACAGAGAAGAGAGAAG
+AGGAGATAGCCTGGGAGTTCACACAGCCAAGAAGGTAGACACTCAGTTGAACCAGCAAGA
+GGCTGAGCCTAACTCTCCCTTTCGAATGGGCAGGAGTTCATGATATTTAATAAACAGAGG
+CCTTGCTCTGTAAGAGACAGGGTACCAGGCAGAGAGCAAGTCAGCATCGCAGGAGTCAAA
+CGAGGCAGACAGCGGGGGCAGGGAGCTTGCCTCTGAAGGAGACCCAGGCTGCCAGAGTAG
+CAGGGAGTCTGGGCCAGTCCTCTTTTGGGAAGCGCTTCCTCGGCTTCTGCCCCCCCTCTC
+CTCTCCCTTTCCACCCACCATCCTGACATAATACTTCCTAATCTGGAAGTGTTGTCCAGA
+GAAGAACCTGCTCATTTCCTCTTAAGTAGGCAGGGAAGCACTAACGTCCAGCAGCATCGG
+AAACCCGTAGGAGCGCTCTCGGCAGTGCAGGGTGAGGGGACAGTCCATGTAGTCATGAGA
+CGTGGGTGTCAGGCAAGCGTCTCTTTTCCAAAAGAGAAAAACATTAAAGGCCTCACAAAC
+GGCGCCCAAAGACTAATTCTGCATAGCATCTTTGCGAGACCCTAGGTTCTTATGATGACT
+GGTTTTGCCTGAGAAAGAAAAAATTTTAATTTTGCTCTGACATGCCAATTCAACAAATCA
+TTTTCACATAATATTCATGCAAAAAAAAAACAATTTGCCAGAAAACTTGGGAATCCATCC
+ACATCTACAGCTTTTCCCTGCAGTCACACTACAGTGGGATCCCTCCATACAGGAGCGGCA
+GAGTGGAGCAGGCTAGAGATGCCTGTTTGTTTCTGTTTGCTGCACCGCAGCAAGCATTTC
+TGTCGTGCCCACTCTGTACTAGAAAGTACATGAACATCAGCCATAAAGGGAACTAGAAAG
+GTGGCCCACCCTCTTGGTGGAGAGAGAAGAGAGTGTGGTAGAAACAATAATAAGAAGTCT
+GCAGAACTTGACCCCTCCCAGCCTCTCCCACCTGCCAGCCTGGCCCTTGCAGAGAGATGC
+AGGCTGCCATTCTTAGGCCAAAGCCTGGGACAGTTGGGCTCAGCAAGGTAGGCATCCGTC
+AAGCAAGGAGGAGCAGGGGTCAGCAGTGACCCCAGCAGCCAGCAGGGAGAAAGGTGCATG
+TGACAAGGACACCAGAGGCCGTGGGTCAGGATCAGCCAGGGTCAGGGTAGCATTTCTAGG
+AATTCACTCTGTTGGGCGCTGTGCTGGCTGCTTCTCACATATTATTCCTTTCTTACTCTC
+AGAGCAGAGATTTCAATTGCAGCGAGATTGTGGAGGCAGCCAGGGAGGTGGGGAGGGTGG
+TGTCTTCTAAAAGCATTTTCAGTATCCATGTGGTTTCAGTAATAATAATAATAATAAACC
+AGTGAAAAGTAAAACAGGACAAAAATCTTCATAGGCAGTGAACCATATCAGAGAGTCCAA
+GAAAGCACAATGAGAGTGTGGCTTAAAAACCCTGAACGACATTCCTTTGCACCAGCTTGG
+TGAGGAGGGCATGGTCCCCGCCACCCCCCACCCCCACTTTGCAGATAAACCACATGCAGG
+AAGGTCAGCCTGGCAAGTCCAGTAAGTTCAAGCCCAGGTCTCAACTGGGCAGCAGAGCTC
+CTGCTCTTCTTTGTCCTCATATACGAGCACCTCTGGACTTAAAACTTGAGGAACTGGATG
+GAGAAAAGTTAATGGTCAGCAGCGGGTTACATCTTCTTTCATGCGCCTTTCCATTCTTTG
+GATCAGTAGTCACTAACGTTCGCCAGCCATAAGTCCTCGACGTGGAGAGGCTCAGAGCCT
+GGCATGAACATGACCCTGAATTCGGATGCAGAGCTTCTTCCCATGATGATCTGTCCCTCA
+CAGCAGGGTCTTCTCTGTTTCAGGGCATGAACTACTTGGAGGACCGTCGCTTGGTGCACC
+GCGACCTGGCAGCCAGGAACGTACTGGTGAAAACACCGCAGCATGTCAAGATCACAGATT
+TTGGGCTGGCCAAACTGCTGGGTGCGGAAGAGAAAGAATACCATGCAGAAGGAGGCAAAG
+TAAGGAGGTGGCTTTAGGTCAGCCAGCATTTTCCTGACACCAGGGACCAGGCTGCCTTCC
+CACTAGCTGTATTGTTTAACACATGCAGGGGAGGATGCTCTCCAGACATTCTGGGTGAGC
+TCGCAGCAGCTGCTGCTGGCAGCTGGGTCCAGCCAGGGTCTCCTGGTAGTGTGAGCCAGA
+GCTGCTTTGGGAACAGTACTTGCTGGGACAGTGAATGAGGATGTTATCCCCAGGTGATCA
+TTAGCAAATGTTAGGTTTCAGTCTCTCCCTGCAGGATATATAAGTCCCCTTCAATAGCGC
+AATTGGGAAAGGTCACAGCTGCCTTGGTGGTCCACTGCTGTCAAGGACACCTAAGGAACA
+GGAAAGGCCCCATGCGGACCCGAGCTCCCAGGGCTGTCTGTGGCTCGTGGCTGGGACAGG
+CAGCAATGGAGTCCTTCTCTCCCTTCACTGGCTCGGTTTCTCTTAGGGACCCTCACAGCA
+CTAAGGGGTGCGCGTCCCCTGTCAGGCCCTCGAATGCCCTCCCACAGCCAGGCCCCTCTG
+AGGTTTCACTCTGGCCTGCTTGGCTCCTAGCAGCCACCAACCCATGATGCTGGGCCCTGA
+AAACACACGCAGACCTGGATGAGTGAGGCCACTGGGCACAACCAGGGCTCCCAGCTCACC
+AGAGCAGCCTGGGACACAGAGGGTGCTCAGAAACCTACCAGAGCAGCCCTGAACTCCGTC
+AGACTGAAATCCCCTGTTGCCGGGAGGAGGCGCCGGGCCTGGGGGACGGGTCCTGGGGTG
+ATCTGGCTCGTCTGTGTGTGTCACTCGTAATTAGGTCCAGAGTGAGTTAACTTTTTCCAA
+CAGAGGGAAACTAATAGTTGTCTCACTGCCTCATCTCTCACCATCCCAAGGTGCCTATCA
+AGTGGATGGCATTGGAATCAATTTTACACAGAATCTATACCCACCAGAGTGATGTCTGGA
+GCTACGGTGAGTCATAATCCTGATGCTAATGAGTTTGTACTGAGGCCAAGCTGGCTTTTA
+TTGTTAGTTAATTTACATTATATCCTCTGACATGCAAGTATTTTCTTTCGAGATAATGAC
+TAATGATAATGTAATCATTGCTGTCTATCTATTGTACTGAGAAAACACGGCAGAGGAAAT
+CGAGTCCAGCTGCCGTCCAAAAGTCACTGGAGATTGCAATGAGCTCGTCTGGCAGGGTGG
+GGGGTATGGGAGGGAAAGAGCTTAGGAAACGGCTCTCCCTGCAAAGTCCAACCAAACTTT
+AACGTTAACCAAACCATTAATGTTGCCATGAATTTGAAGTGAACCAGAGGGAGGTGGCAG
+AAGAAGCTTAATGGGGAATAGTTCCGGTAGAGAAATGAGGCTTAAGATGAACTACCCTGG
+CCCTTATGTGTCAGAGAGAACGGCTTGACAAACACACACTGAGGATGTCTGCAGGGATAA
+AAGAAGAAAGGGAGATGACCCTTGCTTCTCGCTCTCGGGAGGACCATCTGGTCCGGCCCT
+GGGGATTCTCTGTTTCCTCTTCTGAATCCCAGTGTTGCCCAGCACTGGCCTGTACCCATC
+CTCACGAGGGCCGCTCTCCTCACCCGGCCCTAGGTCCCTGCCCTGTCCTGAGCCTACAGG
+GGCCTCCCATGTTGAGAAAGTGTTGCTGACACATTGTCTCTGACCGCTGTGCCAGGCATT
+TTCTGCTGAATTACCGCACTTGGTCCTTGAATTTCACCCAGCAACTTACTGAAAGGCTGG
+AACCCATGAACCTACCCCTTCACTGAGGAAAATAAGTTACCCCAGCCATCTACAGCGACA
+GGAGCAAGGGAGGAGTCGCCTCACCTCTCTAGAAATGTGTATTTGAGGAGAACACTATTG
+AAATGAATTTCCAAGAATAATCTAGTCAGTATTACAAAAGCAAAATTATTTGGGATATCG
+TCCTTTTTTACTTAGTATTTTTTCTTTTTCCTATAGCATTATTAACTTTCTGATTTTCCA
+AATACATACACATTTTTAAATTTCCTGAGTCTTTATCTCTTCTGTTAAAATGTAAGATTT
+ATGATACAAAGGCAGAGATTTGTGTCCATGAATAAGTGAAGTTTGGTGTGCACCTGTGAG
+CTGAGCCACCTCAATTAATGGAACAGATAAGGAAATAAAGGTCTGCTGATGCATTGTTAT
+TTACAGCCATTTTCAGAATGTATCTCCTCTCCACGAGGGAACTGCAGGGTCCTGCCCCAA
+GCCATTTATTTTGTCCTCAAGCAGCCCGCCCCTCCCACTCCAGGCACAGCCCGGTCTCCT
+GCTGGTCTCCCCTCTTCCCACTTGCTCCCCCTCATCTATGCTCCAGACAGAGGCCACATA
+TATTTTTTAACTTTTTTTTTTTTTTTTTTGAGACAGAGTCTTGCCCTGTCACCCAGGCTG
+GAGTGCAGTGGTGCAGTCTCGGCTCACTGCAACCTCCACCTCCCGGGTTCAAGTGATTCT
+CCTGCCTCAGCCTCCTGAGTAGCTGGGATTACAGGCGCACACCACCATGCCCAGCTAATT
+TTTTGTATCTCTAGTTGAGACAGGGTTTCACTATGTTGGCCAGGCTGGTCTCGAACTCCT
+GACCTCATGATCTGCCCGCCTCGGCCTCCCAAAGTGCATATTTTTTAACTTTATCAGACT
+TTTCATTCTCTGCTCAACATCTTTCTTTGGTCCTCCAGGTATGTTCAGATAAAACCTGAG
+CACCTGGCCATGACTGATGGGTTGCTGGGCCATCTGGCCCTGGCAACTCTCCCGTCCACC
+AGGTCCCCCTCCCGTCACGCTCCAGGCATAGCCTGTGTGTGCCAGCGCAATGCCCACACT
+CCATGCACAAGTGGAAGCCCTCTCAAAGTCAGTGGCTTAGTGCCTTGATGTGGTCACACC
+CATTCTCAGGAAGTCCGTTCCCACTGAAAACATTGTGTGTTTTCAACATCATTGAGGCTG
+CCACGGCAGATTATAATCACTGGCCTAGGCAGCCCACTGGAACTACCAGACCATGAGCCT
+GAATTTTTTGTTTAAAAATCATATCCTGTTTTCTCTACTCTCTAGTCTCTAGTCAAGGTG
+AATTATTCAATTTAATAAATTAGGGGCCTAGTGTGTTGTACCAAGGAGCTAAAAAGAGAG
+AACTCGCAACACCTTCCAGCCCATTCTCCACCTAACACTGGCTATACTGGCTCTCCTCTC
+TCTCGCTGTTTGTTCCAAAATCTAATAACCTGTCTTCCCACTAGAATTCATCATACATGT
+TTAAAAACCTAGTTAAATAGTAGTTAAACTGACTGCATAGATCTGGAAATGAGACAGTCT
+TTCTTTTACAAATCCATATAGACTATGAGTTGGGGGCAGGGGATGACACAAGAATCTATT
+TTCTTGCCCCCAAACCATTGCTTTCCTTCCAATGTTAAGCTTGTATTCTGTGTATTAATT
+CAGGTGGTTCCGTTTGGGAATGGCCTCTGTTACCCAGAGATGGGAGGGCCATCAGAACTC
+GGGGTTGTCTGAAAAAACACTGGTTCTAAAATTATCACTGCTTTCACTTGTTTTTAACCA
+TCATAGTTGTTTGATTTTGAAGGAAAAACATGAGGGTTTTTATTCTATGCTTGTTATATC
+TATATTGTGGTTTCGTATTTTTTAGATTTTAGTACCTGACATTTTTTTAACTTTTATTTT
+AGGTTCAGGGGTACATGTGCAGGTTTGTTATATAGGTAAATTTGTGTCATGGGGGTTTGT
+TACACAGATTATTTTATCACCCAGGGATTAAGCCTAGTACCCATTAGTTATTTTTCCTGA
+TCCTCTCCCTCCTCCCATCCTCCACCGTCCTATAGACCCCAGTGTGTGTTGTTCCCCTCT
+AAGTGTCCATGTGTTCTCATCATTTAGCTCCCACTTATAAGTAAGAACATGCGGTATTTG
+ATTTTCTGTTCCTGCATTAGTTTGCTAGGGATGATGGCCTCTAGCTCCATCCATGTTCTT
+GCAAAGTACATGATCTCATTCTCTTTTGTGGCTGCCTAGTGTTCCATGGTGTATATGTAC
+CACATTTTCTTTATCCAGTCTGTCATTGATGGGCATTTAGGTTGATTCCATGTCTTTGCT
+ATTGTAAATAGTGCTGCAGTGAAAATACGCATGCATATGTCTTTATGGTAGAATGATTTA
+TATTCCTTTGAGTAATGGGATTGCCGGGTCAAATGGTAGTTCTGTTTTTAGCTATCTGAG
+AAATTGCCACACTCTTTTCCACAATAATTGAACTAATTTACATTCCCACCAACAGTGTAA
+AAGCATTCCTTTTTCTCCACAACCTCACCAGCATGTGTTGGGATTTTTTTTTTTTTTTAC
+TTTTCAATAATAGCCATCTGACTGGTATGAGATGGTATCTCAGTGTGGTTTTGATTTTTA
+TTTCTTTAATGATCAGTGATGTTAAGCTCTTTTTCATATACTTGTTGGCTGCATGTATGT
+CTTCTTCTAAAAAGTGTCTGCTCATGTCCTTTGCCCACTTTTTAATGGGATTGTTTAATT
+TTTTCTTGTGAATTTACTTAAGTTCCTTATAGATGCTGGTTATTAGACCCTTCTCAGATT
+TGTAGCTTGCAAAAATGTTCACCCATTCTGTGGGTTGTCTTCACTCTGATGATAGTTTCT
+TTTGCTGTGCAGAAGATCTTCAGTTTAGTTAGATCCCATTTGTCAATTTTTGCTTTTGTT
+GCAATTGCTTGATGTGTTTTCATCATGAAATCTTAGCCCATTCCTATATCCAGAATGGTA
+TTACCTAGGTTGTCTTCCAGGGTTTTTATAGTTTGGGGTTTTACATTTAAGTCTTTAATC
+CATGTTGAGTTTATTTTTGTGTATGGTGTAAGGAAGGAGTCCAGTTTCAATCTTCTTCAT
+GGCTAGCTAGTCATCATTTATTGAGTAGGGAGTCCTTTATTCATTGCTTTTTTTTTTTTG
+TCAACTTTGTCAACGATCACATGGTTGTAGGTGTGCAGCCTTATTTCTGGGCTCTCTATT
+CTGTTTCATTGGTCTGTATGTCTGTTTCTGTACTAGTACCATGCTGTTTTGGTTACTGTA
+TCCCTGTAGTTTAAAGTCAGGTAGCATCATGCTTCCAGCTTTGTTCTTTTTGCTTAGGAT
+TGCCTTGGCAATTCAGGCTCTTTTTTGGTTCCATGTGAATTTTTAAATTGTATTTTCTAG
+TTCTGTGAAGAATCTCATTGGTAGTGTGATAGGAGTAACATTGAATCTATAAAATACTTT
+GGGCAGTATAGTCATTTTAATGATATTGATTCTTTCTATCCATGAGCATGGAATGTTTTT
+CCATTTGTTTGTGTCATCTCTGATTTCTTTAAGCAGTGTTTTGTGGTTCTTATTGTAGAG
+ATCTTTCACTTTCCTGGTTTACTGTATTTCTAGGTATTTTATTCTTTTTGTGGCAATTGT
+GAATTGAATTGCATTCCTGATTTGGTTCTCAGCTTGACTGTTGTTGGCATATTGGAATGC
+TAATTATTTTTGTACATTGATTTTGTACAACTGAGTCTTCACTGAAGTTGTTTATCAGCT
+TAAGGGGTTTTGGGTCAAGACTATGGGGTTTTCTAGATATAGGATCATGTCATCTGCAAA
+CAGAGATAGCTGTTTTCCTCTCTTCCTGTTTGGATGTCCATTATTTCTTTCTCTCACCTG
+ATTTATCTGGCCAGGACTTCCAATACTATGTTAAATAGGAGTGTTGAGAGAGGGAATCCT
+TGTCTTGTGTCAATTTTCAAGGGGAATGTTTTCAACTTTTGCCCATTCAATATGATGTTG
+GCTGTGGGTTTGCCATAGATGGCTAATATGTTGAGGTTTGTTCTTTAAATACCTAGTTTA
+TTGAGAATTTTAAACATGTTGAATTTTATTGAGAGCCTTTTCTGCATCTATTGAGATGAT
+CATGTGGCTTTTGTCCTTAGTTCTGTTTGTGTGGTGAATCACATTTATTGATTTGCATAT
+GTTGAACCAATCTTGCATCCCAGGGATGAAGCCGACTTGATTGTGGTGGCTTAAGCTTTT
+TGATGTGCTGCTGGATTCGATTTGCCAGTATTTTGTTGAGGATTTTTATGTCTATGTTCA
+TCAGAGATATTGGCCTGAAGTTTTCTTTTTTTGTTGTATCTCTGCCAAGCTTTGGTATCA
+GGATGACATTGGCCTCATAGAATGAGTTAAGGAAGAGTCCCTCCTTCTCAATTTTTTTGG
+AATAGTTTCAGTAGGAATGGTACCAGCTTTTTTTGTACATCTTGTAGAATTTGGCTATGA
+ATCCATCTAGTCTTAGGCTTTGTTTTGGTTGGTAGGCTATTTATTACTGATTCAATTTTG
+GAGCTCATTATTGGTCTGTTCAGGGATTCAGTTTCTTCCTGAGGTTTTTATTTTTATCAA
+ATGGAACTTAAGCTTTTTCATTTCCAATTTTTTTATGATCTAAAAATGTGCAGTTTACAG
+CCCTGTTCAGAATCTGCATCTTCCTCATTCTGCAGATACAGGTCCCTCAGAGCAGGTGAC
+TGAGTGTGTATCCTGTCTGGAGCATAATACTTATGCTAGTAGAGTTACTGTTGTCTTTAT
+TGTTAATTACCAAAGTTTACCACTTATCAGTCACTTACTACTTGCTGGGCATTGCACTAA
+GCATTTCAGTTGTATTATCTTGTTGGGTCCTTACAGCAATCCTGTGAAACAGATACTGCT
+ATTACCCCACTTTATAGAGAGGTAGACTGAGGCTTCCAGCATTGAAGCAAATTGCCCAAG
+ACTACAGAAATGTAGGTTTCTAAACATCAAGAAACAGTAACCAGTAATGATGACTAAAGC
+AAGGGATTGTGATTGTTCATTCATGATCCCACTGCCTTCTTTTCTTGCTTCATCCTCTCA
+GGGGTGACTGTTTGGGAGTTGATGACCTTTGGATCCAAGCCATATGACGGAATCCCTGCC
+AGCGAGATCTCCTCCATCCTGGAGAAAGGAGAACGCCTCCCTCAGCCACCCATATGTACC
+ATCGATGTCTACATGATCATGGTCAAGTGTGAGTGACTGGTGGGTCTGTCCACACTGCCT
+AGCTGAGCCTTGGTGGCTGCTCTTAGCCAAACAGCTGAGGCCTTTGCATCCCTGGAGAAA
+TGTCATCACATTACTTAAGGCAGGCACACAAATCCAGAAACATCTGTAAATACCCCTTCA
+AGCATTCTTTTAAAGACACTTCTTGACTCATTGGGCAGTATGACCTGACATTTGCCCATG
+TTTGCAAGCAAATAAATAAAACTAAAGTCTTCCGCAAGCCATTACACCAAAATATTCTAT
+TCGCTGAGTTACTCAATGAAATACCGAGTTGCCCTATATTTTGAAGCCTGTTACCAGAGA
+GACTGAATGTTTTTAAATGCATGGCAGTGAGTAACAACATAAGGCTAATAGAGTCAACAT
+TTCTGCTTTGACTTAAACCTTTTAAACCAGTGGATTTATGTGAAGTCTCTGCAGTGTGGC
+ATTTAAACATTTCAATCTAAATAAGAGTGTGTAATTTGATTGATGCTATTATTCTACCAG
+ATTCACGAGTGCAGTGGGCTCTGGAGGTAGCATTACATGCATGGGATGAGCATTTGCAAA
+AGAAAGTTGTATAGGGAATATGACAGAGCCAAGTTAATGTAAATATTAATGCCTTTCTGA
+ACTCTAGGCCACAGAGTTGATCTTTTTTAACTTCCTTGGTTTGGGCTAAGGAAGCTGTGA
+TCCAGAGAAGCCACGTGATTTGTCTAAGGTCACATAGCAGTCTGGCCTAAAATAGCTTGA
+TATGCTGTGGATGGAAAATAAATGTGATCCCTCAAGAGGCATGAGGATTTCCAGGCAGTA
+GCCATACCTCCAAATTGTTTAATCTGGATTTAGATTGTTGGGTAGTCACATGCAGCAGCA
+CAGTTAACAGTGTGTCCTCCTGTGGAAGTTGCCAGCACAGCCAGCCCTCTCACTTGCATG
+CATGCCCACCAGCCTTCTCACTTGCATGCATGCCCACTGGGTATGTGCTGTACTGGAGAC
+GCCGGGGGTAGGGGCCCAGTCCCAACCCCAAATTCTTTAAAGCCTATTTTTCTAAGTTGC
+ATCTGGTTTCCTACCTGAAGGAATGCTAAGGGTGGATGTTGAGTGAGGACCTTGGTGCAG
+GGCACCCTGCAGTCAGGATAGTTCATGGAGAGCAATTGTACAGACCCACACTGCTCCATC
+CCCTCAGGCGTAACACAGGATGCTGACCCCAGGAAGAGTGGGCGTAGAAAAACTAGAGGG
+CATTATTGTTATTCTGATTCAAATGTACAGTGCTGGCATGGTCTTTAAACAGTAACCAGT
+ACTAGCTGGCCAAGACAGAAAAGTCTACCACAAAGACTTGGTTCTTTCATCACTTATTTG
+ACTGGAAGTGTCGCATCACCAATGCCTTCTTTAAGCAATGCCATCTTTATCATTTCTTCC
+AGTGTTCTAATTGCACTGTTTTTTCTCATTCCTTCCCCAGGCTGGATGATAGACGCAGAT
+AGTCGCCCAAAGTTCCGTGAGTTGATCATCGAATTCTCCAAAATGGCCCGAGACCCCCAG
+CGCTACCTTGTCATTCAGGTACAAATTGCAGTCTGTGCTTCCATTGGGAAGAGTCCCTCT
+AATGAGCATCTCATGTCACTGTGTTCTGTCACATGCCAGCCTGGCCTCCCTGTGTCCCAG
+ATCGCATTATTAAACCCTCCAGCGCATTAGAGCAAGCCTCAGTAAGGCGCAGGCCACATC
+GTGAACTAAGCAGCATCCGTGAGTGGGGCCCACCCAACTCCATCTCCCCCTCCCCGTCTG
+AACTCTCCTCTGGTGCTCGTCCTCACTGTCCGGCTAGCCAAAGCCTCAGCTGGGTCTAAG
+AGAGAAGCATGGTCTATTGGGCTTTGGTGTCAGGCAGACGTGGCTTCACACCCCTGACTC
+TCCACTTCTTCGCATCACCCAGGCAGCCGATCCACCTATCTCCTTCCATAACACAGGAAT
+ACCAAAACCAAGCTCACAGGATTGTCTCAAAGATTCAATAAAATATGTTGCAAAATACGC
+TCCCTAACACCTCACAGCAAGGTGCACACTCGATGAATGCTGCAGCTTCTTCCCTTTCTG
+TTTCCTCAGAAGCTATTTGAATCTCATGTAGGGGCTTTCAAGCATCAAAGGATGGTTCAT
+GTTTTATTTTAAGGCACCCACATCATGTCATGAGGGGAGGCAGCTATAATTTAGAGAACC
+AAGGGGGATTTCATTATAACAAAATTGGCAAACACACAGGCACCTGCTGGCAATAGACCC
+CTGCTCCTATAGCCAAGAAGTGGAATAGCATCTCTACGGGCCATTCTAATAGCCTCAAAA
+TCTCTGCACCAGGGGGATGAAAGAATGCATTTGCCAAGTCCTACAGACTCCAACTTCTAC
+CGTGCCCTGATGGATGAAGAAGACATGGACGACGTGGTGGATGCCGACGAGTACCTCATC
+CCACAGCAGGGCTTCTTCAGCAGCCCCTCCACGTCACGGACTCCCCTCCTGAGCTCTCTG
+GTATGAAATCTCTGTCTCTCTCTCTCTCTCAAGCTGTGTCTACTCATTTGAACAAATTGA
+ATTTTAGGGAAAATAACCATCTAGTGAAACTCACATGGATATGAAGTCAATTTTAACCAA
+ATGGTAAAATCAAAATCAAAATAAATTAAGTGTATTAATTATTTTGTTGCATTGCAACAA
+CTTGATTGTAAGCCTTTTAGGTCCACTATGGAATGTAATTAAATCAAAACTAAACCTAGT
+TGCTCTAAAACTAACGATTAAGACAAAAATTAAACACCTTCACAATATACCCTCCATGAG
+GCACACCACCTGCATTCAGGAAAAGTGGATGAGATGTGGTACAAGCATTCCATGGGCAAC
+TTCTCTGTTTCTTTTTCAGAGTGCAACCAGCAACAATTCCACCGTGGCTTGCATTGATAG
+AAATGGGGTATGTATGAACACCTTATAAGCCAGAATTTACAGCTCTCCACTATGGCTCTA
+TTTTACATGGAAAATGCCTTAACCTAAATAATTTTAACCCAGATAATCTTGAGTTTTCTT
+CCTGTGTGGGTTTTTCCCTGCACGGCTGTCACGCCTCACAGTGCCGTTCAAAGCGTGACT
+CCTGGACCAGTAGTAGCATCGCCTGGCCTTGTTAGAAACGCCATTTTTCAGGCCACTGCC
+CCAGTTTGACCAAATCAGGACCTCTGGGGGTGGCACCCAGTAGTCTATGTTTGAGCCACT
+TTCCAGGTGATGCTGATGTCTGTTGAAGTGTGAGGCCGTGGTCTAGACCGCACTGTGCCA
+TGCAGAAACCACTAGCCACATGTGGCTACTTCAACTTAAATGTTAATGAGTTAAAATGAA
+ATAAAATATAAAATTCAGTTTCTCACACATGTGAAGTGTCCAGTAGCCACACGTGGCTAG
+TGGTGACCGTATTGAAGAGCACCGCTCATAGCACACCTCCCTCACTGCGGAAAGTTCTGC
+TGTACAGCACCCAGCACAGCCCTGCTGCCCACCCTGCAGCCTGTGGCCCAGTAGCACCAG
+CACCCACCAGGGTGCAGACTCTCAGGCCTGCCCAACCTACTAATCAGAACCAGCATCTCA
+AGGAGATCTCGGGTGATTTTTGCAAACACTGAAGTTGGGGCAGCCCTGACCGGAGTAACC
+TTCCCTCATTTCCTCCTGCAGCTGCAAAGCTGTCCCATCAAGGAAGACAGCTTCTTGCAG
+CGATACAGCTCAGACCCCACAGGCGCCTTGACTGAGGACAGCATAGACGACACCTTCCTC
+CCAGTGCCTGGTGAGTGGCTTGTCTGGAAACAGTCCTGCTCCTCAACCTCCTCGACCCAC
+TCAGCAGCAGCCAGTCTCCAGTGTCCAAGCCAGGTGCTCCCTCCAGCATCTCCAGAGGGG
+GAAACAGTGGCAGATTTGCAGACACAGTGAAGGGCGTAAGGAGCAGATAAACACATGACC
+GAGCCTGCACAAGCTCTTTGTTGTGTCTGGTTGTTTGCTGTACCTCTGTTGTAAGAATGA
+ATCTGCAAAATTTCTAGCTTATGAAGCAAATCACGGACATACACATCTGTGTGTGTGAGT
+GTTCATGATGTGTGTACATCTGTGTATGTGTGTGTGTGTATGTGTGTGTTTGTGACAGAT
+TTGATCCCTGTTCTCTCTGCTGGCTCTATCTTGACCTGTGAAACGTATATTTAACTAATT
+AAATATTAGTTAATATTAATAAATTTTAAGCTTTATCCAGA
diff --git a/test/norm.right-align.gff b/test/norm.right-align.gff
new file mode 100644
index 000000000..4b2b426a0
--- /dev/null
+++ b/test/norm.right-align.gff
@@ -0,0 +1,4 @@
+7	ensembl_havana	gene	100	29201	.	+	.	ID=gene:ENSG00000146648;Name=EGFR;biotype=protein_coding;description=epidermal growth factor receptor [Source:HGNC Symbol%3BAcc:3236];gene_id=ENSG00000146648;logic_name=ensembl_havana_gene;version=11
+7	ensembl_havana	mRNA	100	29201	.	+	.	ID=transcript:ENST00000455089;Parent=gene:ENSG00000146648;Name=EGFR-004;biotype=protein_coding;havana_transcript=OTTHUMT00000343056;havana_version=1;tag=basic;transcript_id=ENST00000455089;version=1
+7	havana	gene	5875	15059	.	-	.	ID=gene:ENSG00000224057;Name=EGFR-AS1;biotype=antisense;description=EGFR antisense RNA 1 [Source:HGNC Symbol%3BAcc:40207];gene_id=ENSG00000224057;logic_name=havana;version=1
+7	havana	transcript	5875	15059	.	-	.	ID=transcript:ENST00000442411;Parent=gene:ENSG00000224057;Name=EGFR-AS1-001;biotype=antisense;havana_transcript=OTTHUMT00000343091;havana_version=1;tag=basic;transcript_id=ENST00000442411;version=1
diff --git a/test/norm.right-align.vcf b/test/norm.right-align.vcf
new file mode 100644
index 000000000..d8dd93f6c
--- /dev/null
+++ b/test/norm.right-align.vcf
@@ -0,0 +1,8 @@
+##fileformat=VCFv4.2
+##contig=<ID=7,length=249250621>
+##INFO=<ID=type,Number=.,Type=String,Description="">
+##INFO=<ID=EXP,Number=1,Type=String,Description="Expected consequence">
+##INFO=<ID=EXPL,Number=1,Type=String,Description="Expected consequence with bt/csq -l">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+7	897	.	GGAATTAAGA	G	.	.	.
+7	910	.	G	C	.	.	.
diff --git a/test/norm.symbolic.1.out b/test/norm.symbolic.1.out
new file mode 100644
index 000000000..cec427798
--- /dev/null
+++ b/test/norm.symbolic.1.out
@@ -0,0 +1,11 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="All filters passed">
+##contig=<ID=20,length=2147483647>
+##INFO=<ID=END,Number=1,Type=Integer,Description="End position">
+##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
+##INFO=<ID=ORI,Number=1,Type=String,Description="Original variant. Format: CHR|POS|REF|ALT|USED_ALT_IDX">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+20	15	.	TAC	T	.	.	ORI=20|24|ACA|A
+20	15	.	TAC	<DEL>	.	.	END=17;SVTYPE=DEL;ORI=20|24|A|<DEL>
+20	93	.	CAAA	C	.	.	ORI=20|98|AAAA|A
+20	93	.	CAAA	<DEL>	.	.	END=96;SVTYPE=DEL;ORI=20|98|A|<DEL>
diff --git a/test/norm.symbolic.fa b/test/norm.symbolic.fa
new file mode 100644
index 000000000..12bf50588
--- /dev/null
+++ b/test/norm.symbolic.fa
@@ -0,0 +1,3 @@
+>20
+AGGATGGGGCTCATTACACACACACACCTTGTCTCCAGAATCACTGGTGAGGAAGGGGAG
+TGCAGCCTGGGAGACAGAGCAAGACTCCATCTCAAAAAAAAAAAAAAAAAAAAAGGCCAT
diff --git a/test/norm.symbolic.vcf b/test/norm.symbolic.vcf
new file mode 100644
index 000000000..2dc375186
--- /dev/null
+++ b/test/norm.symbolic.vcf
@@ -0,0 +1,9 @@
+##fileformat=VCFv4.2
+##contig=<ID=20,length=2147483647>
+##INFO=<ID=END,Number=1,Type=Integer,Description="End position">
+##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+20	24	.	ACA	A	.	.	.
+20	24	.	A	<DEL>	.	.	END=26;SVTYPE=DEL
+20	98	.	AAAA	A	.	.	.
+20	98	.	A	<DEL>	.	.	END=101;SVTYPE=DEL
diff --git a/test/query.95.out b/test/query.95.out
new file mode 100644
index 000000000..ec5476a91
--- /dev/null
+++ b/test/query.95.out
@@ -0,0 +1,3 @@
+#[1]CHROM [2]POS  [3]SAMPLE [4]DP [5]GT
+4 3258449  C 1 1/1
+4 3258449  D 0 0/0
diff --git a/test/query.96.out b/test/query.96.out
new file mode 100644
index 000000000..585c684bc
--- /dev/null
+++ b/test/query.96.out
@@ -0,0 +1,2 @@
+#[1]CHROM [2]POS  [3]SAMPLE [4]DP [5]GT[6]CHROM [7]POS  [8]SAMPLE [9]DP [10]GT
+4 3258449  C 1 1/14 3258449  D 0 0/0
diff --git a/test/query.97.out b/test/query.97.out
new file mode 100644
index 000000000..c409c224f
--- /dev/null
+++ b/test/query.97.out
@@ -0,0 +1,2 @@
+#[1]CHROM [2]POS [3]SAMPLE [4]DP [5]GT [6]SAMPLE [7]DP [8]GT
+4 3258449 C 1 1/1 D 0 0/0
diff --git a/test/query.98.out b/test/query.98.out
new file mode 100644
index 000000000..b53f5968c
--- /dev/null
+++ b/test/query.98.out
@@ -0,0 +1,2 @@
+#[1]CHROM [2]POS [3]SAMPLE [4]SAMPLE [5]DP [6]DP [7]GT [8]GT
+4 3258449 C D 1 0 1/1 0/0
diff --git a/test/query.header.vcf b/test/query.header.vcf
new file mode 100644
index 000000000..5328daad1
--- /dev/null
+++ b/test/query.header.vcf
@@ -0,0 +1,7 @@
+##fileformat=VCFv4.2
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##contig=<ID=4,assembly=b37,length=191154276>
+##reference=ref.fa
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	C	D
+4	3258449	.	A	C	.	.	.	GT:DP	1/1:1	0/0:0
diff --git a/test/reheader.3.fai b/test/reheader.3.fai
new file mode 100644
index 000000000..25234c1af
--- /dev/null
+++ b/test/reheader.3.fai
@@ -0,0 +1,12 @@
+1	2364278061	6	2364278061	2364278062
+2	2317450362	7159592787	2317450362	2317450363
+3	2291775479	9477043156	2291775479	2291775480
+4	2192534405	11768818642	2192534405	2192534406
+5	2148190925	13961353054	2148190925	2148190926
+6	2107674557	16109543986	2107674557	2107674558
+7	2082167746	18217218550	2082167746	2082167747
+8	2081484518	20299386303	2081484518	2081484519
+9	2024734096	22380870828	2024734096	2024734097
+10	1752849333	2364278075	1752849333	1752849334
+11	1650012615	4117127416	1650012615	1650012616
+12	1392452741	5767140039	1392452741	1392452742
diff --git a/test/reheader.3.vcf b/test/reheader.3.vcf
new file mode 100644
index 000000000..7f7a4ce83
--- /dev/null
+++ b/test/reheader.3.vcf
@@ -0,0 +1,4 @@
+##fileformat=VCFv4.3
+##contig=<ID=1,length=1>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+1	22	.	A	G	.	.	.
diff --git a/test/reheader.6.out b/test/reheader.6.out
new file mode 100644
index 000000000..9cb406cd7
--- /dev/null
+++ b/test/reheader.6.out
@@ -0,0 +1,16 @@
+##fileformat=VCFv4.3
+##FILTER=<ID=PASS,Description="All filters passed">
+##contig=<ID=1,length=2364278061>
+##contig=<ID=2,length=2317450362>
+##contig=<ID=3,length=2291775479>
+##contig=<ID=4,length=2192534405>
+##contig=<ID=5,length=2148190925>
+##contig=<ID=6,length=2107674557>
+##contig=<ID=7,length=2082167746>
+##contig=<ID=8,length=2081484518>
+##contig=<ID=9,length=2024734096>
+##contig=<ID=10,length=1752849333>
+##contig=<ID=11,length=1650012615>
+##contig=<ID=12,length=1392452741>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+1	22	.	A	G	.	.	.
diff --git a/test/split-vep.26.out b/test/split-vep.26.out
new file mode 100644
index 000000000..9eadaff25
--- /dev/null
+++ b/test/split-vep.26.out
@@ -0,0 +1,21 @@
+860416
+860417
+860431
+860461
+860498
+860504
+860511
+860521
+860564
+860604
+860607
+860637
+860684
+860688
+861172
+861177
+861178
+861196
+861209
+861211
+861213
diff --git a/test/split-vep.filter.1.out b/test/split-vep.filter.1.out
new file mode 100644
index 000000000..a0c86c26e
--- /dev/null
+++ b/test/split-vep.filter.1.out
@@ -0,0 +1 @@
+41334153 5_prime_utr_variant&nmd_transcript_variant ENST00000478993 nonsense_mediated_decay
diff --git a/test/split-vep.filter.2.out b/test/split-vep.filter.2.out
new file mode 100644
index 000000000..135f6c0ec
--- /dev/null
+++ b/test/split-vep.filter.2.out
@@ -0,0 +1 @@
+41334153 5_prime_utr_variant&nmd_transcript_variant ENST00000478993 nonsense_mediated_decay A|5_prime_UTR_variant&NMD_transcript_variant|MODIFIER|DDX3X|ENSG00000215301|Transcript|ENST00000478993|nonsense_mediated_decay|1/19||ENST00000478993.5:c.-100G>A||756||||||1||1||SNV|HGNC|HGNC:2745|||1||CCDS43931.1|ENSP00000478443|O00571.228||UPI000013CB6D|O00571-1|1|||||||||||||||||||||||||||||||||||loss/donor/41334153-41334154/Medium/1.787081|0|0|2|uAUG_gained_CapDistanceToStart:755&uAUG_gained_DistanceToCDS:100&uAUG_gained_DistanceToStop:27&uAUG_gained_KozakContext:GCGATGC&uAUG_gained_KozakStrength:Moderate&uAUG_gained_type:uORF|uAUG_gained|15.55|1.471728||||||-27|11|-27|-25|0.01|0.00|0.00|0.00|DDX3X
diff --git a/test/split-vep.filter.vcf b/test/split-vep.filter.vcf
new file mode 100644
index 000000000..2a058801d
--- /dev/null
+++ b/test/split-vep.filter.vcf
@@ -0,0 +1,7 @@
+##fileformat=VCFv4.2
+##contig=<ID=chrX,length=156040895>
+##VEP="v102" time="2020-12-14 19:43:49" cache="/opt/vep/.vep/homo_sapiens/102_GRCh38" ensembl-variation=102.2716d2e ensembl-io=102.ff1cf96 ensembl-funcgen=102.6bd93a0 ensembl=102.347f9ed 1000genomes="phase3" COSMIC="91" ClinVar="202006" ESP="V2-SSA137" HGMD-PUBLIC="20194" assembly="GRCh38.p13" dbSNP="153" gencode="GENCODE 36" genebuild="2014-07" gnomAD="r2.1" polyphen="2.2.2" regbuild="1.0" sift="sift5.2.2"
+##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|ALLELE_NUM|DISTANCE|STRAND|FLAGS|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|MANE|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|UNIPROT_ISOFORM|GENE_PHENO|SIFT|PolyPhen|DOMAINS|miRNA|HGVS_OFFSET|AF|AFR_AF|AMR_AF|EAS_AF|EUR_AF|SAS_AF|AA_AF|EA_AF|gnomAD_AF|gnomAD_AFR_AF|gnomAD_AMR_AF|gnomAD_ASJ_AF|gnomAD_EAS_AF|gnomAD_FIN_AF|gnomAD_NFE_AF|gnomAD_OTH_AF|gnomAD_SAS_AF|MAX_AF|MAX_AF_POPS|CLIN_SIG|SOMATIC|PHENO|PUBMED|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|TRANSCRIPTION_FACTORS|SpliceRegion|GeneSplicer|existing_InFrame_oORFs|existing_OutOfFrame_oORFs|existing_uORFs|five_prime_UTR_variant_annotation|five_prime_UTR_variant_consequence|CADD_PHRED|CADD_RAW|LoF|LoF_filter|LoF_flags|LoF_info|REVEL|SpliceAI_pred_DP_AG|SpliceAI_pred_DP_AL|SpliceAI_pred_DP_DG|SpliceAI_pred_DP_DL|SpliceAI_pred_DS_AG|SpliceAI_pred_DS_AL|SpliceAI_pred_DS_DG|SpliceAI_pred_DS_DL|SpliceAI_pred_SYMBOL">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+chrX	41334153	.	G	A	.	.	CSQ=A|5_prime_UTR_variant&NMD_transcript_variant|MODIFIER|DDX3X|ENSG00000215301|Transcript|ENST00000478993|nonsense_mediated_decay|1/19||ENST00000478993.5:c.-100G>A||756||||||1||1||SNV|HGNC|HGNC:2745|||1||CCDS43931.1|ENSP00000478443|O00571.228||UPI000013CB6D|O00571-1|1|||||||||||||||||||||||||||||||||||loss/donor/41334153-41334154/Medium/1.787081|0|0|2|uAUG_gained_CapDistanceToStart:755&uAUG_gained_DistanceToCDS:100&uAUG_gained_DistanceToStop:27&uAUG_gained_KozakContext:GCGATGC&uAUG_gained_KozakStrength:Moderate&uAUG_gained_type:uORF|uAUG_gained|15.55|1.471728||||||-27|11|-27|-25|0.01|0.00|0.00|0.00|DDX3X
+chrX	41334585	.	GGACGCGCATGCGC	G	.	.	CSQ=-|intron_variant|MODIFIER|DDX3X|ENSG00000215301|Transcript|ENST00000457138|protein_coding||1/15|ENST00000457138.7:c.45+291_45+303del||||||||1||1||deletion|HGNC|HGNC:2745|||2||CCDS55404.1|ENSP00000392494|O00571.228||UPI00017A8655|O00571-2|1||||||||||||||||||||||||||||||||||||||||||||||||||||||||,-|intron_variant&NMD_transcript_variant|MODIFIER|DDX3X|ENSG00000215301|Transcript|ENST00000478993|nonsense_mediated_decay||1/18|ENST00000478993.5:c.45+291_45+303del||||||||1||1||deletion|HGNC|HGNC:2745|||1||CCDS43931.1|ENSP00000478443|O00571.228||UPI000013CB6D|O00571-1|1||||||||||||||||||||||||||||||||||||||||||||||||||||||||
diff --git a/test/split-vep.mixed-list.txt b/test/split-vep.mixed-list.txt
new file mode 100644
index 000000000..aafe13f1b
--- /dev/null
+++ b/test/split-vep.mixed-list.txt
@@ -0,0 +1,3 @@
+ENST00000344229
+ENST00000317578
+RPL10
diff --git a/test/stats.counts.2.chk b/test/stats.counts.2.chk
index e86630791..812772c90 100644
--- a/test/stats.counts.2.chk
+++ b/test/stats.counts.2.chk
@@ -26,7 +26,7 @@ ST	0	T>A	0
 ST	0	T>C	0
 ST	0	T>G	0
 PSC	0	A	4	0	0	0	0	0	0.0	0	0	0	1
-PSC	0	B	0	1	2	2	1	0	0.0	0	2	0	0
+PSC	0	B	0	1	2	2	2	0	0.0	0	2	0	0
 PSC	0	C	0	3	0	2	1	0	0.0	0	0	1	1
 PSI	0	A	0	0	0	0.00	0	0	0	0
 PSI	0	B	0	0	0	0.00	0	0	0	0
diff --git a/test/stats.counts.chk b/test/stats.counts.chk
index ee798a03b..348c81e1d 100644
--- a/test/stats.counts.chk
+++ b/test/stats.counts.chk
@@ -27,11 +27,11 @@ ST	0	T>A	0
 ST	0	T>C	0
 ST	0	T>G	0
 PSC	0	A	11	0	0	0	0	0	0.0	0	0	0	1
-PSC	0	B	1	1	4	3	1	1	0.0	0	2	0	0
+PSC	0	B	1	1	4	3	2	1	0.0	0	2	0	0
 PSC	0	C	1	5	0	3	1	0	0.0	0	0	1	1
 PSI	0	A	0	0	0	0.00	0	0	0	0
 PSI	0	B	0	0	0	0.00	1	0	0	0
 PSI	0	C	0	0	0	0.00	0	0	0	0
-HWE	0	0.000000	3	0.000000	0.000000	0.000000
+HWE	0	0.000000	2	0.000000	0.000000	0.000000
 HWE	0	0.330000	1	0.000000	0.000000	0.000000
 HWE	0	0.490000	7	0.330000	0.330000	0.330000
diff --git a/test/stats.vaf.1.chk b/test/stats.vaf.1.chk
new file mode 100644
index 000000000..4310c8566
--- /dev/null
+++ b/test/stats.vaf.1.chk
@@ -0,0 +1,45 @@
+SN	0	number of samples:	2
+SN	0	number of records:	10
+SN	0	number of no-ALTs:	0
+SN	0	number of SNPs:	9
+SN	0	number of MNPs:	0
+SN	0	number of indels:	1
+SN	0	number of others:	0
+SN	0	number of multiallelic sites:	3
+SN	0	number of multiallelic SNP sites:	2
+TSTV	0	1	10	0.10	0	9	0.00
+SiS	0	1	4	1	3	2	0	0	2
+AF	0	0.000000	4	1	3	3	0	0	3
+AF	0	0.490000	7	0	7	0	0	0	0
+QUAL	0	.	9	0	9	1
+IDD	0	-5	1	0	.
+IDD	0	-4	1	1	0.43
+IDD	0	-3	1	1	0.29
+ST	0	A>C	0
+ST	0	A>G	0
+ST	0	A>T	0
+ST	0	C>A	8
+ST	0	C>G	1
+ST	0	C>T	1
+ST	0	G>A	0
+ST	0	G>C	0
+ST	0	G>T	1
+ST	0	T>A	0
+ST	0	T>C	0
+ST	0	T>G	0
+DP	0	1	8	40.000000	0	0.000000
+DP	0	2	3	15.000000	0	0.000000
+DP	0	3	2	10.000000	0	0.000000
+DP	0	5	2	10.000000	0	0.000000
+DP	0	6	2	10.000000	0	0.000000
+DP	0	7	1	5.000000	0	0.000000
+DP	0	8	1	5.000000	0	0.000000
+DP	0	10	1	5.000000	0	0.000000
+PSC	0	a	0	0	9	0	10	1	3.7	2	0	0	0
+PSC	0	b	2	0	8	1	7	0	3.0	0	0	0	0
+PSI	0	a	0	0	0	0.00	0	1	0	0
+PSI	0	b	0	0	0	0.00	0	0	0	0
+HWE	0	0.000000	3	0.000000	0.000000	0.990000
+HWE	0	0.490000	7	0.990000	0.990000	0.990000
+VAF	0	a	0,0,0,1,2,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,4	0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
+VAF	0	b	0,0,0,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,4	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
diff --git a/test/stats.vaf.vcf b/test/stats.vaf.vcf
new file mode 100644
index 000000000..fb3c1c431
--- /dev/null
+++ b/test/stats.vaf.vcf
@@ -0,0 +1,16 @@
+##fileformat=VCFv4.2
+##reference=ref.fa
+##contig=<ID=chr1,length=248956422>
+##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths (high-quality bases)">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	a	b
+chr1	603697	.	C	A,T	.	.	.	GT:AD	0/1:0,1,0	0/2:0,0,1
+chr1	847172	.	C	A	.	.	.	GT:AD	0/1:1,1	0/1:1,1
+chr1	895916	.	C	A,G	.	.	.	GT:AD	1/2:8,2,0	0/0:8,0,0
+chr1	940526	.	CTTTTT	C,CTT,CT	.	.	.	GT:AD	2/3:2,0,2,3	0/0:2,0,0,0
+chr1	1054587	.	C	A	.	.	.	GT:AD	0/1:5,1	0/1:5,1
+chr1	1123455	.	C	A	.	.	.	GT:AD	0/1:0,1	0/1:0,1
+chr1	1130454	.	C	A	.	.	.	GT:AD	0/1:4,1	0/1:4,1
+chr1	1145661	.	G	T	.	.	.	GT:AD	0/1:0,1	0/1:0,1
+chr1	1198538	.	C	A	.	.	.	GT:AD	0/1:0,1	0/1:0,1
+chr1	1349818	.	C	A	.	.	.	GT:AD	0/1:2,1	0/1:2,1
diff --git a/test/test.pl b/test/test.pl
index 37cb1a8d3..e2178a33b 100755
--- a/test/test.pl
+++ b/test/test.pl
@@ -48,6 +48,7 @@
 run_test(\&test_vcf_stats,$opts,in=>['stats.a','stats.b'],out=>'stats.B.chk',args=>'-s B');
 run_test(\&test_vcf_stats,$opts,in=>['stats.counts'],out=>'stats.counts.chk',args=>'-s -');
 run_test(\&test_vcf_stats,$opts,in=>['stats.counts'],out=>'stats.counts.2.chk',args=>q[-s - -i 'type="snp"']);
+run_test(\&test_vcf_stats,$opts,in=>['stats.vaf'],out=>'stats.vaf.1.chk',args=>q[-s -]);
 run_test(\&test_vcf_isec,$opts,in=>['isec.a','isec.b'],out=>'isec.ab.out',args=>'-n =2');
 run_test(\&test_vcf_isec,$opts,in=>['isec.a','isec.b'],out=>'isec.ab.flt.out',args=>'-n =2 -i"STRLEN(REF)==2"');
 run_test(\&test_vcf_isec,$opts,in=>['isec.a','isec.b'],out=>'isec.ab.both.out',args=>'-n =2 -c both');
@@ -90,8 +91,6 @@
 run_test(\&test_vcf_merge,$opts,in=>['merge.gvcf.10.a','merge.gvcf.10.b'],out=>'merge.gvcf.10.2.out',args=>'-m none');
 run_test(\&test_vcf_merge,$opts,in=>['merge.gvcf.10.a','merge.gvcf.10.b'],out=>'merge.gvcf.10.3.out',args=>'-g {PATH}/merge.gvcf.10.fa');
 run_test(\&test_vcf_merge,$opts,in=>['merge.gvcf.10.a','merge.gvcf.10.b'],out=>'merge.gvcf.10.4.out',args=>'-g {PATH}/merge.gvcf.10.fa -m none');
-run_test(\&test_vcf_merge,$opts,in=>['merge.gvcf.10.b','merge.gvcf.10.a'],out=>'merge.gvcf.10.5.out',args=>'-g {PATH}/merge.gvcf.10.fa');
-run_test(\&test_vcf_merge,$opts,in=>['merge.gvcf.10.b','merge.gvcf.10.a'],out=>'merge.gvcf.10.4.out',args=>'-g {PATH}/merge.gvcf.10.fa -m none');
 run_test(\&test_vcf_merge,$opts,in=>['merge.noidx.a','merge.noidx.b','merge.noidx.c'],out=>'merge.noidx.abc.out',args=>'');
 run_test(\&test_vcf_merge,$opts,in=>['merge.noidx.a','merge.noidx.b','merge.noidx.c'],out=>'merge.noidx.abc.out',args=>'--no-index',noidx=>1);
 run_test(\&test_vcf_merge,$opts,in=>['merge.8.a','merge.8.b'],out=>'merge.8.out',args=>'');
@@ -101,6 +100,11 @@
 run_test(\&test_vcf_merge,$opts,in=>['merge.10.a','merge.10.b'],out=>'merge.10.1.out',args=>'-m none');
 run_test(\&test_vcf_merge,$opts,in=>['merge.10.a','merge.10.b'],out=>'merge.10.2.out',args=>'-m both');
 run_test(\&test_vcf_merge,$opts,in=>['merge.10.a','merge.10.b'],out=>'merge.10.3.out',args=>'-m snp-ins-del');
+run_test(\&test_vcf_merge,$opts,in=>['merge.mrules.1.a','merge.mrules.1.b'],out=>'merge.mrules.1.1.out',args=>'--gvcf -');
+run_test(\&test_vcf_merge,$opts,in=>['merge.mrules.1.a','merge.mrules.1.b'],out=>'merge.mrules.1.2.out',args=>'--gvcf - -M AD:.,PL:.');
+run_test(\&test_vcf_merge,$opts,in=>['merge.gvcf.5.a','merge.gvcf.5.b'],out=>'merge.gvcf.5.1.out',args=>'--gvcf -');
+run_test(\&test_vcf_merge,$opts,in=>['merge.gvcf.5.a','merge.gvcf.5.b'],out=>'merge.gvcf.5.1.out',args=>'--gvcf - --merge none');
+run_test(\&test_vcf_merge,$opts,in=>['merge.gvcf.11.a','merge.gvcf.11.b','merge.gvcf.11.c'],out=>'merge.gvcf.11.1.out',args=>'--gvcf -');
 # run_test(\&test_vcf_merge_big,$opts,in=>'merge_big.1',out=>'merge_big.1.1',nsmpl=>79000,nfiles=>79,nalts=>486,args=>'');   # commented out for speed
 run_test(\&test_vcf_query,$opts,in=>'query.string',out=>'query.string.1.out',args=>q[-f '%CHROM\\t%POS\\t%CLNREVSTAT\\n' -i'CLNREVSTAT="criteria_provided,_conflicting_interpretations"']);
 run_test(\&test_vcf_query,$opts,in=>'query.string',out=>'query.string.1.out',args=>q[-f '%CHROM\\t%POS\\t%CLNREVSTAT\\n' -i'CLNREVSTAT="criteria_provided" || CLNREVSTAT="_conflicting_interpretations"']);
@@ -233,6 +237,13 @@
 run_test(\&test_vcf_query,$opts,in=>'filter.12',out=>'query.89.out',args=>q[-i'FILTER~"A;B"' -f'%FILTER\\n']);
 run_test(\&test_vcf_query,$opts,in=>'filter.12',out=>'query.90.out',args=>q[-i'FILTER!~"A;B"' -f'%FILTER\\n']);
 run_test(\&test_vcf_query,$opts,in=>'filter.10',out=>'query.91.out',args=>q[-i'DP%10==2' -f'[ %DP]\\n']);
+run_test(\&test_vcf_query,$opts,in=>'query.header',out=>'query.95.out',args=>q[-H -f'[%CHROM %POS  %SAMPLE %DP %GT\\n]']);
+run_test(\&test_vcf_query,$opts,in=>'query.header',out=>'query.95.out',args=>q[-H -f'[%CHROM %POS  %SAMPLE %DP %GT]']);
+run_test(\&test_vcf_query,$opts,in=>'query.header',out=>'query.96.out',args=>q[-H -f'[%CHROM %POS  %SAMPLE %DP %GT]\\n']);
+run_test(\&test_vcf_query,$opts,in=>'query.header',out=>'query.97.out',args=>q[-H -f'%CHROM %POS[ %SAMPLE %DP %GT]\\n']);
+run_test(\&test_vcf_query,$opts,in=>'query.header',out=>'query.97.out',args=>q[-H -f'%CHROM %POS[ %SAMPLE %DP %GT]']);
+run_test(\&test_vcf_query,$opts,in=>'query.header',out=>'query.98.out',args=>q[-H -f'%CHROM %POS[ %SAMPLE][ %DP][ %GT]\\n']);
+run_test(\&test_vcf_query,$opts,in=>'query.header',out=>'query.98.out',args=>q[-H -f'%CHROM %POS[ %SAMPLE][ %DP][ %GT]']);
 run_test(\&test_vcf_norm,$opts,in=>'norm',out=>'norm.out',fai=>'norm',args=>'-cx');
 run_test(\&test_vcf_norm,$opts,in=>'norm.split',out=>'norm.split.out',args=>'-m-');
 run_test(\&test_vcf_norm,$opts,in=>'norm.split.2',out=>'norm.split.2.out',args=>'-m-');
@@ -272,6 +283,24 @@
 run_test(\&test_vcf_norm,$opts,in=>'norm.5',out=>'norm.5.1.out',args=>'-m - --multi-overlaps 0');
 run_test(\&test_vcf_norm,$opts,in=>'norm.5',out=>'norm.5.2.out',args=>'-m - --multi-overlaps .');
 run_test(\&test_vcf_norm,$opts,in=>'norm.m-any',out=>'norm.m-any.1.out',args=>'-m -any');
+run_test(\&test_vcf_norm,$opts,in=>'norm.phased-split',out=>'norm.phased-split.1.out',args=>'-m -any');
+run_test(\&test_vcf_norm,$opts,in=>'norm.phased-join',out=>'norm.phased-join.1.out',args=>'-m +any');
+run_test(\&test_vcf_norm,$opts,in=>'norm.symbolic',fai=>'norm.symbolic',out=>'norm.symbolic.1.out',args=>'--old-rec-tag ORI');
+run_test(\&test_vcf_norm,$opts,in=>'norm.right-align',fai=>'norm.right-align',out=>'norm.right-align.1.out',args=>'--old-rec-tag ORI');
+run_test(\&test_vcf_norm,$opts,in=>'norm.right-align',fai=>'norm.right-align',out=>'norm.right-align.2.out',args=>'--old-rec-tag ORI -g {PATH}/norm.right-align.gff');
+run_test(\&test_vcf_view,$opts,in=>'weird-chr-names',out=>'weird-chr-names.1.out',args=>'',reg=>'-r 1');
+run_test(\&test_vcf_view,$opts,in=>'weird-chr-names',out=>'weird-chr-names.1.out',args=>'',reg=>'-r 1:1-2');
+run_test(\&test_vcf_view,$opts,in=>'weird-chr-names',out=>'weird-chr-names.1.out',args=>'',reg=>'-r 1:1,1:2');
+run_test(\&test_vcf_view,$opts,in=>'weird-chr-names',out=>'weird-chr-names.2.out',args=>'',reg=>'-r 1:1-1');
+run_test(\&test_vcf_view,$opts,in=>'weird-chr-names',out=>'weird-chr-names.3.out',args=>'',reg=>'-r {1:1}');
+run_test(\&test_vcf_view,$opts,in=>'weird-chr-names',out=>'weird-chr-names.3.out',args=>'',reg=>'-r {1:1}:1-2');
+run_test(\&test_vcf_view,$opts,in=>'weird-chr-names',out=>'weird-chr-names.3.out',args=>'',reg=>'-r {1:1}:1,{1:1}:2');
+run_test(\&test_vcf_view,$opts,in=>'weird-chr-names',out=>'weird-chr-names.4.out',args=>'',reg=>'-r {1:1}:1-1');
+run_test(\&test_vcf_view,$opts,in=>'weird-chr-names',out=>'weird-chr-names.5.out',args=>'',reg=>'-r {1:1-1}');
+run_test(\&test_vcf_view,$opts,in=>'weird-chr-names',out=>'weird-chr-names.5.out',args=>'',reg=>'-r {1:1-1}:1-2');
+run_test(\&test_vcf_view,$opts,in=>'weird-chr-names',out=>'weird-chr-names.5.out',args=>'',reg=>'-r {1:1-1}:1,{1:1-1}:2');
+run_test(\&test_vcf_view,$opts,in=>'weird-chr-names',out=>'weird-chr-names.6.out',args=>'',reg=>'-r {1:1-1}:1-1');
+run_test(\&test_vcf_view,$opts,in=>'weird-chr-names',args=>'',reg=>'-r {1:1-1}-2',expected_failure=>1);
 run_test(\&test_vcf_view,$opts,in=>'view',out=>'view.1.out',args=>'-aUc1 -C1 -s NA00002 -v snps',reg=>'');
 run_test(\&test_vcf_view,$opts,in=>'view',out=>'view.2.out',args=>'-f PASS -Xks NA00003',reg=>'-r20,Y');
 run_test(\&test_vcf_view,$opts,in=>'view',out=>'view.3.out',args=>'-xs NA00003',reg=>'');
@@ -498,7 +527,7 @@
 run_test(\&test_vcf_annotate,$opts,in=>'annotate.missing',tab=>'annotate.missing',out=>'annotate.missing.5.out',args=>'-c CHROM,POS,REF,ALT,.=TSTR,.=TFLT,.=TINT');
 run_test(\&test_vcf_annotate,$opts,in=>'annotate.missing',tab=>'annotate.missing',out=>'annotate.missing.6.out',args=>'-c CHROM,POS,REF,ALT,=TSTR,=TFLT,=TINT');
 run_test(\&test_vcf_annotate,$opts,in=>'annotate.olap',tab=>'annots.olap',out=>'annotate.olap.1.out',args=>'-c CHROM,BEG,END,DB -l DB:unique');
-run_test(\&test_vcf_annotate,$opts,in=>'annotate.olap',tab=>'annots.olap',out=>'annotate.olap.2.out',args=>'-c CHROM,BEG,END,DB -l DB:unique --min-overlap 0.4:0.5');
+run_test(\&test_vcf_annotate,$opts,in=>'annotate.olap',tab=>'annots.olap',out=>'annotate.olap.2.out',args=>'-c CHROM,BEG,END,DB -l DB:unique --min-overlap 0.4:0.5 -m XXX');
 run_test(\&test_vcf_annotate,$opts,in=>'annotate.id',vcf=>'annots.id',out=>'annotate.id.1.out',args=>'-c ALT');
 run_test(\&test_vcf_annotate,$opts,in=>'annotate.id',vcf=>'annots.id',out=>'annotate.id.2.out',args=>'-c +ALT');
 run_test(\&test_vcf_annotate,$opts,in=>'annotate.id.2',vcf=>'annots.id.2',out=>'annotate.id.2.1.out',args=>'--pair-logic some -c +ID');
@@ -508,6 +537,8 @@
 run_test(\&test_vcf_annotate,$opts,in=>'annotate28',tab=>'annots28',out=>'annotate28.2.out',args=>'-c CHROM,POS,REF,ALT,FMT/TEST -s smpl2,smpl1');
 run_test(\&test_vcf_annotate,$opts,in=>'annotate28',tab=>'annots28',out=>'annotate28.3.out',args=>'-c CHROM,POS,REF,ALT,FMT/TEST -s smpl1');
 run_test(\&test_vcf_annotate,$opts,in=>'annotate28',tab=>'annots28',out=>'annotate28.4.out',args=>'-c CHROM,POS,REF,ALT,FMT/TEST -s smpl2');
+run_test(\&test_vcf_annotate,$opts,in=>'annotate',out=>'annotate.33.out',args=>'-m XXX');
+run_test(\&test_vcf_annotate,$opts,in=>'annotate34',tab=>'annots34',out=>'annotate34.out',args=>q[-c CHROM,FROM,TO,INFO/END -H '##INFO=<ID=END,Number=1,Type=Integer,Description="End coordinate in reference for SV">']);
 run_test(\&test_vcf_plugin,$opts,in=>'checkploidy',out=>'checkploidy.out',cmd=>'+check-ploidy --no-version');
 run_test(\&test_vcf_plugin,$opts,in=>'checkploidy.2',out=>'checkploidy.2.out',cmd=>'+check-ploidy --no-version');
 run_test(\&test_vcf_plugin,$opts,in=>'checkploidy.2',out=>'checkploidy.3.out',cmd=>'+check-ploidy --no-version',args=>'-- -m');
@@ -588,11 +619,6 @@
 run_test(\&test_vcf_plugin,$opts,in=>'view',out=>'view.GTsubset.NA1.out',cmd=>'+GTsubset --no-version',args=>'-- -s NA00001');
 run_test(\&test_vcf_plugin,$opts,in=>'view',out=>'view.GTsubset.NA1NA2.out',cmd=>'+GTsubset --no-version',args=>'-- -s NA00001,NA00002');
 run_test(\&test_vcf_plugin,$opts,in=>'view',out=>'view.GTsubset.NA1NA2NA3.out',cmd=>'+GTsubset --no-version',args=>'-- -s NA00001,NA00002,NA00003');
-run_test(\&test_vcf_plugin,$opts,in=>'mendelian',out=>'mendelian.1.out',cmd=>'+mendelian',args=>'-t mom1,dad1,child1 -md');
-run_test(\&test_vcf_plugin,$opts,in=>'mendelian',out=>'mendelian.2.out',cmd=>'+mendelian',args=>'-t mom1,dad1,child1 -m+');
-run_test(\&test_vcf_plugin,$opts,in=>'mendelian',out=>'mendelian.3.out',cmd=>'+mendelian',args=>'-t mom1,dad1,child1 -mx');
-run_test(\&test_vcf_plugin,$opts,in=>'mendelian',out=>'mendelian.4.out',cmd=>'+mendelian',args=>'-t mom1,dad1,child1 -ma');
-run_test(\&test_vcf_plugin,$opts,in=>'mendelian',out=>'mendelian.5.out',cmd=>'+mendelian',args=>'-t mom1,dad1,child1 -mu');
 run_test(\&test_vcf_plugin,$opts,in=>'mendelian',out=>'mendelian.1.out',cmd=>'+mendelian2',args=>'-p child1,dad1,mom1 -md');
 run_test(\&test_vcf_plugin,$opts,in=>'mendelian',out=>'mendelian.6.out',cmd=>'+mendelian2',args=>'-p child1,dad1,mom1 -mg');
 run_test(\&test_vcf_plugin,$opts,in=>'mendelian',out=>'mendelian.3.out',cmd=>'+mendelian2',args=>'-p child1,dad1,mom1 -me');
@@ -658,9 +684,14 @@
 run_test(\&test_vcf_plugin,$opts,in=>'split-vep.10',out=>'split-vep.25.out',cmd=>'+split-vep',args=>qq[-a CSQ -f '%xM_CAP_pred %xM_CAP_score\\n' -p x | grep -v ^#]);
 run_test(\&test_vcf_plugin,$opts,in=>'split-vep.gene-list',out=>'split-vep.gene-list.1.out',cmd=>'+split-vep',args=>qq[-d -f '%CHROM:%POS %Gene %Consequence\\n']);
 run_test(\&test_vcf_plugin,$opts,in=>'split-vep.gene-list',out=>'split-vep.gene-list.2.out',cmd=>'+split-vep',args=>qq[-d -f '%CHROM:%POS %Gene %Consequence\\n' -g {PATH}/split-vep.gene-list.txt]);
+run_test(\&test_vcf_plugin,$opts,in=>'split-vep.gene-list',out=>'split-vep.gene-list.2.out',cmd=>'+split-vep',args=>qq[-d -f '%CHROM:%POS %Gene %Consequence\\n' -g {PATH}/split-vep.mixed-list.txt --gene-list-fields Feature,SYMBOL]);
 run_test(\&test_vcf_plugin,$opts,in=>'split-vep.gene-list',out=>'split-vep.gene-list.3.out',cmd=>'+split-vep',args=>qq[-d -f '%CHROM:%POS %Gene %Consequence\\n' -g +{PATH}/split-vep.gene-list.txt]);
+run_test(\&test_vcf_plugin,$opts,in=>'split-vep.gene-list',out=>'split-vep.gene-list.3.out',cmd=>'+split-vep',args=>qq[-d -f '%CHROM:%POS %Gene %Consequence\\n' -g +{PATH}/split-vep.mixed-list.txt --gene-list-fields Feature,SYMBOL]);
 run_test(\&test_vcf_plugin,$opts,in=>'split-vep.broken-LoF',out=>'split-vep.broken-LoF.out',cmd=>'+split-vep',args=>qq[-d -f '%CHROM:%POS %Consequence %LoF_info\\n' -a vep]);
 run_test(\&test_vcf_plugin,$opts,in=>'split-vep.broken-LoF',out=>'split-vep.broken-LoF.2.out',cmd=>'+split-vep',args=>qq[-d -f '%CHROM:%POS %LoF_info\\n' -a vep -i 'Consequence=="frameshift_variant"']);
+run_test(\&test_vcf_plugin,$opts,in=>'split-vep',out=>'split-vep.26.out',cmd=>'+split-vep',args=>qq[-f'%POS\\n' -i'SYMBOL~"SAMD11"']);
+run_test(\&test_vcf_plugin,$opts,in=>'split-vep.filter',out=>'split-vep.filter.1.out',cmd=>'+split-vep',args=>qq[-s worst -i'CSQ~"nonsense"' -f '%POS %Consequence %Feature %BIOTYPE']);
+run_test(\&test_vcf_plugin,$opts,in=>'split-vep.filter',out=>'split-vep.filter.2.out',cmd=>'+split-vep',args=>qq[-s worst -i'CSQ~"nonsense"' -f '%POS %Consequence %Feature %BIOTYPE %CSQ']);
 run_test(\&test_vcf_plugin,$opts,in=>'parental-origin',out=>'parental-origin.1.out',cmd=>'+parental-origin',args=>qq[-r 20:100 -p proband,father,mother -t del | grep -v ^#]);
 run_test(\&test_vcf_plugin,$opts,in=>'parental-origin',out=>'parental-origin.2.out',cmd=>'+parental-origin',args=>qq[-r 20:101 -p proband,father,mother -t del | grep -v ^#]);
 run_test(\&test_vcf_plugin,$opts,in=>'parental-origin',out=>'parental-origin.3.out',cmd=>'+parental-origin',args=>qq[-r 20:102 -p proband,father,mother -t del | grep -v ^#]);
@@ -704,6 +735,8 @@
 run_test(\&test_vcf_concat,$opts,in=>['concat.5.a','concat.5.b','concat.5.c'],out=>'concat.5.1.out',do_bcf=>0,args=>'-l --ligate-warn');
 run_test(\&test_vcf_concat,$opts,in=>['concat.5.a','concat.5.b','concat.5.c'],out=>'concat.5.1.out',do_bcf=>1,args=>'-l --ligate-warn');
 run_test(\&test_vcf_concat,$opts,in=>['concat.5.a','concat.5.b','concat.5.c'],out=>'concat.5.2.out',do_bcf=>1,args=>'-l --ligate-force');
+run_test(\&test_vcf_concat,$opts,in=>['concat.5.a','concat.5.b','concat.5.c'],out=>'concat.5.3.out',do_bcf=>0,args=>'-G -a -D');
+run_test(\&test_vcf_concat,$opts,in=>['concat.5.a','concat.5.b','concat.5.c'],out=>'concat.5.3.out',do_bcf=>1,args=>'-G -a -D');
 run_test(\&test_vcf_reheader,$opts,in=>'reheader',out=>'reheader.1.out',header=>'reheader.hdr');
 run_test(\&test_vcf_reheader,$opts,in=>'reheader',out=>'reheader.2.out',samples=>'reheader.samples');
 run_test(\&test_vcf_reheader,$opts,in=>'reheader',out=>'reheader.2.out',samples=>'reheader.samples2');
@@ -712,6 +745,7 @@
 run_test(\&test_vcf_reheader,$opts,in=>'empty',out=>'reheader.empty.out',header=>'reheader.empty.hdr');
 run_test(\&test_vcf_reheader,$opts,in=>'reheader.2',out=>'reheader.5.out',args=>'-f {PATH}/reheader.fai',nostdin=>1);
 run_test(\&test_vcf_reheader,$opts,in=>'reheader.2',out=>'reheader.5.out',args=>'-h {PATH}/reheader.2.hdr -f {PATH}/reheader.fai',nostdin=>1);
+run_test(\&test_vcf_reheader,$opts,in=>'reheader.3',out=>'reheader.6.out',args=>'-f {PATH}/reheader.3.fai',nostdin=>1);
 run_test(\&test_rename_chrs,$opts,in=>'annotate');
 run_test(\&test_vcf_convert,$opts,in=>'convert',out=>'convert.gs.gt.gen',args=>'-g -,.');
 run_test(\&test_vcf_convert,$opts,in=>'convert',out=>'convert.gs.gt.ids.gen',args=>'-g -,. --vcf-ids');
@@ -743,6 +777,7 @@
 run_test(\&test_vcf_convert_hs2vcf,$opts,h=>'convert.hs.gt.ids.hap',s=>'convert.hs.gt.samples',out=>'convert.gt.noHead.ids.vcf',args=>'--vcf-ids --hapsample2vcf');
 run_test(\&test_vcf_convert_gvcf,$opts,in=>'convert.gvcf',out=>'convert.gvcf.out',fa=>'gvcf.fa',args=>'--gvcf2vcf -i\'FILTER="PASS"\'');
 run_test(\&test_vcf_convert_tsv2vcf,$opts,in=>'convert.23andme',out=>'convert.23andme.vcf',args=>'-c ID,CHROM,POS,AA -s SAMPLE1',fai=>'23andme');
+run_test(\&test_vcf_convert_tsv2vcf,$opts,in=>'convert.tsv',out=>'convert.tsv.vcf',args=>'-c -,CHROM,POS,REF,ALT',fai=>'23andme');
 run_test(\&test_vcf_consensus,$opts,in=>'consensus',out=>'consensus.1.out',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-s -');
 run_test(\&test_vcf_consensus_chain,$opts,in=>'consensus',out=>'consensus.1.chain',chain=>'consensus.1.chain',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-s -');
 run_test(\&test_vcf_consensus,$opts,in=>'consensus',out=>'consensus.2.out',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-H 1');
@@ -761,9 +796,13 @@
 run_test(\&test_vcf_consensus,$opts,in=>'consensus5',out=>'consensus5.out',fa=>'consensus5.fa',args=>'--haplotype LA');
 run_test(\&test_vcf_consensus,$opts,in=>'consensus6',out=>'consensus6.out',fa=>'consensus6.fa',args=>'-s -');
 run_test(\&test_vcf_consensus,$opts,in=>'consensus7',out=>'consensus7a.out',fa=>'consensus7.fa',args=>'-H 2');
+run_test(\&test_vcf_consensus,$opts,in=>'consensus7',out=>'consensus7a.out',fa=>'consensus7.fa',args=>'-H 4');
 run_test(\&test_vcf_consensus,$opts,in=>'consensus7',out=>'consensus7b.out',fa=>'consensus7.fa',args=>'-H 2pIu');
+run_test(\&test_vcf_consensus,$opts,in=>'consensus7',out=>'consensus7b.out',fa=>'consensus7.fa',args=>'-H 4pIu');
 run_test(\&test_vcf_consensus,$opts,in=>'consensus7',out=>'consensus7c.out',fa=>'consensus7.fa',args=>'-H 1');
+run_test(\&test_vcf_consensus,$opts,in=>'consensus7',out=>'consensus7c.out',fa=>'consensus7.fa',args=>'-H 3');
 run_test(\&test_vcf_consensus,$opts,in=>'consensus7',out=>'consensus7d.out',fa=>'consensus7.fa',args=>'-H 1pIu');
+run_test(\&test_vcf_consensus,$opts,in=>'consensus7',out=>'consensus7d.out',fa=>'consensus7.fa',args=>'-H 3pIu');
 run_test(\&test_vcf_consensus,$opts,in=>'consensus8',out=>'consensus.8a.out',fa=>'consensus.fa',args=>'-s -');
 run_test(\&test_vcf_consensus,$opts,in=>'consensus8',out=>'consensus.8b.out',fa=>'consensus.fa',args=>'-s - -a .');
 run_test(\&test_vcf_consensus,$opts,in=>'consensus8',out=>'consensus.8c.out',fa=>'consensus.fa',args=>q[-s - -a . -i 'type="snp" || type="ref"']);
@@ -780,6 +819,7 @@
 run_test(\&test_vcf_consensus,$opts,in=>'consensus.13',out=>'consensus.13.out',fa=>'consensus.13.fa',args=>'-s -');
 run_test(\&test_vcf_consensus,$opts,in=>'consensus.14',out=>'consensus.14.out',fa=>'consensus.14.fa',args=>'-s -');
 run_test(\&test_vcf_consensus,$opts,in=>'consensus.12',out=>'consensus.15.out',fa=>'consensus.12.fa',args=>'-s - --mark-del - --mark-ins uc --mark-snv uc');
+run_test(\&test_vcf_consensus,$opts,in=>'consensus.12',out=>'consensus.19.out',fa=>'consensus.12.fa',args=>'-s - --mark-del - --mark-ins + --mark-snv :');
 run_test(\&test_vcf_consensus,$opts,in=>'consensus.15',out=>'consensus.17.out',fa=>'consensus.15.fa',args=>'-H I --mark-ins lc --mark-snv lc');
 run_test(\&test_vcf_consensus,$opts,in=>'consensus.16',out=>'consensus.18.out',fa=>'consensus.fa',args=>'-s - -I');
 run_test(\&test_vcf_consensus,$opts,in=>'consensus.16',out=>'consensus.18.out',fa=>'consensus.fa',args=>'-H I');
@@ -790,6 +830,7 @@
 run_test(\&test_vcf_consensus,$opts,in=>'consensus.20',out=>'consensus20.2.out',fa=>'consensus.20.fa',args=>'');
 run_test(\&test_vcf_consensus,$opts,in=>'consensus.20',out=>'consensus20.3.out',fa=>'consensus.20.fa',args=>'-M . -s b');
 run_test(\&test_vcf_consensus,$opts,in=>'consensus.20',out=>'consensus20.4.out',fa=>'consensus.20.fa',args=>'-M . -s a');
+run_test(\&test_vcf_consensus,$opts,in=>'consensus.21',out=>'consensus21.1.out',fa=>'consensus.21.fa',args=>'');
 run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.1.out',args=>q[-r17:100-150],test_list=>1);
 run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.2.out',args=>q[-a DP,DV -r17:100-600]); # test files from samtools mpileup test suite
 run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1)],out=>'mpileup/mpileup.3.out',args=>q[-B --ff 0x14 -r17:1050-1060]); # test file converted to vcf from samtools mpileup test suite
@@ -824,6 +865,14 @@
 run_test(\&test_csq,$opts,in=>'csq',out=>'csq.1.out',cmd=>'-f {PATH}/csq.fa -g {PATH}/csq.chr.gff3');
 run_test(\&test_csq,$opts,in=>'csq.2',out=>'csq.2.out',cmd=>'-f {PATH}/csq.fa -g {PATH}/csq.2.gff',tbcsq=>1);
 run_test(\&test_csq,$opts,in=>'csq.2',out=>'csq.3.out',cmd=>'-f {PATH}/csq.fa -g {PATH}/csq.2.gff --ncsq 64',tbcsq=>1);
+run_test(\&test_csq,$opts,in=>'csq.nchr',out=>'csq.chr.out',cmd=>'-f {PATH}/csq.nchr.fa -g {PATH}/csq.nchr.gff',tbcsq=>1);
+run_test(\&test_csq,$opts,in=>'csq.nchr',out=>'csq.chr.out',cmd=>'-f {PATH}/csq.ychr.fa -g {PATH}/csq.nchr.gff',tbcsq=>1);
+run_test(\&test_csq,$opts,in=>'csq.nchr',out=>'csq.chr.out',cmd=>'-f {PATH}/csq.nchr.fa -g {PATH}/csq.ychr.gff',tbcsq=>1);
+run_test(\&test_csq,$opts,in=>'csq.nchr',out=>'csq.chr.out',cmd=>'-f {PATH}/csq.ychr.fa -g {PATH}/csq.ychr.gff',tbcsq=>1);
+run_test(\&test_csq,$opts,in=>'csq.ychr',out=>'csq.chr.out',cmd=>'-f {PATH}/csq.ychr.fa -g {PATH}/csq.ychr.gff',tbcsq=>1);
+run_test(\&test_csq,$opts,in=>'csq.ychr',out=>'csq.chr.out',cmd=>'-f {PATH}/csq.ychr.fa -g {PATH}/csq.nchr.gff',tbcsq=>1);
+run_test(\&test_csq,$opts,in=>'csq.ychr',out=>'csq.chr.out',cmd=>'-f {PATH}/csq.nchr.fa -g {PATH}/csq.ychr.gff',tbcsq=>1);
+run_test(\&test_csq,$opts,in=>'csq.ychr',out=>'csq.chr.out',cmd=>'-f {PATH}/csq.nchr.fa -g {PATH}/csq.nchr.gff',tbcsq=>1);
 run_test(\&test_csq_real,$opts,in=>'csq');
 run_test(\&test_roh,$opts,in=>'roh.1',out=>'roh.1.1.out',args=>q[-Or -G30 --AF-dflt 0.4]);
 run_test(\&test_roh,$opts,in=>'roh.1',out=>'roh.1.1.out',args=>q[-Or -G30 --AF-file {PATH}/roh.1.tab.gz]);
@@ -955,11 +1004,12 @@ sub run_test
         my %args = @args;
         my $run  = 0;
         if ( exists($$opts{run_function}{$name}) ) { $run = 1; }
-        if ( !$run && exists($args{cmd}) )
+        if ( !$run )
         {
             for my $func (keys %{$$opts{run_function}})
             {
-                if ( $args{cmd}=~/$func/ ) { $run = 1; last; }
+                if ( exists($args{cmd}) && $args{cmd}=~/$func/ ) { $run = 1; last; }
+                if ( $name=~/$func/ ) { $run = 1; last; }
             }
         }
         if ( !$run ) { return; }
@@ -1041,7 +1091,13 @@ sub test_cmd
 
     my ($ret,$out,$err) = _cmd3("$args{cmd}");
     if ( length($err) ) { $err =~ s/\n/\n\t\t/gs; $err = "\n\n\t\t$err\n"; }
-    if ( $ret ) { failed($opts,$test,"Non-zero status $ret$err"); return; }
+    if ( $ret && !$args{expected_failure} ) { failed($opts,$test,"Non-zero status $ret$err"); return; }
+    if ( $args{expected_failure} )
+    {
+        if ( !$ret ) { failed($opts,$test,"Expected failure but the test returned $ret$err"); }
+        else { passed($opts,$test,"ok, expected non-zero status"); }
+        return;
+    }
     if ( $$opts{redo_outputs} && -e "$$opts{path}/$args{out}" )
     {
         rename("$$opts{path}/$args{out}","$$opts{path}/$args{out}.old");
@@ -1113,9 +1169,10 @@ sub failed
 }
 sub passed
 {
-    my ($opts,$test) = @_;
+    my ($opts,$test,$reason) = @_;  # $reason is optional: the text printed after ".. "
     $$opts{nok}++;
-    print ".. ok\n\n";
+    if ( !defined $reason ) { $reason = 'ok'; }  # default reproduces the former fixed ".. ok" line
+    print ".. $reason\n\n";
 }
 sub is_file_newer
 {
@@ -1144,6 +1201,18 @@ sub bgzip_tabix_vcf
     my ($opts,$file) = @_;
     bgzip_tabix($opts,file=>$file,suffix=>'vcf',args=>'-p vcf');
 }
+sub bgzip_index_bcf  # convert $path/$file.vcf to $tmp/$file.bcf and index it, skipping steps whose output already exists
+{
+    my ($opts,$file) = @_;
+    if ( !-e "$$opts{tmp}/$file.bcf" or is_file_newer("$$opts{path}/$file.vcf","$$opts{tmp}/$file.bcf") )  # BCF missing or, presumably, older than the source VCF
+    {
+        cmd("$$opts{bin}/bcftools view -Ob $$opts{path}/$file.vcf -o $$opts{tmp}/$file.bcf");
+    }
+    if ( !-e "$$opts{tmp}/$file.bcf.csi" or is_file_newer("$$opts{tmp}/$file.bcf","$$opts{tmp}/$file.bcf.csi") )  # index missing or, presumably, older than the BCF
+    {
+        cmd("$$opts{bin}/bcftools index -f $$opts{tmp}/$file.bcf");
+    }
+}
 
 
 # The tests --------------------------
@@ -1278,7 +1347,7 @@ sub test_vcf_merge
         $args     =~ s/{PATH}/$$opts{path}/g;
         my $files = join(' ',@files);
         test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools merge --no-version $args $files", exp_fix=>1);
-        test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools merge -Ob $args $files | $$opts{bin}/bcftools view | grep -v ^##bcftools_", exp_fix => 1);
+        test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools merge --no-version -Ob $args $files | $$opts{bin}/bcftools view --no-version | grep -v ^##bcftools_", exp_fix => 1);
     }
 }
 sub test_vcf_isec
@@ -1362,6 +1431,7 @@ sub test_vcf_norm
     my ($opts,%args) = @_;
     bgzip_tabix_vcf($opts,$args{in});
     my $params = '';
+    if ( exists($args{args}) ) { $args{args} =~ s/{PATH}/$$opts{path}/g; }  # guard: s/// on a missing key would create it as undef and defeat the exists() check below
     if ( exists($args{args}) ) { $params .= " $args{args}"; }
     if ( exists($args{fai} ) ) { $params .= " -f $$opts{path}/$args{fai}.fa"; }
     test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools norm --no-version $params $$opts{tmp}/$args{in}.vcf.gz",exp_fix=>1);
@@ -1725,10 +1795,12 @@ sub test_vcf_consensus
 {
     my ($opts,%args) = @_;
     bgzip_tabix_vcf($opts,$args{in});
+    bgzip_index_bcf($opts,$args{in});  # also prepare an indexed BCF copy of the same input
     $args{args} =~ s/{PATH}/$$opts{path}/g;
     my $mask = $args{mask} ? "-m $$opts{path}/$args{mask}" : '';
     my $chain = $args{chain} ? "-c $$opts{tmp}/$args{chain}" : '';
     test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools consensus $$opts{tmp}/$args{in}.vcf.gz -f $$opts{path}/$args{fa} $args{args} $mask $chain");
+    test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools consensus $$opts{tmp}/$args{in}.bcf    -f $$opts{path}/$args{fa} $args{args} $mask $chain");  # same command on the BCF copy; %args, and so the expected output, is unchanged
 }
 sub test_vcf_consensus_chain
 {
diff --git a/test/weird-chr-names.1.out b/test/weird-chr-names.1.out
new file mode 100644
index 000000000..5705c7575
--- /dev/null
+++ b/test/weird-chr-names.1.out
@@ -0,0 +1,9 @@
+##fileformat=VCFv4.3
+##FILTER=<ID=PASS,Description="All filters passed">
+##reference=ref.fa
+##contig=<ID=1>
+##contig=<ID=1:1>
+##contig=<ID=1:1-1>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+1	1	.	C	T	.	.	.
+1	2	.	C	T	.	.	.
diff --git a/test/weird-chr-names.2.out b/test/weird-chr-names.2.out
new file mode 100644
index 000000000..980818a71
--- /dev/null
+++ b/test/weird-chr-names.2.out
@@ -0,0 +1,8 @@
+##fileformat=VCFv4.3
+##FILTER=<ID=PASS,Description="All filters passed">
+##reference=ref.fa
+##contig=<ID=1>
+##contig=<ID=1:1>
+##contig=<ID=1:1-1>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+1	1	.	C	T	.	.	.
diff --git a/test/weird-chr-names.3.out b/test/weird-chr-names.3.out
new file mode 100644
index 000000000..5b3ac8e18
--- /dev/null
+++ b/test/weird-chr-names.3.out
@@ -0,0 +1,9 @@
+##fileformat=VCFv4.3
+##FILTER=<ID=PASS,Description="All filters passed">
+##reference=ref.fa
+##contig=<ID=1>
+##contig=<ID=1:1>
+##contig=<ID=1:1-1>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+1:1	1	.	C	T	.	.	.
+1:1	2	.	C	T	.	.	.
diff --git a/test/weird-chr-names.4.out b/test/weird-chr-names.4.out
new file mode 100644
index 000000000..0d9e274ab
--- /dev/null
+++ b/test/weird-chr-names.4.out
@@ -0,0 +1,8 @@
+##fileformat=VCFv4.3
+##FILTER=<ID=PASS,Description="All filters passed">
+##reference=ref.fa
+##contig=<ID=1>
+##contig=<ID=1:1>
+##contig=<ID=1:1-1>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+1:1	1	.	C	T	.	.	.
diff --git a/test/weird-chr-names.5.out b/test/weird-chr-names.5.out
new file mode 100644
index 000000000..6cb41e14f
--- /dev/null
+++ b/test/weird-chr-names.5.out
@@ -0,0 +1,9 @@
+##fileformat=VCFv4.3
+##FILTER=<ID=PASS,Description="All filters passed">
+##reference=ref.fa
+##contig=<ID=1>
+##contig=<ID=1:1>
+##contig=<ID=1:1-1>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+1:1-1	1	.	C	T	.	.	.
+1:1-1	2	.	C	T	.	.	.
diff --git a/test/weird-chr-names.6.out b/test/weird-chr-names.6.out
new file mode 100644
index 000000000..a707ed85c
--- /dev/null
+++ b/test/weird-chr-names.6.out
@@ -0,0 +1,8 @@
+##fileformat=VCFv4.3
+##FILTER=<ID=PASS,Description="All filters passed">
+##reference=ref.fa
+##contig=<ID=1>
+##contig=<ID=1:1>
+##contig=<ID=1:1-1>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+1:1-1	1	.	C	T	.	.	.
diff --git a/test/weird-chr-names.vcf b/test/weird-chr-names.vcf
new file mode 100644
index 000000000..c367be477
--- /dev/null
+++ b/test/weird-chr-names.vcf
@@ -0,0 +1,12 @@
+##fileformat=VCFv4.3
+##reference=ref.fa
+##contig=<ID=1>
+##contig=<ID=1:1>
+##contig=<ID=1:1-1>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+1	1	.	C	T	.	.	.
+1	2	.	C	T	.	.	.
+1:1	1	.	C	T	.	.	.
+1:1	2	.	C	T	.	.	.
+1:1-1	1	.	C	T	.	.	.
+1:1-1	2	.	C	T	.	.	.
diff --git a/tsv2vcf.c b/tsv2vcf.c
index 596e75a0a..22dec3065 100644
--- a/tsv2vcf.c
+++ b/tsv2vcf.c
@@ -10,10 +10,10 @@
     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     copies of the Software, and to permit persons to whom the Software is
     furnished to do so, subject to the following conditions:
-    
+
     The above copyright notice and this permission notice shall be included in
     all copies or substantial portions of the Software.
-    
+
     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
diff --git a/variantkey.h b/variantkey.h
index ccd4d8dd0..a74935fb7 100644
--- a/variantkey.h
+++ b/variantkey.h
@@ -3,14 +3,15 @@
 // variantkey.h
 //
 // @category   Libraries
-// @author     Nicola Asuni <nicola.asuni@genomicsplc.com>
-// @copyright  2017-2018 GENOMICS plc
-// @license    MIT (see LICENSE)
-// @link       https://github.com/genomicsplc/variantkey
+// @author     Nicola Asuni <info@tecnick.com>
+// @link       https://github.com/tecnickcom/variantkey
+// @license    MIT [LICENSE](https://raw.githubusercontent.com/tecnickcom/variantkey/main/LICENSE)
+// @copyright  2017-2018 GENOMICS plc, 2018-2023 Nicola Asuni - Tecnick.com
 //
 // LICENSE
 //
 // Copyright (c) 2017-2018 GENOMICS plc
+// Copyright (c) 2018-2023 Nicola Asuni - Tecnick.com
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -54,6 +55,7 @@
 #define VKMASK_REFALT   0x000000007FFFFFFF  //!< VariantKey binary mask for REF+ALT   [ 00000000 00000000 00000000 00000000 01111111 11111111 11111111 11111111 ]
 #define VKSHIFT_CHROM   59 //!< CHROM LSB position from the VariantKey LSB
 #define VKSHIFT_POS     31 //!< POS LSB position from the VariantKey LSB
+#define MAXUINT32       0xFFFFFFFF //!< Maximum value for uint32_t
 
 /**
  * VariantKey struct.
@@ -75,16 +77,54 @@ typedef struct vkrange_t
     uint64_t max; //!< Maximum VariantKey value for any given REF+ALT encoding
 } vkrange_t;
 
-/** @brief Returns chromosome numerical encoding.
+/** @brief Returns the encoding for a numerical chromosome input.
  *
  * @param chrom  Chromosome. An identifier from the reference genome, no white-space permitted.
  * @param size   Length of the chrom string, excluding the terminating null byte.
  *
  * @return CHROM code
  */
+static inline uint8_t encode_numeric_chrom(const char *chrom, size_t size)
+{
+    size_t i;
+    uint8_t v = (chrom[0] - '0'); // chrom[0] is not range-checked here; encode_chrom tests it is a digit before calling
+    for (i = 1; i < size; i++)
+    {
+        if ((chrom[i] > '9') || (chrom[i] < '0'))
+        {
+            return 0; // NA: a character that is not a number was found.
+        }
+        v = ((v * 10) + (chrom[i] - '0')); // NOTE(review): stored back into a uint8_t, so values above 255 wrap around
+    }
+    return v;
+}
+
+
+/** @brief Returns a true value (1) if the input chrom has 'chr' prefix (case insensitive).
+ *
+ * @param chrom  Chromosome. An identifier from the reference genome, no white-space permitted.
+ * @param size   Length of the chrom string, excluding the terminating null byte.
+ *
+ * @return True (1) if the chr prefix is present.
+ */
+static inline int has_chrom_chr_prefix(const char *chrom, size_t size)
+{
+    return ((size > 3) // strictly more than 3: at least one character must follow "chr"
+            && ((chrom[0] == 'c') || (chrom[0] == 'C'))
+            && ((chrom[1] == 'h') || (chrom[1] == 'H'))
+            && ((chrom[2] == 'r') || (chrom[2] == 'R')));
+}
+
+/** @brief Returns chromosome numerical encoding.
+ *
+ * @param chrom  Chromosome. An identifier from the reference genome, no white-space permitted.
+ * @param size   Length of the chrom string, excluding the terminating null byte.
+ *
+ * @return CHROM code or 0 in case of invalid input.
+ */
 static inline uint8_t encode_chrom(const char *chrom, size_t size)
 {
-    // X > 23 ; Y > 24 ; M > 25
+    // X = 23; Y = 24; M = 25; any other letter is mapped to 0:
     static const uint8_t onecharmap[] =
     {
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -98,12 +138,9 @@ static inline uint8_t encode_chrom(const char *chrom, size_t size)
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     };
-    // remove "chr" prefix
-    if ((size > 3)
-            && ((chrom[0] == 'c') || (chrom[0] == 'C'))
-            && ((chrom[1] == 'h') || (chrom[1] == 'H'))
-            && ((chrom[2] == 'r') || (chrom[2] == 'R')))
+    if (has_chrom_chr_prefix(chrom, size))
     {
+        // remove "chr" prefix
         chrom += 3;
         size -= 3;
     }
@@ -111,19 +148,9 @@ static inline uint8_t encode_chrom(const char *chrom, size_t size)
     {
         return 0;
     }
-    if ((chrom[0] <= '9') && (chrom[0] >= '0')) // Number
+    if ((chrom[0] <= '9') && (chrom[0] >= '0'))
     {
-        size_t i;
-        uint8_t v = (chrom[0] - '0');
-        for (i = 1; i < size; i++)
-        {
-            if ((chrom[i] > '9') || (chrom[i] < '0'))
-            {
-                return 0; // NA
-            }
-            v = ((v * 10) + (chrom[i] - '0'));
-        }
-        return v;
+        return encode_numeric_chrom(chrom, size);
     }
     if ((size == 1) || ((size == 2) && ((chrom[1] == 'T') || (chrom[1] == 't'))))
     {
@@ -159,10 +186,10 @@ static inline uint32_t encode_base(const uint8_t c)
 {
     /*
       Encode base:
-      A > 0
-      C > 1
-      G > 2
-      T > 3
+      A = 0
+      C = 1
+      G = 2
+      T = 3
     */
     static const uint32_t map[] =
     {
@@ -205,7 +232,7 @@ static inline uint32_t encode_refalt_rev(const char *ref, size_t sizeref, const
     uint8_t bitpos = 23;
     if ((encode_allele(&h, &bitpos, ref, sizeref) < 0) || (encode_allele(&h, &bitpos, alt, sizealt) < 0))
     {
-        return 0; // error code
+        return MAXUINT32; // error code
     }
     return h;
 }
@@ -318,7 +345,7 @@ static inline uint32_t encode_refalt(const char *ref, size_t sizeref, const char
     if ((sizeref + sizealt) <= 11)
     {
         uint32_t h = encode_refalt_rev(ref, sizeref, alt, sizealt);
-        if (h != 0)
+        if (h != MAXUINT32)
         {
             return h;
         }
@@ -486,7 +513,9 @@ static inline void decode_variantkey(uint64_t code, variantkey_t *vk)
     vk->refalt = extract_variantkey_refalt(code);
 }
 
-/** @brief Returns a 64 bit variant key based on CHROM, POS (0-based), REF, ALT.
+/**
+ * Returns a 64 bit variant key based on CHROM, POS (0-based), REF, ALT.
+ * The variant should be already normalized (see normalize_variant or use normalized_variantkey).
  *
  * @param chrom      Chromosome. An identifier from the reference genome, no white-space or leading zeros permitted.
  * @param sizechrom  Length of the chrom string, excluding the terminating null byte.
diff --git a/vcfannotate.c b/vcfannotate.c
index 495d2b5a3..b2e39ef7b 100644
--- a/vcfannotate.c
+++ b/vcfannotate.c
@@ -1,6 +1,6 @@
 /*  vcfannotate.c -- Annotate and edit VCF/BCF files.
 
-    Copyright (C) 2013-2022 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -118,6 +118,8 @@ typedef struct _args_t
     htsFile *out_fh;
     int output_type, n_threads, clevel;
     bcf_sr_regions_t *tgts;
+    char *index_fn;
+    int write_index;
 
     regidx_t *tgt_idx;  // keep everything in memory only with .tab annotation file and -c BEG,END columns
     regitr_t *tgt_itr;
@@ -2863,9 +2865,16 @@ static void init_data(args_t *args)
 
     if ( args->mark_sites )
     {
-        if ( !args->targets_fname ) error("The -a option not given\n");
-        bcf_hdr_printf(args->hdr_out,"##INFO=<ID=%s,Number=0,Type=Flag,Description=\"Sites %slisted in %s\">",
-            args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites);
+        if ( !args->targets_fname )
+        {
+            if ( args->mark_sites_logic!=MARK_LISTED ) error("The -a option not given but -%s logic was requested\n",args->mark_sites);
+            fprintf(stderr,"Note: The -a option not given, all sites will be annotated with INFO/%s\n",args->mark_sites);
+            bcf_hdr_printf(args->hdr_out,"##INFO=<ID=%s,Number=0,Type=Flag,Description=\"Sites marked with `bcftools annotate -m %s`\">",
+                    args->mark_sites,args->mark_sites);
+        }
+        else
+            bcf_hdr_printf(args->hdr_out,"##INFO=<ID=%s,Number=0,Type=Flag,Description=\"Sites %slisted in %s\">",
+                args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites);
     }
 
     if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate");
@@ -2881,6 +2890,7 @@ static void init_data(args_t *args)
         if ( args->n_threads )
             hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p);
         if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: failed to write the header to %s\n", __func__,args->output_fname);
+        if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
     }
 }
 
@@ -2943,7 +2953,19 @@ static void destroy_data(args_t *args)
         convert_destroy(args->set_ids);
     if ( args->filter )
         filter_destroy(args->filter);
-    if (args->out_fh) hts_close(args->out_fh);
+    if (args->out_fh)
+    {
+        if ( args->write_index )
+        {
+            if ( bcf_idx_save(args->out_fh)<0 )
+            {
+                if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+                error("Error: cannot write to index %s\n", args->index_fn);
+            }
+            free(args->index_fn);
+        }
+        if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+    }
     free(args->sample_map);
     free(args->merge_method_str.s);
 }
@@ -3072,6 +3094,7 @@ static void annotate(args_t *args, bcf1_t *line)
         for (j=0; j<args->ncols; j++) args->cols[j].done = 0;
         if ( regidx_overlap(args->tgt_idx, bcf_seqname(args->hdr,line),line->pos,line->pos+line->rlen-1, args->tgt_itr) )
         {
+            hts_pos_t vcf_end = line->pos + line->rlen - 1;
             while ( regitr_overlap(args->tgt_itr) )
             {
                 annot_line_t *tmp = &args->alines[0];
@@ -3082,7 +3105,7 @@ static void annotate(args_t *args, bcf1_t *line)
                 // Check min overlap
                 int len_ann = tmp->end - tmp->start + 1;
                 int len_vcf = line->rlen;
-                int isec = (tmp->end < line->pos+line->rlen-1 ? tmp->end : line->pos+line->rlen-1) - (tmp->start > line->pos ? tmp->start : line->pos) + 1;
+                int isec = (tmp->end < vcf_end ? tmp->end : vcf_end) - (tmp->start > line->pos ? tmp->start : line->pos) + 1;
                 assert( isec > 0 );
                 if ( args->min_overlap_ann && args->min_overlap_ann > (float)isec/len_ann ) continue;
                 if ( args->min_overlap_vcf && args->min_overlap_vcf > (float)isec/len_vcf ) continue;
@@ -3096,9 +3119,9 @@ static void annotate(args_t *args, bcf1_t *line)
                         error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
                     if ( ret==0 )
                         args->cols[j].done = 1;
+                    has_overlap = 1;
                 }
             }
-            has_overlap = 1;
         }
         for (j=0; j<args->ncols; j++)
         {
@@ -3273,6 +3296,8 @@ static void annotate(args_t *args, bcf1_t *line)
 
     if ( args->mark_sites )
     {
+        if ( !args->targets_fname ) has_overlap = 1;
+
         // ideally, we'd like to be far more general than this in future, see https://github.com/samtools/bcftools/issues/87
         if ( args->mark_sites_logic==MARK_LISTED )
             bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,has_overlap?1:0);
@@ -3315,6 +3340,7 @@ static void usage(args_t *args)
     fprintf(stderr, "       --single-overlaps           Keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n");
     fprintf(stderr, "   -x, --remove LIST               List of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
     fprintf(stderr, "       --threads INT               Number of extra output compression threads [0]\n");
+    fprintf(stderr, "       --write-index               Automatically index the output files [off]\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "Examples:\n");
     fprintf(stderr, "   http://samtools.github.io/bcftools/howtos/annotate.html\n");
@@ -3371,6 +3397,7 @@ int main_vcfannotate(int argc, char *argv[])
         {"min-overlap",required_argument,NULL,12},
         {"no-version",no_argument,NULL,8},
         {"force",no_argument,NULL,'f'},
+        {"write-index",no_argument,NULL,13},
         {NULL,0,NULL,0}
     };
     char *tmp;
@@ -3447,6 +3474,7 @@ int main_vcfannotate(int argc, char *argv[])
             case 10 : args->single_overlaps = 1; break;
             case 11 : args->rename_annots = optarg; break;
             case 12 : args->min_overlap_str = optarg; break;
+            case 13 : args->write_index = 1; break;
             case '?': usage(args); break;
             default: error("Unknown argument: %s\n", optarg);
         }
diff --git a/vcfcall.c b/vcfcall.c
index 1cd6f504c..d2f6e2c5f 100644
--- a/vcfcall.c
+++ b/vcfcall.c
@@ -1,6 +1,6 @@
 /*  vcfcall.c -- SNP/indel variant calling from VCF/BCF.
 
-    Copyright (C) 2013-2022 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -97,6 +97,8 @@ typedef struct
 
     int argc;
     char **argv;
+    char *index_fn;
+    int write_index;
 
     //  int flag, prior_type, n1, n_sub, *sublist, n_perm;
     //  uint32_t *trio_aux;
@@ -715,6 +717,7 @@ static void init_data(args_t *args)
 
     if (args->record_cmd_line) bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call");
     if ( bcf_hdr_write(args->out_fh, args->aux.hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname);
+    if ( args->write_index && init_index(args->out_fh,args->aux.hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
 
     if ( args->flag&CF_INS_MISSED ) init_missed_line(args);
 }
@@ -753,6 +756,15 @@ static void destroy_data(args_t *args)
     free(args->str.s);
     if ( args->gvcf ) gvcf_destroy(args->gvcf);
     bcf_hdr_destroy(args->aux.hdr);
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(args->out_fh)<0 )
+        {
+            if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
     if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
     bcf_sr_destroy(args->aux.srs);
 }
@@ -908,6 +920,7 @@ static void usage(args_t *args)
     fprintf(stderr, "   -M, --keep-masked-ref           Keep sites with masked reference allele (REF=N)\n");
     fprintf(stderr, "   -V, --skip-variants TYPE        Skip indels/snps\n");
     fprintf(stderr, "   -v, --variants-only             Output variant sites only\n");
+    fprintf(stderr, "       --write-index               Automatically index the output files [off]\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "Consensus/variant calling options:\n");
     fprintf(stderr, "   -c, --consensus-caller          The original calling method (conflicts with -m)\n");
@@ -990,6 +1003,7 @@ int main_vcfcall(int argc, char *argv[])
         {"chromosome-X",no_argument,NULL,'X'},
         {"chromosome-Y",no_argument,NULL,'Y'},
         {"no-version",no_argument,NULL,8},
+        {"write-index",no_argument,NULL,10},
         {NULL,0,NULL,0}
     };
 
@@ -1076,6 +1090,7 @@ int main_vcfcall(int argc, char *argv[])
                 args.regions_overlap = parse_overlap_option(optarg);
                 if ( args.regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg);
                 break;
+            case  10: args.write_index = 1; break;
             default: usage(&args);
         }
     }
diff --git a/vcfconcat.c b/vcfconcat.c
index 74fd036b8..8e25cc590 100644
--- a/vcfconcat.c
+++ b/vcfconcat.c
@@ -1,6 +1,6 @@
 /*  vcfconcat.c -- Concatenate or combine VCF/BCF files.
 
-    Copyright (C) 2013-2021 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -46,6 +46,8 @@ typedef struct _args_t
     int output_type, n_threads, record_cmd_line, clevel;
     bcf_hdr_t *out_hdr;
     int *seen_seq;
+    char *index_fn;
+    int write_index;
 
     // phasing
     int *start_pos, start_tid, ifname;
@@ -59,10 +61,21 @@ typedef struct _args_t
     int argc, nfnames, allow_overlaps, phased_concat, regions_is_file, regions_overlap;
     int compact_PS, phase_set_changed, naive_concat, naive_concat_trust_headers;
     int verbose, explicit_output_type, ligate_force, ligate_warn;
+    int sites_only;
     htsThreadPool *tpool;
 }
 args_t;
 
+static bcf_hdr_t *drop_hdr_genotypes(args_t *args, bcf_hdr_t *hdr)  // returns hdr, or a sample-free replacement when args->sites_only is set
+{
+    if ( !args->sites_only ) return hdr;    // nothing to drop: hand the header back untouched
+    bcf_hdr_t *sites = bcf_hdr_subset(hdr, 0, 0, 0);    // request a copy with zero samples
+    if ( !sites ) error("Error: failed to create a header without samples\n");  // subsetting can fail; never pass NULL on to bcf_hdr_remove
+    bcf_hdr_remove(sites, BCF_HL_FMT, NULL);    // NULL key: remove every FORMAT header line
+    bcf_hdr_destroy(hdr);   // the input header is consumed here; callers must continue with the returned one
+    return sites;
+}
+
 static void init_data(args_t *args)
 {
     bcf1_t *line = NULL;
@@ -83,6 +96,8 @@ static void init_data(args_t *args)
     {
         htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]);
         bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]);
+        hdr = drop_hdr_genotypes(args, hdr);
+
         args->out_hdr = bcf_hdr_merge(args->out_hdr,hdr);
         if ( bcf_hdr_nsamples(hdr) != bcf_hdr_nsamples(args->out_hdr) )
             error("Different number of samples in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]);
@@ -142,6 +157,7 @@ static void init_data(args_t *args)
         hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->tpool);
     }
     if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname);
+    if ( args->write_index && init_index(args->out_fh,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
 
     if ( args->allow_overlaps )
     {
@@ -203,7 +219,16 @@ static void destroy_data(args_t *args)
     int i;
     if ( args->out_fh )
     {
-        if ( hts_close(args->out_fh)!=0 ) error("hts_close error\n");
+        if ( args->write_index )
+        {
+            if ( bcf_idx_save(args->out_fh)<0 )
+            {
+                if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+                error("Error: cannot write to index %s\n", args->index_fn);
+            }
+            free(args->index_fn);
+        }
+        if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n",args->output_fname?args->output_fname:"stdout");
     }
     if ( args->tpool && !args->files )
     {
@@ -264,7 +289,7 @@ static void phased_flush(args_t *args)
         bcf1_t *brec = args->buf[i+1];
 
         int nGTs = bcf_get_genotypes(ahdr, arec, &args->GTa, &args->mGTa);
-        if ( nGTs < 0 ) 
+        if ( nGTs < 0 )
         {
             if ( !gt_absent_warned )
             {
@@ -359,7 +384,7 @@ static void phased_flush(args_t *args)
             bcf_update_format_int32(args->out_hdr,rec,"PQ",args->phase_qual,nsmpl);
             PQ_printed = 1;
             for (j=0; j<nsmpl; j++)
-                if ( args->phase_qual[j] < args->min_PQ ) 
+                if ( args->phase_qual[j] < args->min_PQ )
                 {
                     args->phase_set[j] = rec->pos+1;
                     args->phase_set_changed = 1;
@@ -582,13 +607,14 @@ static void concat(args_t *args)
             {
                 bcf1_t *line = bcf_sr_get_line(args->files,i);
                 if ( !line ) continue;
+                if ( args->sites_only ) bcf_subset(args->out_hdr, line, 0, 0);
                 bcf_translate(args->out_hdr, args->files->readers[i].header, line);
                 if ( bcf_write1(args->out_fh, args->out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
                 if ( args->remove_dups ) break;
             }
         }
     }
-    else    // concatenating
+    else    // concatenate as is
     {
         struct timeval t0, t1;
         kstring_t tmp = {0,0,0};
@@ -604,6 +630,13 @@ static void concat(args_t *args)
             htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("\nFailed to open: %s\n", args->fnames[i]);
             if ( args->n_threads ) hts_set_opt(fp, HTS_OPT_THREAD_POOL, args->tpool);
             bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("\nFailed to parse header: %s\n", args->fnames[i]);
+            if ( args->sites_only )
+            {
+                bcf_hdr_t *hdr_ori = hdr;
+                hdr = bcf_hdr_subset(hdr_ori, 0, 0, 0);
+                bcf_hdr_remove(hdr, BCF_HL_FMT, NULL);
+                bcf_hdr_destroy(hdr_ori);
+            }
             if ( !fp->is_bin && args->output_type&FT_VCF )
             {
                 line->max_unpack = BCF_UN_STR;
@@ -611,6 +644,22 @@ static void concat(args_t *args)
                 while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 )
                 {
                     char *str = fp->line.s;
+
+                    // remove genotypes
+                    if ( args->sites_only )
+                    {
+                        int ntab = 0;
+                        while ( *str )
+                        {
+                            if ( *str == '\t' && ++ntab==8 )
+                            {
+                                *str = 0;
+                                break;
+                            }
+                            str++;
+                        }
+                        str = fp->line.s;
+                    }
                     while ( *str && *str!='\t' ) str++;
                     tmp.l = 0;
                     kputsn(fp->line.s,str-fp->line.s,&tmp);
@@ -639,6 +688,7 @@ static void concat(args_t *args)
                 line->max_unpack = 0;
                 while ( bcf_read(fp, hdr, line)==0 )
                 {
+                    if ( args->sites_only ) bcf_subset(args->out_hdr, line, 0, 0);
                     bcf_translate(args->out_hdr, hdr, line);
 
                     if ( prev_chr_id!=line->rid )
@@ -917,6 +967,7 @@ static void usage(args_t *args)
     fprintf(stderr, "   -d, --rm-dups STRING           Output duplicate records present in multiple files only once: <snps|indels|both|all|exact>\n");
     fprintf(stderr, "   -D, --remove-duplicates        Alias for -d exact\n");
     fprintf(stderr, "   -f, --file-list FILE           Read the list of files from a file.\n");
+    fprintf(stderr, "   -G, --drop-genotypes           Drop individual genotype information.\n");
     fprintf(stderr, "   -l, --ligate                   Ligate phased VCFs by matching phase at overlapping haplotypes\n");
     fprintf(stderr, "       --ligate-force             Ligate even non-overlapping chunks, keep all sites\n");
     fprintf(stderr, "       --ligate-warn              Drop sites in imperfect overlaps\n");
@@ -931,6 +982,7 @@ static void usage(args_t *args)
     fprintf(stderr, "       --regions-overlap 0|1|2    Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
     fprintf(stderr, "       --threads INT              Use multithreading with <int> worker threads [0]\n");
     fprintf(stderr, "   -v, --verbose 0|1              Set verbosity level [1]\n");
+    fprintf(stderr, "       --write-index              Automatically index the output files [off]\n");
     fprintf(stderr, "\n");
     exit(1);
 }
@@ -969,10 +1021,12 @@ int main_vcfconcat(int argc, char *argv[])
         {"file-list",required_argument,NULL,'f'},
         {"min-PQ",required_argument,NULL,'q'},
         {"no-version",no_argument,NULL,8},
+        {"write-index",no_argument,NULL,13},
+        {"drop-genotypes",no_argument,NULL,'G'},
         {NULL,0,NULL,0}
     };
     char *tmp;
-    while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:cnv:",loptions,NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:Gr:R:cnv:",loptions,NULL)) >= 0)
     {
         switch (c) {
             case 'c': args->compact_PS = 1; break;
@@ -980,7 +1034,7 @@ int main_vcfconcat(int argc, char *argv[])
             case 'R': args->regions_list = optarg; args->regions_is_file = 1; break;
             case 'd': args->remove_dups = optarg; break;
             case 'D': args->remove_dups = "exact"; break;
-            case 'q': 
+            case 'q':
                 args->min_PQ = strtol(optarg,&tmp,10);
                 if ( *tmp ) error("Could not parse argument: --min-PQ %s\n", optarg);
                 break;
@@ -988,6 +1042,7 @@ int main_vcfconcat(int argc, char *argv[])
             case 'a': args->allow_overlaps = 1; break;
             case 'l': args->phased_concat = 1; break;
             case 'f': args->file_list = optarg; break;
+            case 'G': args->sites_only = 1; break;
             case 'o': args->output_fname = optarg; break;
             case 'O':
                 args->explicit_output_type = 1;
@@ -1021,6 +1076,7 @@ int main_vcfconcat(int argc, char *argv[])
                       args->verbose = strtol(optarg, &tmp, 0);
                       if ( *tmp || args->verbose<0 || args->verbose>1 ) error("Error: currently only --verbose 0 or --verbose 1 is supported\n");
                       break;
+            case 13 : args->write_index = 1; break;
             case 'h':
             case '?': usage(args); break;
             default: error("Unknown argument: %s\n", optarg);
@@ -1035,6 +1091,7 @@ int main_vcfconcat(int argc, char *argv[])
     }
     if ( args->ligate_force && args->ligate_warn ) error("The options cannot be combined: --ligate-force and --ligate-warn\n");
     if ( args->allow_overlaps && args->phased_concat ) error("The options -a and -l should not be combined. Please run with -l only.\n");
+    if ( args->sites_only && args->phased_concat ) error("The options --drop-genotypes and --ligate cannot be combined\n");
     if ( args->compact_PS && !args->phased_concat ) error("The -c option is intended only with -l\n");
     if ( args->file_list )
     {
@@ -1049,6 +1106,7 @@ int main_vcfconcat(int argc, char *argv[])
     {
         if ( args->allow_overlaps ) error("The option --naive cannot be combined with --allow-overlaps\n");
         if ( args->phased_concat ) error("The option --naive cannot be combined with --ligate\n");
+        if ( args->sites_only ) error("The option --naive cannot be combined with --drop-genotypes\n");
         naive_concat(args);
         destroy_data(args);
         free(args);
diff --git a/vcfconvert.c b/vcfconvert.c
index ce5ed9981..76c4a325a 100644
--- a/vcfconvert.c
+++ b/vcfconvert.c
@@ -1,6 +1,6 @@
 /*  vcfconvert.c -- convert between VCF/BCF and related formats.
 
-    Copyright (C) 2013-2021 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -59,7 +59,7 @@ struct _args_t
     bcf_hdr_t *header;
     void (*convert_func)(struct _args_t *);
     struct {
-        int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing; 
+        int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing, written;
     } n;
     kstring_t str;
     int32_t *gts;
@@ -70,6 +70,11 @@ struct _args_t
     char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns;
     char *outfname, *infname, *ref_fname, *sex_fname;
     int argc, n_threads, record_cmd_line, keep_duplicates, clevel;
+    char *index_fn;
+    int write_index;
+    struct {
+        kstring_t ref,alt,refalt;
+    } tsv;
 };
 
 static void destroy_data(args_t *args)
@@ -139,6 +144,36 @@ static void open_vcf(args_t *args, const char *format_str)
     free(samples);
 }
 
+// Build the "REF[,ALT]" string from the buffered TSV fields, store it in rec and empty the buffers.
+// Returns the status of bcf_update_alleles_str() so that a failed allele update reaches the caller.
+static int _set_ref_alt(args_t *args, bcf1_t *rec)
+{
+    args->tsv.refalt.l = 0;
+    kputs(args->tsv.ref.s, &args->tsv.refalt);
+    if ( strcmp(".",args->tsv.alt.s) && strcmp(args->tsv.ref.s,args->tsv.alt.s) )   // ALT is appended unless it is "." or equals REF
+    {
+        kputc(',', &args->tsv.refalt);
+        kputs(args->tsv.alt.s, &args->tsv.refalt);
+    }
+    int ret = bcf_update_alleles_str(args->header, rec, args->tsv.refalt.s);
+    args->tsv.ref.l = args->tsv.alt.l = args->tsv.refalt.l = 0;    // reset even on failure so the next row starts clean
+    return ret;
+}
+static int tsv_setter_ref(tsv_t *tsv, bcf1_t *rec, void *usr)   // tsv_register() callback for the REF column
+{
+    args_t *args = (args_t*) usr;
+    args->tsv.ref.l = 0;    // discard a REF left over from a row that never reached its ALT column
+    kputsn(tsv->ss,tsv->se - tsv->ss,&args->tsv.ref);
+    return args->tsv.alt.l ? _set_ref_alt(args,rec) : 0;    // commit only once both fields are buffered
+}
+static int tsv_setter_alt(tsv_t *tsv, bcf1_t *rec, void *usr)   // tsv_register() callback for the ALT column
+{
+    args_t *args = (args_t*) usr;
+    args->tsv.alt.l = 0;    // discard an ALT left over from a row that never reached its REF column
+    kputsn(tsv->ss,tsv->se - tsv->ss,&args->tsv.alt);
+    return args->tsv.ref.l ? _set_ref_alt(args,rec) : 0;    // commit only once both fields are buffered
+}
+
 // Try to set CHROM:POS_REF_ALT[_END]. Return 0 on success, -1 on error
 static int _set_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
 {
@@ -160,7 +195,7 @@ static int _set_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
     // REF,ALT
     args->str.l = 0;
     se = ++ss;
-    while ( se < tsv->se && *se!='_' ) se++; 
+    while ( se < tsv->se && *se!='_' ) se++;
     if ( *se!='_' ) return -1;
     kputsn(ss,se-ss,&args->str);
     ss = ++se;
@@ -269,12 +304,12 @@ static int tsv_setter_gt_gp(tsv_t *tsv, bcf1_t *rec, void *usr)
         if ( aa >= ab )
         {
             if ( aa >= bb ) args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(0);
-            else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1); 
+            else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1);
         }
-        else if ( ab >= bb ) 
+        else if ( ab >= bb )
         {
             args->gts[2*i+0] = bcf_gt_unphased(0);
-            args->gts[2*i+1] = bcf_gt_unphased(1); 
+            args->gts[2*i+1] = bcf_gt_unphased(1);
         }
         else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1);
     }
@@ -293,7 +328,7 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr)
     else { a0 = bcf_gt_phased(0); a1 = bcf_gt_phased(1); }
 
     // up is short for "unphased"
-    int nup = 0; 
+    int nup = 0;
     for (i=0; i<nsamples; i++)
     {
         char *ss = tsv->ss + 4*i + nup;
@@ -324,11 +359,11 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr)
                 break;
             default :
                 fprintf(stderr,"Could not parse: [%c][%s]\n", ss[all*2+up],tsv->ss);
-                return -1; 
+                return -1;
             }
             if( ss[all*2+up+1]=='*' ) up = up + 1;
         }
-        
+
         if(up && up != 2)
         {
             fprintf(stderr,"Missing unphased marker '*': [%c][%s]", ss[2+up], tsv->ss);
@@ -356,13 +391,13 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr)
 static void gensample_to_vcf(args_t *args)
 {
     /*
-     *  Inpute: IMPUTE2 output (indentation changed here for clarity): 
+     *  Input: IMPUTE2 output (indentation changed here for clarity):
      *
      *      20:62116619_C_T 20:62116619     62116619 C T 0.969 0.031 0 ...
      *      ---             20:62116698_C_A 62116698 C A 1     0     0 ...
      *
      *  Second column is expected in the form of CHROM:POS_REF_ALT. We use second
-     *  column because the first can be empty ("--") when filling sites from reference 
+     *  column because the first can be empty ("--") when filling sites from reference
      *  panel. When the option --vcf-ids is given, the first column is used to set the
      *  VCF ID.
      *
@@ -455,6 +490,7 @@ static void gensample_to_vcf(args_t *args)
     if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
     if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
     if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
+    if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
     bcf1_t *rec = bcf_init();
 
     nsamples -= 2;
@@ -474,6 +510,15 @@ static void gensample_to_vcf(args_t *args)
     }
     while ( hts_getline(gen_fh, KS_SEP_LINE, &line)>0 );
 
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(out_fh)<0 )
+        {
+            if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
     if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
     if ( hts_close(gen_fh) ) error("Close failed: %s\n", gen_fname);
     bcf_hdr_destroy(args->header);
@@ -589,6 +634,7 @@ static void haplegendsample_to_vcf(args_t *args)
     if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
     if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
     if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
+    if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
     bcf1_t *rec = bcf_init();
 
     args->gts = (int32_t *) malloc(sizeof(int32_t)*nsamples*2);
@@ -616,6 +662,15 @@ static void haplegendsample_to_vcf(args_t *args)
         }
     }
 
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(out_fh)<0 )
+        {
+            if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
     if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
     if ( hts_close(hap_fh) ) error("Close failed: %s\n", hap_fname);
     if ( hts_close(leg_fh) ) error("Close failed: %s\n", leg_fname);
@@ -731,6 +786,7 @@ static void hapsample_to_vcf(args_t *args)
     if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
     if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
     if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
+    if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
     bcf1_t *rec = bcf_init();
 
     nsamples -= 2;
@@ -749,6 +805,15 @@ static void hapsample_to_vcf(args_t *args)
     }
     while ( hts_getline(hap_fh, KS_SEP_LINE, &line)>0 );
 
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(out_fh)<0 )
+        {
+            if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
     if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
     if ( hts_close(hap_fh) ) error("Close failed: %s\n", hap_fname);
     bcf_hdr_destroy(args->header);
@@ -784,7 +849,7 @@ char *init_sample2sex(bcf_hdr_t *hdr, char *sex_fname)
     }
     for (i=0; i<nlines; i++) free(lines[i]);
     free(lines);
-    for (i=0; i<bcf_hdr_nsamples(hdr); i++) 
+    for (i=0; i<bcf_hdr_nsamples(hdr); i++)
         if ( !sample2sex[i] ) error("Missing sex for sample %s in %s\n", bcf_hdr_int2id(hdr, BCF_DT_SAMPLE, i),sex_fname);
     return sample2sex;
 }
@@ -847,7 +912,7 @@ static void vcf_to_gensample(args_t *args)
     if (sample_fname) fprintf(stderr, "Sample file: %s\n", sample_fname);
 
     // write samples file
-    if (sample_fname) 
+    if (sample_fname)
     {
         char *sample2sex = NULL;
         if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
@@ -915,7 +980,7 @@ static void vcf_to_gensample(args_t *args)
             nok++;
         }
     }
-    fprintf(stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n", 
+    fprintf(stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n",
         nok, no_alt+non_biallelic+filtered+ndup, no_alt, non_biallelic, filtered, ndup);
 
     if ( str.m ) free(str.s);
@@ -976,7 +1041,7 @@ static void vcf_to_haplegendsample(args_t *args)
     {
         char *sample2sex = NULL;
         if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
-        
+
         int i;
         BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
         str.l = 0;
@@ -1078,7 +1143,7 @@ static void vcf_to_hapsample(args_t *args)
         kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT %ID %POS %REF %FIRST_ALT ", &str);
     else
         kputs("%CHROM %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str);
-    
+
     if ( args->hap2dip )
         kputs("%_GT_TO_HAP2\n", &str);
     else
@@ -1213,7 +1278,7 @@ static inline int tsv_setter_aa1(args_t *args, char *ss, char *se, int alleles[]
 {
     if ( se - ss > 2 ) return -1;   // currently only SNPs
 
-    if ( ss[0]=='-' )
+    if ( ss[0]=='-' || ss[0]=='.' )
     {
         // missing GT
         gts[0] = bcf_gt_missing;
@@ -1229,7 +1294,7 @@ static inline int tsv_setter_aa1(args_t *args, char *ss, char *se, int alleles[]
     if ( alleles[a0]<0 ) alleles[a0] = (*nals)++;
     if ( alleles[a1]<0 ) alleles[a1] = (*nals)++;
 
-    gts[0] = bcf_gt_unphased(alleles[a0]); 
+    gts[0] = bcf_gt_unphased(alleles[a0]);
     gts[1] = ss[1] ? bcf_gt_unphased(alleles[a1]) : bcf_int32_vector_end;
 
     if ( ref==a0 && ref==a1  ) args->n.hom_rr++;    // hom ref: RR
@@ -1265,7 +1330,7 @@ static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr)
         }
         ret = tsv_setter_aa1(args, tsv->ss, tsv->se, alleles, &nals, iref, args->gts+i*2);
         if ( ret==-1 ) error("Error parsing the site %s:%"PRId64", expected two characters\n", bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1);
-        if ( ret==-2 ) 
+        if ( ret==-2 )
         {
             // something else than a SNP
             free(ref);
@@ -1275,7 +1340,7 @@ static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr)
 
     args->str.l = 0;
     kputc(ref[0], &args->str);
-    for (i=0; i<5; i++) 
+    for (i=0; i<5; i++)
     {
         if ( alleles[i]>0 )
         {
@@ -1293,7 +1358,6 @@ static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr)
 static void tsv_to_vcf(args_t *args)
 {
     if ( !args->ref_fname ) error("--tsv2vcf requires the --fasta-ref option\n");
-    if ( !args->sample_list ) error("--tsv2vcf requires the --samples option\n");
 
     args->ref = fai_load(args->ref_fname);
     if ( !args->ref ) error("Could not load the reference %s\n", args->ref_fname);
@@ -1303,17 +1367,21 @@ static void tsv_to_vcf(args_t *args)
     bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
     if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
 
-    int i, n;
-    char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n);
-    if ( !smpls ) error("Could not parse %s\n", args->sample_list);
-    for (i=0; i<n; i++)
+    int i, nsmpl;
+    char **smpl;
+    if ( args->sample_list )
     {
-        bcf_hdr_add_sample(args->header, smpls[i]);
-        free(smpls[i]);
+        smpl = hts_readlist(args->sample_list, args->sample_is_file, &nsmpl);
+        if ( !smpl ) error("Could not parse %s\n", args->sample_list);
+        for (i=0; i<nsmpl; i++)
+        {
+            bcf_hdr_add_sample(args->header, smpl[i]);
+            free(smpl[i]);
+        }
+        free(smpl);
+        bcf_hdr_add_sample(args->header, NULL);
+        args->gts = (int32_t *) malloc(sizeof(int32_t)*nsmpl*2);
     }
-    free(smpls);
-    bcf_hdr_add_sample(args->header, NULL);
-    args->gts = (int32_t *) malloc(sizeof(int32_t)*n*2);
 
     char wmode[8];
     set_wmode(wmode,args->output_type,args->outfname,args->clevel);
@@ -1321,12 +1389,18 @@ static void tsv_to_vcf(args_t *args)
     if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
     if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
     if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
+    if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
 
     tsv_t *tsv = tsv_init(args->columns ? args->columns : "ID,CHROM,POS,AA");
     if ( tsv_register(tsv, "CHROM", tsv_setter_chrom, args->header) < 0 ) error("Expected CHROM column\n");
     if ( tsv_register(tsv, "POS", tsv_setter_pos, NULL) < 0 ) error("Expected POS column\n");
     if ( tsv_register(tsv, "ID", tsv_setter_id, args->header) < 0 && !args->columns ) error("Expected ID column\n");
-    if ( tsv_register(tsv, "AA", tsv_setter_aa, args) < 0 ) error("Expected AA column\n");
+    if ( tsv_register(tsv, "AA", tsv_setter_aa, args) < 0 )
+    {
+        if ( args->sample_list ) error("Expected AA column with -s/-S\n");
+        if ( tsv_register(tsv, "REF", tsv_setter_ref, args) < 0 || tsv_register(tsv, "ALT", tsv_setter_alt, args) < 0 )
+            error("Expected REF and ALT columns when AA was not given\n");
+    }
 
     bcf1_t *rec = bcf_init();
     bcf_float_set_missing(rec->qual);
@@ -1343,6 +1417,7 @@ static void tsv_to_vcf(args_t *args)
         if ( !tsv_parse(tsv, rec, line.s) )
         {
             if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
+            args->n.written++;
         }
         else
             args->n.skipped++;
@@ -1350,20 +1425,36 @@ static void tsv_to_vcf(args_t *args)
     if ( hts_close(in_fh) ) error("Close failed: %s\n", args->infname);
     free(line.s);
 
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(out_fh)<0 )
+        {
+            if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
     bcf_hdr_destroy(args->header);
     if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname);
     tsv_destroy(tsv);
     bcf_destroy(rec);
     free(args->str.s);
     free(args->gts);
+    free(args->tsv.ref.s);
+    free(args->tsv.alt.s);
+    free(args->tsv.refalt.s);
 
     fprintf(stderr,"Rows total: \t%d\n", args->n.total);
     fprintf(stderr,"Rows skipped: \t%d\n", args->n.skipped);
-    fprintf(stderr,"Missing GTs: \t%d\n", args->n.missing);
-    fprintf(stderr,"Hom RR: \t%d\n", args->n.hom_rr);
-    fprintf(stderr,"Het RA: \t%d\n", args->n.het_ra);
-    fprintf(stderr,"Hom AA: \t%d\n", args->n.hom_aa);
-    fprintf(stderr,"Het AA: \t%d\n", args->n.het_aa);
+    fprintf(stderr,"Sites written: \t%d\n", args->n.written);
+    if ( args->sample_list )
+    {
+        fprintf(stderr,"Missing GTs: \t%d\n", args->n.missing);
+        fprintf(stderr,"Hom RR: \t%d\n", args->n.hom_rr);
+        fprintf(stderr,"Het RA: \t%d\n", args->n.het_ra);
+        fprintf(stderr,"Hom AA: \t%d\n", args->n.hom_aa);
+        fprintf(stderr,"Het AA: \t%d\n", args->n.het_aa);
+    }
 }
 
 static void vcf_to_vcf(args_t *args)
@@ -1377,6 +1468,7 @@ static void vcf_to_vcf(args_t *args)
 
     bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
     if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
+    if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
 
     while ( bcf_sr_next_line(args->files) )
     {
@@ -1389,6 +1481,15 @@ static void vcf_to_vcf(args_t *args)
         }
         if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
     }
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(out_fh)<0 )
+        {
+            if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
     if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname);
 }
 
@@ -1409,6 +1510,7 @@ static void gvcf_to_vcf(args_t *args)
     bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
     if (args->record_cmd_line) bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert");
     if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
+    if ( args->write_index && init_index(out_fh,hdr,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
 
     int32_t *itmp = NULL, nitmp = 0;
 
@@ -1419,7 +1521,7 @@ static void gvcf_to_vcf(args_t *args)
         {
             int pass = filter_test(args->filter, line, NULL);
             if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
-            if ( !pass ) 
+            if ( !pass )
             {
                 if ( bcf_write(out_fh,hdr,line)!=0  ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
                 continue;
@@ -1469,6 +1571,15 @@ static void gvcf_to_vcf(args_t *args)
         }
     }
     free(itmp);
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(out_fh)<0 )
+        {
+            if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
     if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname);
 }
 
@@ -1497,6 +1608,7 @@ static void usage(void)
     fprintf(stderr, "   -o, --output FILE              Output file name [stdout]\n");
     fprintf(stderr, "   -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
     fprintf(stderr, "       --threads INT              Use multithreading with INT worker threads [0]\n");
+    fprintf(stderr, "       --write-index              Automatically index the output files [off]\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n");
     fprintf(stderr, "   -G, --gensample2vcf ...        <PREFIX>|<GEN-FILE>,<SAMPLE-FILE>\n");
@@ -1528,7 +1640,7 @@ static void usage(void)
     fprintf(stderr, "\n");
     fprintf(stderr, "TSV conversion:\n");
     fprintf(stderr, "       --tsv2vcf FILE\n");
-    fprintf(stderr, "   -c, --columns STRING           Columns of the input tsv file [ID,CHROM,POS,AA]\n");
+    fprintf(stderr, "   -c, --columns STRING           Columns of the input tsv file, see man page for details [ID,CHROM,POS,AA]\n");
     fprintf(stderr, "   -f, --fasta-ref FILE           Reference sequence in fasta format\n");
     fprintf(stderr, "   -s, --samples LIST             List of sample names\n");
     fprintf(stderr, "   -S, --samples-file FILE        File of sample names\n");
@@ -1590,6 +1702,7 @@ int main_vcfconvert(int argc, char *argv[])
         {"fasta-ref",required_argument,NULL,'f'},
         {"no-version",no_argument,NULL,10},
         {"keep-duplicates",no_argument,NULL,12},
+        {"write-index",no_argument,NULL,16},
         {NULL,0,NULL,0}
     };
     char *tmp;
@@ -1618,6 +1731,7 @@ int main_vcfconvert(int argc, char *argv[])
             case  7 : args->convert_func = vcf_to_hapsample; args->outfname = optarg; break;
             case  8 : error("The --chrom option has been deprecated, please use --3N6 instead\n"); break;
             case 15 : args->gen_3N6 = 1; break;
+            case 16 : args->write_index = 1; break;
             case 'H': args->convert_func = haplegendsample_to_vcf; args->infname = optarg; break;
             case 'f': args->ref_fname = optarg; break;
             case 'c': args->columns = optarg; break;
@@ -1667,7 +1781,7 @@ int main_vcfconvert(int argc, char *argv[])
         else args->infname = argv[optind];
     }
     if ( !args->infname ) usage();
-    
+
     if ( args->convert_func ) args->convert_func(args);
     else vcf_to_vcf(args);
 
diff --git a/vcffilter.c b/vcffilter.c
index 68d867247..8665409d1 100644
--- a/vcffilter.c
+++ b/vcffilter.c
@@ -1,6 +1,6 @@
 /*  vcffilter.c -- Apply fixed-threshold filters.
 
-    Copyright (C) 2013-2022 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -77,6 +77,8 @@ typedef struct _args_t
     char **argv, *output_fname, *targets_list, *regions_list, *mask_list;
     int argc, record_cmd_line, mask_is_file, mask_overlap, mask_negate;
     regidx_t *mask;
+    char *index_fn;
+    int write_index;
 }
 args_t;
 
@@ -491,6 +493,7 @@ static void usage(args_t *args)
     fprintf(stderr, "    -T, --targets-file FILE        Similar to -R but streams rather than index-jumps\n");
     fprintf(stderr, "        --targets-overlap 0|1|2    Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
     fprintf(stderr, "        --threads INT              Use multithreading with <int> worker threads [0]\n");
+    fprintf(stderr, "        --write-index              Automatically index the output files [off]\n");
     fprintf(stderr, "\n");
     exit(1);
 }
@@ -533,13 +536,14 @@ int main_vcffilter(int argc, char *argv[])
         {"SnpGap",required_argument,NULL,'g'},
         {"IndelGap",required_argument,NULL,'G'},
         {"no-version",no_argument,NULL,8},
+        {"write-index",no_argument,NULL,12},
         {NULL,0,NULL,0}
     };
     char *tmp;
-    while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:o:O:g:G:S:",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:M:o:O:g:G:S:",loptions,NULL)) >= 0) {
         switch (c) {
             case 'g':
-                args->snp_gap = strtol(optarg,&tmp,10); 
+                args->snp_gap = strtol(optarg,&tmp,10);
                 if ( *tmp && *tmp!=':' ) error("Could not parse argument: --SnpGap %s\n", optarg);
                 if ( *tmp==':' )
                 {
@@ -625,6 +629,7 @@ int main_vcffilter(int argc, char *argv[])
                 else if ( !strcasecmp(optarg,"2") ) args->mask_overlap = 2;
                 else error("Could not parse: --mask-overlap %s\n",optarg);
                 break;
+            case  12 : args->write_index = 1; break;
             case 'h':
             case '?': usage(args); break;
             default: error("Unknown argument: %s\n", optarg);
@@ -672,6 +677,7 @@ int main_vcffilter(int argc, char *argv[])
 
     init_data(args);
     if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname);
+    if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
     while ( bcf_sr_next_line(args->files) )
     {
         bcf1_t *line = bcf_sr_get_line(args->files, 0);
@@ -713,7 +719,15 @@ int main_vcffilter(int argc, char *argv[])
         }
     }
     buffered_filters(args, NULL);
-
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(args->out_fh)<0 )
+        {
+            if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
     if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
     destroy_data(args);
     bcf_sr_destroy(args->files);
diff --git a/vcfgtcheck.c b/vcfgtcheck.c
index f646e1f6d..561be62a5 100644
--- a/vcfgtcheck.c
+++ b/vcfgtcheck.c
@@ -1,6 +1,6 @@
 /*  vcfgtcheck.c -- Check sample identity.
 
-    Copyright (C) 2013-2021 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -59,6 +59,7 @@ typedef struct
     int argc, gt_samples_is_file, qry_samples_is_file, regions_is_file, targets_is_file, pair_samples_is_file;
     int regions_overlap, targets_overlap;
     int qry_use_GT,gt_use_GT, nqry_smpl,ngt_smpl, *qry_smpl,*gt_smpl;
+    int nused[2][2];
     double *pdiff, *qry_prob, *gt_prob;
     uint32_t *ndiff,*ncnt,ncmp, npairs;
     int32_t *qry_arr,*gt_arr, nqry_arr,ngt_arr;
@@ -309,7 +310,7 @@ static void init_data(args_t *args)
         init_samples(args->qry_samples, args->qry_samples_is_file, &args->qry_smpl, &args->nqry_smpl, args->qry_hdr, args->qry_fname);
     }
     if ( args->gt_samples )
-    {   
+    {
         init_samples(args->gt_samples, args->gt_samples_is_file, &args->gt_smpl, &args->ngt_smpl,
             args->gt_hdr ? args->gt_hdr : args->qry_hdr,
             args->gt_fname ? args->gt_fname : args->qry_fname);
@@ -377,7 +378,7 @@ static void init_data(args_t *args)
         args->gt_prob  = args->cross_check ? args->qry_prob : (double*) malloc(3*args->ngt_smpl*sizeof(*args->gt_prob));
 
         // dsg2prob: the first index is bitmask of 8 possible dsg combinations (only 1<<0,1<<2,1<<3 are set, accessing
-        // anything else indicated an error, this is just to reuse gt_to_dsg()); the second index are the corresponding 
+        // anything else indicated an error, this is just to reuse gt_to_dsg()); the second index are the corresponding
         // probabilities of 0/0, 0/1, and 1/1 genotypes
         for (i=0; i<8; i++)
             for (j=0; j<3; j++)
@@ -555,7 +556,9 @@ static void process_line(args_t *args)
         args->gt_arr = args->qry_arr;
     }
 
+    // stats: number of compared sites, and used tags
     args->ncmp++;
+    args->nused[qry_use_GT][gt_use_GT]++;
 
     double af,hwe_dsg[8];
     if ( args->calc_hwe_prob )
@@ -636,7 +639,7 @@ static void process_line(args_t *args)
                 gt_dsg = gt_use_GT ? gt_to_prob(args,ptr,gt_prob) : pl_to_prob(args,ptr,gt_prob);
                 if ( !gt_dsg ) continue;                        // missing value
                 if ( args->hom_only && !(gt_dsg&5) ) continue;  // not a hom
-               
+
                 ptr = args->qry_arr + args->pairs[i].iqry*nqry1;
                 qry_dsg = qry_use_GT ? gt_to_prob(args,ptr,qry_prob) : pl_to_prob(args,ptr,qry_prob);
                 if ( !qry_dsg ) continue;                       // missing value
@@ -797,11 +800,15 @@ static void report(args_t *args)
     fprintf(args->fp,"INFO\tsites-skipped-no-data\t%u\n",args->nskip_no_data);
     fprintf(args->fp,"INFO\tsites-skipped-GT-not-diploid\t%u\n",args->nskip_dip_GT);
     fprintf(args->fp,"INFO\tsites-skipped-PL-not-diploid\t%u\n",args->nskip_dip_PL);
+    fprintf(args->fp,"INFO\tsites-used-PL-vs-PL\t%u\n",args->nused[0][0]);
+    fprintf(args->fp,"INFO\tsites-used-PL-vs-GT\t%u\n",args->nused[0][1]);
+    fprintf(args->fp,"INFO\tsites-used-GT-vs-PL\t%u\n",args->nused[1][0]);
+    fprintf(args->fp,"INFO\tsites-used-GT-vs-GT\t%u\n",args->nused[1][1]);
     fprintf(args->fp,"# DC, discordance:\n");
     fprintf(args->fp,"#     - query sample\n");
     fprintf(args->fp,"#     - genotyped sample\n");
-    fprintf(args->fp,"#     - discordance (number of mismatches; smaller is better)\n");
-    fprintf(args->fp,"#     - negative log of HWE probability at matching sites (rare genotypes mataches are more informative, bigger is better)\n");
+    fprintf(args->fp,"#     - discordance (either an abstract score or number of mismatches, see -e/-u in the man page for details; smaller is better)\n");
+    fprintf(args->fp,"#     - negative log of HWE probability at matching sites (rare genotypes matches are more informative, bigger is better)\n");
     fprintf(args->fp,"#     - number of sites compared (bigger is better)\n");
     fprintf(args->fp,"#DC\t[2]Query Sample\t[3]Genotyped Sample\t[4]Discordance\t[5]-log P(HWE)\t[6]Number of sites compared\n");
 
@@ -1023,7 +1030,7 @@ static int is_input_okay(args_t *args, int nmatch)
     return 1;
 
 not_okay:
-    fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", %s. (This is printed only once.)\n", 
+    fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", %s. (This is printed only once.)\n",
         bcf_seqname(hdr,rec),rec->pos+1,msg);
     return 0;
 }
@@ -1097,7 +1104,7 @@ int main_vcfgtcheck(int argc, char *argv[])
     args->es_max_mem = strdup("500M");
 
     // In simulated sample swaps the minimum error was 0.3 and maximum intra-sample error was 0.23
-    //    - min_inter: pairs with smaller err value will be considered identical 
+    //    - min_inter: pairs with smaller err value will be considered identical
     //    - max_intra: pairs with err value bigger than abs(max_intra_err) will be considered
     //                  different. If negative, the cutoff may be heuristically lowered
     args->min_inter_err =  0.23;
@@ -1169,7 +1176,7 @@ int main_vcfgtcheck(int argc, char *argv[])
             case 3 : args->calc_hwe_prob = 0; break;
             case 4 : error("The option -S, --target-sample has been deprecated\n"); break;
             case 5 : args->dry_run = 1; break;
-            case 6 : 
+            case 6 :
                 args->distinctive_sites = strtod(optarg,&tmp);
                 if ( *tmp )
                 {
@@ -1202,7 +1209,7 @@ int main_vcfgtcheck(int argc, char *argv[])
                 else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4;
                 else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg);
                 break;
-            case 'S': 
+            case 'S':
                 if ( !strncasecmp("gt:",optarg,3) ) args->gt_samples = optarg+3, args->gt_samples_is_file = 1;
                 else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4, args->qry_samples_is_file = 1;
                 else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg);
diff --git a/vcfisec.c b/vcfisec.c
index a755a85b4..4ee29b4c8 100644
--- a/vcfisec.c
+++ b/vcfisec.c
@@ -1,6 +1,6 @@
 /*  vcfisec.c -- Create intersections, unions and complements of VCF files.
 
-    Copyright (C) 2012-2022 Genome Research Ltd.
+    Copyright (C) 2012-2023 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -60,6 +60,8 @@ typedef struct
     char **argv, *prefix, *output_fname, **fnames, *write_files, *targets_list, *regions_list;
     char *isec_exact;
     int argc, record_cmd_line;
+    char *index_fn;
+    int write_index;
 }
 args_t;
 
@@ -148,6 +150,8 @@ void isec_vcf(args_t *args)
         if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
         if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec");
         if ( bcf_hdr_write(out_fh, files->readers[args->iwrite].header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
+        if ( args->write_index && init_index(out_fh,files->readers[args->iwrite].header,args->output_fname,&args->index_fn)<0 )
+            error("Error: failed to initialise index for %s\n",args->output_fname?args->output_fname:"standard output");
     }
     if ( !args->nwrite && !out_std && !args->prefix )
         fprintf(stderr,"Note: -w option not given, printing list of sites...\n");
@@ -253,7 +257,19 @@ void isec_vcf(args_t *args)
         }
     }
     if ( str.s ) free(str.s);
-    if ( out_fh && hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname? args->output_fname : "-");
+    if ( out_fh )
+    {
+        if ( args->write_index )
+        {
+            if ( bcf_idx_save(out_fh)<0 )
+            {
+                if ( hts_close(out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+                error("Error: cannot write to index %s\n", args->index_fn);
+            }
+            free(args->index_fn);
+        }
+        if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname? args->output_fname : "-");
+    }
 }
 
 static void add_filter(args_t *args, char *expr, int logic)
@@ -481,6 +497,7 @@ static void usage(void)
     fprintf(stderr, "        --targets-overlap 0|1|2    Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
     fprintf(stderr, "        --threads INT              Use multithreading with <int> worker threads [0]\n");
     fprintf(stderr, "    -w, --write LIST               List of files to write with -p given as 1-based indexes. By default, all files are written\n");
+    fprintf(stderr, "        --write-index              Automatically index the output files [off]\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "Examples:\n");
     fprintf(stderr, "   # Create intersection and complements of two sets saving the output in dir/*\n");
@@ -537,6 +554,7 @@ int main_vcfisec(int argc, char *argv[])
         {"output-type",required_argument,NULL,'O'},
         {"threads",required_argument,NULL,9},
         {"no-version",no_argument,NULL,8},
+        {"write-index",no_argument,NULL,10},
         {NULL,0,NULL,0}
     };
     char *tmp;
@@ -608,6 +626,7 @@ int main_vcfisec(int argc, char *argv[])
                 break;
             case  9 : args->n_threads = strtol(optarg, 0, 0); break;
             case  8 : args->record_cmd_line = 0; break;
+            case 10 : args->write_index = 1; break;
             case 'h':
             case '?': usage(); break;
             default: error("Unknown argument: %s\n", optarg);
diff --git a/vcfmerge.c b/vcfmerge.c
index 621f4102c..87b6b8a39 100644
--- a/vcfmerge.c
+++ b/vcfmerge.c
@@ -1,6 +1,6 @@
 /*  vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file.
 
-    Copyright (C) 2012-2022 Genome Research Ltd.
+    Copyright (C) 2012-2023 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -63,6 +63,19 @@ typedef khash_t(strdict) strdict_t;
 
 #define PL2PROB_MAX 1024
 
+// Rules for merging FORMAT Number=A,G,R vectors with missing values
+#define MERGE_MISSING_DOT   0   // leave as is, i.e. use a missing value "."
+#define MERGE_MISSING_CONST 1   // use a constant value
+#define MERGE_MISSING_MAX   2   // use the existing maximum value
+// One parsed TAG:RULE pair of --missing-rules; the array of rules is sorted by hdr_tag and looked up with bsearch
+typedef struct _missing_rule_t
+{
+    char *hdr_tag;  // FORMAT tag name the rule applies to; strdup'ed, released in missing_rules_destroy()
+    int type;       // one of MERGE_MISSING_*
+    float value;    // the fill-in constant, assigned only when type is MERGE_MISSING_CONST
+}
+missing_rule_t;
+
 // For merging INFO Number=A,G,R tags
 typedef struct
 {
@@ -103,29 +116,37 @@ typedef struct
     int *map;   // mapping from input alleles to the array of output alleles (set by merge_alleles)
     int mmap;   // size of map array (only buffer[i].n_allele is actually used)
     int als_differ;
+    int var_types;  // variant types in this record, shifted by <<1 to account for VCF_REF
 }
 maux1_t;
+
+// Buffered lines for a single reader
 typedef struct
 {
     int rid;        // current rid
     int beg,end;    // valid ranges in reader's buffer [beg,end). Maintained by maux_reset and gvcf_flush.
+    int unkn_allele;// the index of the unknown allele (<*>, <NON_REF>)
     int cur;        // current line or -1 if none
     int mrec;       // allocated size of buf
     maux1_t *rec;   // buffer to keep reader's lines
     bcf1_t **lines; // source buffer: either gvcf or readers' buffer
+    int var_types;  // reader's variant types in the active [beg,end] window
 }
 buffer_t;
 typedef struct
 {
-    int n, pos, var_types;  // number of readers, current position, currently available variant types
+    int n, pos, var_types;  // number of readers; current position; variant types at this position across all available records
+    int *als_types,         // allele type of each output allele
+        mals_types;
     char *chr;              // current chromosome
     char **als, **out_als;  // merged alleles (temp, may contain empty records) and merged alleles ready for output
     int nals, mals, nout_als, mout_als; // size of the output array
     int *cnt, ncnt; // number of records that refer to the alleles
     int *smpl_ploidy, *smpl_nGsize; // ploidy and derived number of values in Number=G tags, updated for each line (todo: cache for missing cases)
+    const char **fmt_key;// temporary short-lived array to store output tag names
     bcf_fmt_t **fmt_map; // i-th output FORMAT field corresponds in j-th reader to i*nreader+j, first row is reserved for GT
     int nfmt_map;        // number of rows in the fmt_map array
-    int *agr_map, nagr_map, magr_map;   // mapping between Number=AGR element indexes
+    int *agr_map, nagr_map, magr_map;   // mapping between Number=AGR element indexes, from src idxs to dst file idxs
     void *tmp_arr;
     size_t ntmp_arr;
     buffer_t *buf;
@@ -156,6 +177,9 @@ typedef struct
     faidx_t *gvcf_fai;
     info_rule_t *rules;
     int nrules;
+    char *missing_rules_str;
+    missing_rule_t *missing_rules;    // lookup for -M, --missing-rules
+    int nmissing_rules;
     strdict_t *tmph;
     kstring_t tmps;
     bcf_srs_t *files;
@@ -166,6 +190,8 @@ typedef struct
     int argc, n_threads, record_cmd_line, clevel;
     int local_alleles;    // the value of -L option
     int keep_AC_AN;
+    char *index_fn;
+    int write_index;
 }
 args_t;
 
@@ -298,6 +324,89 @@ static void info_rules_merge_join(bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rul
     }
 }
 
+static int missing_rules_comp_key2(const void *a, const void *b)
+{
+    // qsort comparator: orders two missing_rule_t elements by their FORMAT tag name
+    const missing_rule_t *lhs = a, *rhs = b;
+    return strcmp(lhs->hdr_tag, rhs->hdr_tag);
+}
+static int missing_rules_comp_key(const void *a, const void *b)
+{
+    // bsearch comparator: `a` is the tag name being searched for, `b` an element of the rules array
+    const char *tag = a;
+    return strcmp(tag, ((const missing_rule_t*) b)->hdr_tag);
+}
+// Parse --missing-rules "TAG:RULE[,TAG:RULE..]" ("-" stands for "PL:.,AD:."; with gVCF and no option "PL:max,AD:0") into args->missing_rules
+static void missing_rules_init(args_t *args)
+{
+    kstring_t str = {0,0,0};    // private copy of the rules string, chopped into NUL-terminated tokens below
+    if ( args->missing_rules_str )
+    {
+        if ( !strcmp("-",args->missing_rules_str) ) kputs("PL:.,AD:.",&str);
+        else kputs(args->missing_rules_str,&str);
+    }
+    else if ( args->do_gvcf ) kputs("PL:max,AD:0",&str);
+    else return;
+    // Pass 1: split the string in place, count the rules and insist that ':' and ',' alternate, ending with a ':' pair
+    args->nmissing_rules = 1;
+    char *ss = str.s, *tmp = ss;
+    int n = 0;
+    while ( *ss )
+    {
+        if ( *ss==':' ) { *ss = 0; n++; if ( n%2==0 ) error("Could not parse --missing-rules: \"%s\"\n", args->missing_rules_str); }
+        else if ( *ss==',' ) { *ss = 0; args->nmissing_rules++; n++; if ( n%2==1 ) error("Could not parse --missing-rules: \"%s\"\n", args->missing_rules_str); }
+        ss++;
+    }
+    if ( n%2==0 ) error("Could not parse --missing-rules: \"%s\"\n", args->missing_rules_str);
+    args->missing_rules = (missing_rule_t*) calloc(args->nmissing_rules,sizeof(missing_rule_t));
+    // Pass 2: walk the TAG,RULE token pairs; n is the number of rules to keep, nmissing_rules the number stored so far
+    n = args->nmissing_rules; args->nmissing_rules = 0;
+    ss = tmp;
+    while ( args->nmissing_rules < n  )
+    {
+        missing_rule_t *rule = &args->missing_rules[args->nmissing_rules];
+        rule->hdr_tag = strdup(ss);
+        int id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, rule->hdr_tag);
+        if ( !bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_FMT,id) )
+        {
+            if ( args->missing_rules_str ) error("The FORMAT tag is not defined in the header: \"%s\"\n", rule->hdr_tag);
+            n--;    // a built-in default rule for a tag absent from the header: silently skip both of its tokens
+            ss = strchr(ss, '\0'); ss++;
+            if ( !*ss ) error("Could not parse --missing-rules, missing logic of \"%s\"\n", rule->hdr_tag);
+            free(rule->hdr_tag); rule->hdr_tag = NULL;  // release only after its last use in the message above
+            ss = strchr(ss, '\0'); ss++;
+            continue;
+        }
+        // The tag is known: step over its name to the rule token, which is ".", "max", or a number
+        ss = strchr(ss, '\0'); ss++;
+        if ( !*ss ) error("Could not parse --missing-rules, missing logic of \"%s\"\n", rule->hdr_tag);
+
+        if ( !strcasecmp(ss,".") ) rule->type = MERGE_MISSING_DOT;
+        else if ( !strcasecmp(ss,"max") ) rule->type = MERGE_MISSING_MAX;
+        else
+        {
+            char *end = ss;     // must stop at the token's NUL, otherwise the number has trailing junk
+            rule->value = strtod(ss, &end);
+            if ( *end ) error("Could not parse --missing-rules: \"%s\"\n", args->missing_rules_str);
+            rule->type = MERGE_MISSING_CONST;
+        }
+        ss = strchr(ss, '\0'); ss++;
+        args->nmissing_rules++;
+    }
+    qsort(args->missing_rules, args->nmissing_rules, sizeof(*args->missing_rules), missing_rules_comp_key2);
+    free(str.s);
+}
+// Free the tag name of every stored rule and then the rules array itself
+static void missing_rules_destroy(args_t *args)
+{
+    int remaining = args->nmissing_rules;
+
+    // the order of deallocation does not matter, walk the array backwards
+    while ( remaining-- > 0 )
+        free(args->missing_rules[remaining].hdr_tag);
+    free(args->missing_rules);
+}
+
 static int info_rules_comp_key2(const void *a, const void *b)
 {
     info_rule_t *rule1 = (info_rule_t*) a;
@@ -770,6 +879,7 @@ void maux_destroy(maux_t *ma)
     int i,j;
     for (i=0; i<ma->nout_smpl; i++) free(ma->str[i].s);
     free(ma->str);
+    free(ma->als_types);
     for (i=0; i<ma->mals; i++)
     {
         free(ma->als[i]);
@@ -793,6 +903,7 @@ void maux_destroy(maux_t *ma)
     free(ma->AGR_info);
     if (ma->ntmp_arr) free(ma->tmp_arr);
     if (ma->nfmt_map) free(ma->fmt_map);
+    free(ma->fmt_key);
     // ma->inf freed in bcf_destroy1
     for (i=0; i<ma->mals; i++) free(ma->als[i]);
     if (ma->mout_als) free(ma->out_als);
@@ -820,7 +931,6 @@ void maux_reset(maux_t *ma, int *rid_tab)
 {
     int i,j;
     for (i=0; i<ma->n; i++) maux_expand1(&ma->buf[i],ma->files->readers[i].nbuffer+1);
-    for (i=0; i<ma->ncnt; i++) ma->cnt[i] = 0;
     for (i=0; i<ma->mals; i++)
     {
         free(ma->als[i]);
@@ -856,6 +966,7 @@ void maux_reset(maux_t *ma, int *rid_tab)
         for (j=ma->buf[i].beg; j<=ma->files->readers[i].nbuffer; j++)
         {
             ma->buf[i].rec[j].skip = 0;
+            ma->buf[i].rec[j].var_types = 0;
             bcf1_t *line = ma->files->readers[i].buffer[j];
             if ( line->rid!=ma->buf[i].rid || line->pos!=ma->pos ) break;
         }
@@ -959,12 +1070,14 @@ void merge_chrom2qual(args_t *args, bcf1_t *out)
         int ir, j;
         for (ir=0; ir<files->nreaders; ir++)
         {
+            ma->buf[ir].unkn_allele = 0;
             bcf1_t *line = maux_get_line(args,ir);
             if ( !line ) continue;
             for (j=1; j<line->n_allele; j++)
             {
                 int irec = ma->buf[ir].cur;
                 if ( ma->buf[ir].rec[irec].map[j]==i ) ma->buf[ir].rec[irec].map[j] = ma->nout_als;
+                if ( bcf_has_variant_type(line,j,VCF_REF) && line->d.allele[j][0]=='<' ) ma->buf[ir].unkn_allele = j;
             }
         }
     }
@@ -1985,7 +2098,7 @@ void merge_localized_numberAR_format_field(args_t *args, bcf_fmt_t **fmt_map, bc
         bcf_update_format_int32(args->out_hdr, out, args->tmps.s, (int32_t*)ma->tmp_arr, nsamples*nsize);
     ma->laa_dirty = 1;
 }
-void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
+void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule, bcf1_t *out)
 {
     bcf_srs_t *files = args->files;
     bcf_hdr_t *out_hdr = args->out_hdr;
@@ -2135,12 +2248,32 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
                         for (l=1; l<nsize; l++) { tgt++; tgt_set_vector_end; } \
                         continue; \
                     } \
-                    int ngsize = ma->smpl_ploidy[ismpl+j]==1 ? out->n_allele : out->n_allele*(out->n_allele + 1)/2; \
-                    for (l=0; l<ngsize; l++) { tgt_set_missing; tgt++; } \
+                    int haploid = ma->smpl_ploidy[ismpl+j]==1 ? 1 : 0; \
+                    int ngsize = haploid ? out->n_allele : out->n_allele*(out->n_allele + 1)/2; \
+                    if ( ma->buf[i].unkn_allele )  /* Use value from the unknown allele when available */ \
+                    {  \
+                        src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+                        int iunkn = haploid ? ma->buf[i].unkn_allele : (ma->buf[i].unkn_allele+1)*(ma->buf[i].unkn_allele + 2)/2 - 1; \
+                        for (l=0; l<ngsize; l++) { *tgt = src[iunkn]; tgt++; } \
+                    } \
+                    else if ( mrule && mrule->type==MERGE_MISSING_CONST ) \
+                    { \
+                        for (l=0; l<ngsize; l++) { *tgt = mrule->value; tgt++; } \
+                    } \
+                    else if ( mrule && mrule->type==MERGE_MISSING_MAX ) \
+                    { \
+                        src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+                        src_type_t max = src[0]; \
+                        for (l=1; l<fmt_ori->n; l++) if ( max < src[l] ) max = src[l]; \
+                        for (l=0; l<ngsize; l++) { *tgt = max; tgt++; } \
+                    } \
+                    else \
+                    { \
+                        for (l=0; l<ngsize; l++) { tgt_set_missing; tgt++; } \
+                    } \
                     for (; l<nsize; l++) { tgt_set_vector_end; tgt++; } \
-                    if ( ma->smpl_ploidy[ismpl+j]==1 ) \
+                    if ( haploid ) \
                     { \
-                        /* Haploid */ \
                         int iori, inew; \
                         for (iori=0; iori<line->n_allele; iori++) \
                         { \
@@ -2194,7 +2327,26 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
                         continue; \
                     } \
                     src = (src_type_t*) (fmt_ori->p + j*fmt_ori->size); \
-                    for (l=0; l<nsize; l++) { tgt_set_missing; tgt++; } \
+                    if ( ma->buf[i].unkn_allele )  /* Use value from the unknown allele when available */ \
+                    { \
+                        int iunkn = ma->buf[i].unkn_allele; \
+                        for (l=0; l<nsize; l++) { *tgt = src[iunkn]; tgt++; } \
+                    } \
+                    else if ( mrule && mrule->type==MERGE_MISSING_CONST ) \
+                    { \
+                        for (l=0; l<nsize; l++) { *tgt = mrule->value; tgt++; } \
+                    } \
+                    else if ( mrule && mrule->type==MERGE_MISSING_MAX ) \
+                    { \
+                        src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+                        src_type_t max = src[0]; \
+                        for (l=1; l<fmt_ori->n; l++) if ( max < src[l] ) max = src[l]; \
+                        for (l=0; l<nsize; l++) { *tgt = max; tgt++; } \
+                    } \
+                    else \
+                    { \
+                        for (l=0; l<nsize; l++) { tgt_set_missing; tgt++; } \
+                    } \
                     int iori,inew; \
                     for (iori=ifrom; iori<line->n_allele; iori++) \
                     { \
@@ -2234,6 +2386,7 @@ void merge_format(args_t *args, bcf1_t *out)
     {
         ma->nfmt_map = 2;
         ma->fmt_map  = (bcf_fmt_t**) calloc(ma->nfmt_map*files->nreaders, sizeof(bcf_fmt_t*));
+        ma->fmt_key  = (const char**) malloc(ma->nfmt_map*sizeof(*ma->fmt_key));
     }
     else
         memset(ma->fmt_map, 0, ma->nfmt_map*files->nreaders*sizeof(bcf_fmt_t**));
@@ -2250,7 +2403,7 @@ void merge_format(args_t *args, bcf1_t *out)
         bcf_hdr_t *hdr = reader->header;
         for (j=0; j<line->n_fmt; j++)
         {
-            // Wat this tag already seen?
+            // Was this tag already seen?
             bcf_fmt_t *fmt = &line->d.fmt[j];
             const char *key = hdr->id[BCF_DT_ID][fmt->id].key;
             kitr = kh_get(strdict, tmph, key);
@@ -2269,9 +2422,11 @@ void merge_format(args_t *args, bcf1_t *out)
                     {
                         ma->fmt_map = (bcf_fmt_t**) realloc(ma->fmt_map, sizeof(bcf_fmt_t*)*(max_ifmt+1)*files->nreaders);
                         memset(ma->fmt_map+ma->nfmt_map*files->nreaders, 0, (max_ifmt-ma->nfmt_map+1)*files->nreaders*sizeof(bcf_fmt_t*));
+                        ma->fmt_key = (const char**) realloc(ma->fmt_key, sizeof(*ma->fmt_key)*(max_ifmt+1));
                         ma->nfmt_map = max_ifmt+1;
                     }
                     if ( key[0]=='P' && key[1]=='L' && key[2]==0  ) { has_PL = ifmt; }
+                    ma->fmt_key[max_ifmt] = key;
                 }
                 kitr = kh_put(strdict, tmph, key, &ret);
                 kh_value(tmph, kitr) = ifmt;
@@ -2298,7 +2453,10 @@ void merge_format(args_t *args, bcf1_t *out)
         update_AN_AC(out_hdr, out);
 
     for (i=1; i<=max_ifmt; i++)
-        merge_format_field(args, &ma->fmt_map[i*files->nreaders], out);
+    {
+        missing_rule_t *rule = (missing_rule_t*) bsearch(ma->fmt_key[i], args->missing_rules, args->nmissing_rules, sizeof(*args->missing_rules), missing_rules_comp_key);
+        merge_format_field(args, &ma->fmt_map[i*files->nreaders], rule, out);
+    }
 
     if ( ma->laa_dirty )
         update_local_alleles(args, out);
@@ -2406,6 +2564,9 @@ void gvcf_write_block(args_t *args, int start, int end)
     {
         int slen  = 0;
         char *seq = faidx_fetch_seq(args->gvcf_fai,maux->chr,out->pos,out->pos,&slen);
+        if (!seq)
+            exit(1); // faidx_fetch_seq has already reported the error.
+
         if (slen)
         {
             out->d.allele[0][0] = seq[0];
@@ -2520,16 +2681,6 @@ static inline int is_gvcf_block(bcf1_t *line)
     return 0;
 }
 
-// Lines can come with any combination of variant types. We use a subset of types defined in vcf.h
-// but shift by two bits to account for VCF_REF defined as 0 (design flaw in vcf.h, my fault) and
-// to accommodate for VCF_GVCF_REF defined below
-static const int
-    snp_mask = (VCF_SNP<<2)|(VCF_MNP<<2),
-    indel_mask = VCF_INDEL<<2,
-    ins_mask = VCF_INS<<2,
-    del_mask = VCF_DEL<<2,
-    ref_mask = 2;
-
 /*
     Check incoming lines for new gVCF blocks, set pointer to the current source
     buffer (gvcf or readers).  In contrast to gvcf_flush, this function can be
@@ -2629,7 +2780,7 @@ void clean_buffer(args_t *args)
         {
             if ( ma->gvcf[ir].active )
             {
-                if ( ma->pos >= ma->gvcf[ir].end )  ma->gvcf[ir].active = 0;
+                if ( ma->pos > ma->gvcf[ir].end )  ma->gvcf[ir].active = 0;
                 else if ( ma->buf[ir].cur==-1 ) ma->buf[ir].cur = ma->buf[ir].beg;  // re-activate interrupted gVCF block
             }
             if ( !ma->gvcf[ir].active ) ma->buf[ir].cur = -1;
@@ -2664,13 +2815,16 @@ void debug_maux(args_t *args)
     {
         bcf_sr_t *reader = &files->readers[j];
         buffer_t *buf = &maux->buf[j];
-        fprintf(stderr," reader %d: ", j);
+        fprintf(stderr," reader %d (k=%d-%d): ", j,buf->beg,buf->end);
         for (k=buf->beg; k<buf->end; k++)
         {
-            if ( buf->rec[k].skip & SKIP_DONE ) continue;
-            bcf1_t *line = reader->buffer[k];
+            if ( buf->rec[k].skip & SKIP_DONE ) { fprintf(stderr," DONE"); continue; }
+            bcf1_t *line = reader->buffer[k];               // selected for merging by can_merge
             fprintf(stderr,"\t");
-            if ( buf->rec[k].skip ) fprintf(stderr,"[");  // this record will not be merged in this round
+            if ( buf->cur==k ) fprintf(stderr,"!");         // selected for merging by stage_line
+            if ( buf->rec[k].skip ) fprintf(stderr,"[");    // this record cannot be merged in this round
+            if ( !line->n_allele && maux->gvcf[j].active )
+                fprintf(stderr,"<*>");
             for (l=0; l<line->n_allele; l++)
                 fprintf(stderr,"%s%s", l==0?"":",", line->d.allele[l]);
             if ( buf->rec[k].skip ) fprintf(stderr,"]");
@@ -2686,9 +2840,10 @@ void debug_state(args_t *args)
 {
     maux_t *maux = args->maux;
     int i,j;
+    fprintf(stderr,"State after position=%d done:\n",maux->pos+1);
     for (i=0; i<args->files->nreaders; i++)
     {
-        fprintf(stderr,"reader %d:\tcur,beg,end=% d,%d,%d", i,maux->buf[i].cur,maux->buf[i].beg,maux->buf[i].end);
+        fprintf(stderr,"\treader %d:\tcur,beg,end=% d,%d,%d", i,maux->buf[i].cur,maux->buf[i].beg,maux->buf[i].end);
         if ( maux->buf[i].cur >=0 )
         {
             bcf_hdr_t *hdr = bcf_sr_get_header(args->files,i);
@@ -2698,20 +2853,136 @@ void debug_state(args_t *args)
         }
         fprintf(stderr,"\n");
     }
-    fprintf(stderr,"gvcf_min=%d\n", args->maux->gvcf_min);
+    fprintf(stderr,"\tgvcf_min=%d\n", args->maux->gvcf_min);
     for (i=0; i<args->files->nreaders; i++)
     {
-        fprintf(stderr,"reader %d:\tgvcf_active=%d", i,maux->gvcf[i].active);
+        fprintf(stderr,"\t\treader %d:\tgvcf_active=%d", i,maux->gvcf[i].active);
         if ( maux->gvcf[i].active ) fprintf(stderr,"\tpos,end=%"PRId64",%"PRId64, (int64_t) maux->gvcf[i].line->pos+1,(int64_t) maux->gvcf[i].end+1);
         fprintf(stderr,"\n");
     }
     fprintf(stderr,"\n");
 }
 
+
+// Lines can come with any combination of variant types. We use a subset of types defined in vcf.h
+// but shift by one bit to account for VCF_REF defined as 0 (design flaw in vcf.h, my fault)
+static const int
+    snp_mask   = (VCF_SNP<<1)|(VCF_MNP<<1),
+    indel_mask = (VCF_INDEL<<1),
+    ins_mask   = VCF_INS<<1,
+    del_mask   = VCF_DEL<<1,
+    ref_mask   = 1;     // the lowest bit, freed up by the shift, stands for REF
+
+// Can these types be merged given the -m settings? Returns 1 if yes, 0 if not. Despite the function's name,
+// its focus is on excluding incompatible records, there will be a finer matching later in stage_line()
+static inline int types_compatible(args_t *args, int selected_types, buffer_t *buf, int irec)
+{
+    int k;
+    maux_t *maux = args->maux;
+    bcf1_t *rec = buf->lines[irec];
+    int rec_types = buf->rec[irec].var_types;   // variant types of the tested record, same bit layout as selected_types
+
+    assert( selected_types );   // this is trivially true, set in can_merge()
+
+    if ( args->collapse & COLLAPSE_ANY ) return 1;  // can merge anything with anything
+
+    // REF and gVCF_REF with no other alleles present can be merged with anything
+    if ( (selected_types&ref_mask) && !(selected_types&(~ref_mask)) ) return 1;
+    if ( (rec_types&ref_mask) && !(rec_types&(~ref_mask)) ) return 1;
+
+    if ( args->collapse!=COLLAPSE_NONE )
+    {
+        // If we are here, one of the following modes must have been set: both,snps,indels,snp-ins-del
+        // Include the new record if
+        //  - rec has SNV, we already have SNV, and -m is both,snps,snp-ins-del
+        //  - rec has indel, we already have an indel, and -m both,indels,snp-ins-del
+        if ( args->collapse&(COLLAPSE_SNPS|COLLAPSE_SNP_INS_DEL) )
+        {
+            if ( (rec_types&snp_mask) && (selected_types&snp_mask) ) return 1;
+        }
+        if ( args->collapse&COLLAPSE_INDELS )
+        {
+            if ( (rec_types&indel_mask) && (selected_types&indel_mask) ) return 1;
+        }
+        if ( args->collapse&COLLAPSE_SNP_INS_DEL )
+        {
+            if ( (rec_types&ins_mask) && (selected_types&ins_mask) ) return 1;
+            if ( (rec_types&del_mask) && (selected_types&del_mask) ) return 1;
+        }
+        // Whatever is left, allow to match if the alleles match exactly
+    }
+
+    // The -m none mode or exact matching requested
+    // Simple test first: are the variants of the same type? (Only the highest set bit of the two masks is compared)
+    int x = selected_types >> 1;        // remove REF
+    int y = rec_types >> 1;             // remove REF
+    while ( x && y ) { x>>=1; y>>=1; }
+    if ( x || y ) return 0;             // the types differ
+
+    if ( vcmp_set_ref(args->vcmp,maux->als[0],rec->d.allele[0]) < 0 ) return 0;   // refs are not compatible
+    for (k=1; k<rec->n_allele; k++)
+    {
+        if ( bcf_has_variant_type(rec,k,VCF_REF) ) continue;    // this must be gVCF_REF (<*> or <NON_REF>)
+        if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,rec->d.allele[k])>=0 ) break;
+    }
+    if ( k==rec->n_allele ) return 0;   // no ALT of rec was found among the alleles selected thus far (gVCF_REF not tested)
+    return 1;   // the loop stopped at the first ALT that is already selected (presumably vcmp_find_allele>=0 means a match)
+}
+
+static void maux_update_alleles(args_t *args, int ireader, int irec)
+{
+    // Add the alleles of record irec from reader ireader to the merged set and update the allele counts and types
+    maux_t *ma     = args->maux;
+    buffer_t *rbuf = &ma->buf[ireader];
+    maux1_t *mrec  = &rbuf->rec[irec];
+    bcf1_t *rec    = rbuf->lines[irec];
+    int ial, drop_indel = args->collapse==COLLAPSE_SNP_INS_DEL;
+    hts_expand(int, rec->n_allele, mrec->mmap, mrec->map);
+    if ( ma->nals )
+    {
+        // not the first record to be merged: normalize the alleles against those collected so far
+        ma->als = merge_alleles(rec->d.allele, rec->n_allele, mrec->map, ma->als, &ma->nals, &ma->mals);
+        if ( !ma->als )
+            error("Failed to merge alleles at %s:%"PRId64" in %s\n",ma->chr,(int64_t) rec->pos+1,args->files->readers[ireader].fname);
+        hts_expand0(int, ma->nals, ma->ncnt, ma->cnt);
+        hts_expand0(int, ma->nals, ma->mals_types, ma->als_types);
+        ma->cnt[0]++;
+        for (ial=1; ial<rec->n_allele; ial++)
+        {
+            int idst  = mrec->map[ial];
+            int types = bcf_has_variant_type(rec, ial, VCF_ANY);
+            if ( drop_indel ) types &= ~VCF_INDEL;
+            ma->als_types[idst] = types ? types<<1 : ref_mask;
+            ma->cnt[idst]++;    // how many times an allele appears in the files
+        }
+        return;
+    }
+
+    // first record to be merged, copy the alleles to the output
+    ma->nals = rec->n_allele;
+    hts_expand0(char*, ma->nals, ma->mals, ma->als);
+    hts_expand0(int, ma->nals, ma->ncnt, ma->cnt);
+    hts_expand0(int, ma->nals, ma->mals_types, ma->als_types);
+    for (ial=0; ial<ma->nals; ial++)
+    {
+        int types = bcf_has_variant_type(rec, ial, VCF_ANY);
+        if ( drop_indel ) types &= ~VCF_INDEL;
+        free(ma->als[ial]); ma->als[ial] = strdup(rec->d.allele[ial]);
+        mrec->map[ial] = ial; ma->cnt[ial] = 1; ma->als_types[ial] = types ? types<<1 : ref_mask;
+    }
+}
+
 /*
-   Determine which line should be merged from which reader: go through all
-   readers and all buffered lines, expand REF,ALT and try to match lines with
-   the same ALTs.
+   Determine which lines remain to be merged across readers at the current position and
+   are compatible given the -m criteria. This is indicated by maux1_t.skip: 0=compatible,
+   SKIP_DONE=the record is done, SKIP_DIFF=not compatible and will be included next time.
+
+   At the same time count how many times each allele is present across the readers and records
+   so that we can prioritize the records with the same alleles to come first. In the end at most
+   one record at a time can be selected from each reader and that will be done in stage_line().
+
+   The function maux_reset already initialized structures for this position, so here each
+   reader comes with the beg,end indexes that point to records with the same maux_t.pos position.
  */
 int can_merge(args_t *args)
 {
@@ -2719,28 +2990,39 @@ int can_merge(args_t *args)
     maux_t *maux = args->maux;
     gvcf_aux_t *gaux = maux->gvcf;
     char *id = NULL, ref = 'N';
-    int i,j,k, ntodo = 0;
+    int i,j, ntodo = 0;
 
     for (i=0; i<maux->nals; i++)
     {
         free(maux->als[i]);
         maux->als[i] = NULL;
+        maux->cnt[i] = 0;
     }
     maux->var_types = maux->nals = 0;
 
-    // this is only for the `-m none -g` mode, ensure that <*> lines come last
-    #define VCF_GVCF_REF 1
-
+    // In this loop we do the following:
+    //  - remember the first encountered ID if matching by ID
+    //  - count the number of unprocessed records at this position
+    //  - collect all variant types at this position. This is to be able to perform -m matching and
+    //    print SNVs first, then indels, then gVCF blocks
+    //  - init the 'skip' variable to SKIP_DIFF for each record that has not been used yet
     for (i=0; i<files->nreaders; i++)
     {
         buffer_t *buf = &maux->buf[i];
+        buf->var_types = 0;
 
-        if ( gaux && gaux[i].active )
+        if ( gaux && gaux[i].active ) // active gvcf block
         {
-            // skip readers with active gvcf blocks
             buf->rec[buf->beg].skip = SKIP_DIFF;
+            maux->var_types |= ref_mask;
+            buf->var_types |= ref_mask;
+            buf->rec[buf->beg].var_types = ref_mask;
             continue;
         }
+
+        // for gvcf: find out REF at this position
+        if ( buf->beg < buf->end && ref=='N' ) ref = buf->lines[buf->beg]->d.allele[0][0];
+
         for (j=buf->beg; j<buf->end; j++)
         {
             if ( buf->rec[j].skip & SKIP_DONE ) continue;
@@ -2749,118 +3031,70 @@ int can_merge(args_t *args)
             ntodo++;
 
             bcf1_t *line = buf->lines[j];
-            if ( args->merge_by_id )
-                id = line->d.id;
-            else
+            if ( args->merge_by_id && !id ) { id = line->d.id; continue; }      // set ID when merging by id
+
+            if ( !buf->rec[j].var_types )
             {
                 int var_type = bcf_has_variant_types(line, VCF_ANY, bcf_match_overlap);
-                if (var_type < 0) error("bcf_has_variant_types() failed.");
+                if ( var_type < 0 ) error("bcf_has_variant_types() failed.");
                 if ( args->collapse==COLLAPSE_SNP_INS_DEL )
                 {
                     // need to distinguish between ins and del so strip the VCF_INDEL flag
                     var_type &= ~VCF_INDEL;
                 }
-                maux->var_types |= var_type ? var_type<<2 : 2;
-
-                // for the `-m none -g` mode
-                if ( args->collapse==COLLAPSE_NONE && args->do_gvcf && is_gvcf_block(line) )
-                    maux->var_types |= VCF_GVCF_REF;
+                var_type = var_type ? var_type<<1 : ref_mask;
+                if ( args->do_gvcf && is_gvcf_block(line) ) var_type |= ref_mask;
+                buf->rec[j].var_types = var_type;
             }
+            maux->var_types |= buf->rec[j].var_types;
+            buf->var_types |= buf->rec[j].var_types;
         }
-
-        // for gvcf: find out REF at this position
-        if ( buf->beg < buf->end && ref=='N' )
-            ref = buf->lines[buf->beg]->d.allele[0][0];
     }
     if ( !ntodo ) return 0;
 
+    int selected_types = 0;
+
     // In this loop we select from each reader compatible candidate lines.
     // (i.e. SNPs or indels). Go through all files and all lines at this
     // position and normalize relevant alleles.
     // REF-only sites may be associated with both SNPs and indels.
     for (i=0; i<files->nreaders; i++)
     {
-        bcf_sr_t *reader = &files->readers[i];
         buffer_t *buf = &maux->buf[i];
-
         if ( gaux && gaux[i].active )
         {
+            // gVCF records inherited from an upstream gVCF block have incorrect or missing allele and position
             gaux[i].line->d.allele[0][0] = ref;
             gaux[i].line->pos = maux->pos;
+            maux_update_alleles(args, i, buf->beg);
+            selected_types |= ref_mask;
+            continue;
         }
-
         for (j=buf->beg; j<buf->end; j++)
         {
             if ( buf->rec[j].skip & SKIP_DONE ) continue;
 
             bcf1_t *line = buf->lines[j]; // ptr to reader's buffer or gvcf buffer
-
-            int line_type = bcf_has_variant_types(line, VCF_ANY, bcf_match_overlap);
-            if (line_type < 0) error("bcf_has_variant_types() failed.");
-            line_type = line_type ? line_type<<2 : 2;
+            int line_types = buf->rec[j].var_types;
 
             // select relevant lines
             if ( args->merge_by_id )
             {
-                if ( strcmp(id,line->d.id) ) continue;
+                if ( strcmp(id,line->d.id) ) continue;      // matching by ID and it does not match the selected record
             }
+            else if ( selected_types && !types_compatible(args,selected_types,buf,j) ) continue;
             else
             {
-                // when merging gVCF in -m none mode, make sure that gVCF blocks with the same POS as variant
-                // records come last, otherwise infinite loop is created (#1164)
-                if ( args->collapse==COLLAPSE_NONE && args->do_gvcf )
-                {
-                    if ( is_gvcf_block(line) && (maux->var_types & (~(VCF_GVCF_REF|2))) ) continue;
-                }
-                if ( args->collapse==COLLAPSE_NONE && maux->nals )
-                {
-                    // All alleles of the tested record must be present in the
-                    // selected maux record plus variant types must be the same
-                    if ( (maux->var_types & line_type) != line_type ) continue;
-                    if ( vcmp_set_ref(args->vcmp,maux->als[0],line->d.allele[0]) < 0 ) continue;   // refs not compatible
-                    for (k=1; k<line->n_allele; k++)
-                    {
-                        if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,line->d.allele[k])>=0 ) break;
-                    }
-                    if ( !(line_type&ref_mask) && k==line->n_allele ) continue;  // not a REF-only site and there is no matching allele
-                }
-                if ( !(args->collapse&COLLAPSE_ANY) )
-                {
-                    // Merge:
-                    //  - SNPs+SNPs+MNPs+REF if -m both,snps
-                    //  - indels+indels+REF  if -m both,indels, REF only if SNPs are not present
-                    //  - SNPs come first
-                    if ( line_type & (indel_mask|ins_mask|del_mask) )
-                    {
-                        if ( !(line_type&snp_mask) && maux->var_types&snp_mask ) continue;  // SNPs come first
-                        if ( args->do_gvcf && maux->var_types&ref_mask ) continue;  // never merge indels with gVCF blocks
-                    }
-                }
+                // First time here, choosing the first line: prioritize SNPs when available in the -m snps,both modes
+                if ( (args->collapse&COLLAPSE_SNPS || args->collapse==COLLAPSE_NONE)     // asked to merge SNVs into multiallelics
+                        && (maux->var_types&snp_mask)                   // there are SNVs at the current position
+                        && !(buf->rec[j].var_types&(snp_mask|ref_mask)) // and this record is not a SNV nor ref
+                   ) continue;
             }
-            buf->rec[j].skip = 0;
+            selected_types |= line_types;
 
-            hts_expand(int, line->n_allele, buf->rec[j].mmap, buf->rec[j].map);
-            if ( !maux->nals )    // first record, copy the alleles to the output
-            {
-                maux->nals = line->n_allele;
-                hts_expand0(char*, maux->nals, maux->mals, maux->als);
-                hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
-                for (k=0; k<maux->nals; k++)
-                {
-                    free(maux->als[k]);
-                    maux->als[k] = strdup(line->d.allele[k]);
-                    buf->rec[j].map[k] = k;
-                    maux->cnt[k] = 1;
-                }
-                continue;
-            }
-            // normalize alleles
-            maux->als = merge_alleles(line->d.allele, line->n_allele, buf->rec[j].map, maux->als, &maux->nals, &maux->mals);
-            if ( !maux->als ) error("Failed to merge alleles at %s:%"PRId64" in %s\n",maux->chr,(int64_t) line->pos+1,reader->fname);
-            hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
-            for (k=1; k<line->n_allele; k++)
-                maux->cnt[ buf->rec[j].map[k] ]++;    // how many times an allele appears in the files
-            maux->cnt[0]++;
+            buf->rec[j].skip = 0;   // the j-th record from i-th reader can be included. Final decision will be made in stage_line
+            maux_update_alleles(args, i, j);
         }
     }
     return 1;
@@ -2878,48 +3112,61 @@ void stage_line(args_t *args)
     bcf_srs_t *files = args->files;
     maux_t *maux = args->maux;
 
-    // debug_maux(args);
-
-    // take the most frequent allele present in multiple files, REF is skipped
-    int i,j,k,icnt = 1;
-    for (i=2; i<maux->nals; i++)
-        if ( maux->cnt[i] > maux->cnt[icnt] ) icnt = i;
+    // Take the most frequent allele present in multiple files, REF and gVCF_REF is skipped.
+    int i,j,k,icnt = -1;
+    for (i=1; i<maux->nals; i++)
+    {
+        if ( maux->als_types[i] & ref_mask ) continue;
+        if ( icnt==-1 || maux->cnt[icnt] < maux->cnt[i] ) icnt = i;
+    }
+    int selected_type = icnt>0 ? maux->als_types[icnt] : ref_mask;
 
     int nout = 0;
     for (i=0; i<files->nreaders; i++)
     {
         buffer_t *buf = &maux->buf[i];
         buf->cur = -1;
-        if ( buf->beg >= buf->end ) continue;   // no lines in the buffer
+        if ( buf->beg >= buf->end ) continue; // No lines in the buffer at this site
 
         // find lines with the same allele
         for (j=buf->beg; j<buf->end; j++)
         {
-            if ( buf->rec[j].skip ) continue;   // done or not compatible
-            if ( args->merge_by_id ) break;
-            if ( maux->nals==1 && buf->lines[j]->n_allele==1 ) break;   // REF-only record
+            if ( buf->rec[j].skip )
+            {
+                int is_gvcf = maux->gvcf && maux->gvcf[i].active ? 1 : 0;
+                if ( !is_gvcf && is_gvcf_block(buf->lines[j]) ) is_gvcf = 1;
+                if ( !is_gvcf ) continue;   // done or not compatible
+            }
+            if ( args->merge_by_id ) break;     // if merging by ID and the line is compatible, then this is THE line
+
+            // skip if the reader has a record that matches the most frequent allele and this record is not it
+            if ( (selected_type & buf->var_types) && !(selected_type & buf->rec[j].var_types) ) continue;
 
+            // if the reader does not have the most frequent allele type but is a ref, accept
+            if ( !(selected_type & buf->var_types) && (buf->rec[j].var_types & ref_mask) ) break;
+            if ( selected_type==ref_mask ) break;
+
+            // accept if the record has the most frequent allele
             for (k=0; k<buf->lines[j]->n_allele; k++)
                 if ( icnt==buf->rec[j].map[k] ) break;
-
             if ( k<buf->lines[j]->n_allele ) break;
         }
         if ( j>=buf->end )
         {
             // no matching allele found in this file
-            if ( args->collapse==COLLAPSE_NONE ) continue;
+            if ( args->collapse==COLLAPSE_NONE ) continue;  // exact matching requested, skip
 
+            // choose something compatible to create a multiallelic site given the -m criteria
             for (j=buf->beg; j<buf->end; j++)
             {
                 if ( buf->rec[j].skip ) continue;   // done or not compatible
                 if ( args->collapse&COLLAPSE_ANY ) break;   // anything can be merged
-                int line_type = bcf_has_variant_types(buf->lines[j], VCF_ANY, bcf_match_overlap);
-                if (line_type < 0) error("bcf_has_variant_types() failed.");
-                if ( maux->var_types&snp_mask && line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
-                if ( maux->var_types&indel_mask && line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
-                if ( maux->var_types&ins_mask && line_type&VCF_INS && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
-                if ( maux->var_types&del_mask && line_type&VCF_DEL && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
-                if ( line_type==VCF_REF )
+                int line_type = buf->rec[j].var_types;
+                if ( maux->var_types&snp_mask && line_type&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break;
+                if ( maux->var_types&indel_mask && line_type&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break;
+                if ( maux->var_types&ins_mask && line_type&ins_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
+                if ( maux->var_types&del_mask && line_type&del_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
+                if ( line_type&ref_mask )
                 {
                     if ( maux->var_types&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break;
                     if ( maux->var_types&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break;
@@ -2940,12 +3187,21 @@ void stage_line(args_t *args)
         {
             // found a suitable line for merging
             buf->cur = j;
-
-            // mark as finished so that it's ignored next time
-            buf->rec[j].skip  = SKIP_DONE;
-            nout++;
         }
     }
+
+    // debug_maux(args);
+
+    // Mark lines staged for merging as finished so that they are ignored next time
+    for (i=0; i<files->nreaders; i++)
+    {
+        buffer_t *buf = &maux->buf[i];
+        if ( buf->cur == -1 ) continue;
+
+        buf->rec[buf->cur].skip  = SKIP_DONE;
+        nout++;
+    }
+
     assert( nout );
 }
 
@@ -3078,6 +3334,7 @@ void merge_vcf(args_t *args)
             error_errno("[%s] Failed to update header", __func__);
     }
     info_rules_init(args);
+    missing_rules_init(args);
 
     bcf_hdr_set_version(args->out_hdr, bcf_hdr_get_version(args->files->readers[0].header));
     if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
@@ -3087,6 +3344,7 @@ void merge_vcf(args_t *args)
         if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
         return;
     }
+    else if ( args->write_index && init_index(args->out_fh,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
 
     if ( args->collapse==COLLAPSE_NONE ) args->vcmp = vcmp_init();
     args->maux = maux_init(args);
@@ -3122,9 +3380,19 @@ void merge_vcf(args_t *args)
         gvcf_flush(args,1);
 
     info_rules_destroy(args);
+    missing_rules_destroy(args);
     maux_destroy(args->maux);
     bcf_hdr_destroy(args->out_hdr);
-    if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(args->out_fh)<0 )
+        {
+            if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
+    if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname?args->output_fname:"stdout");
     bcf_destroy1(args->out_line);
     kh_destroy(strdict, args->tmph);
     if ( args->tmps.m ) free(args->tmps.s);
@@ -3146,11 +3414,12 @@ static void usage(void)
     fprintf(stderr, "    -0  --missing-to-ref              Assume genotypes at missing sites are 0/0\n");
     fprintf(stderr, "    -f, --apply-filters LIST          Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
     fprintf(stderr, "    -F, --filter-logic x|+            Remove filters if some input is PASS (\"x\"), or apply all filters (\"+\") [+]\n");
-    fprintf(stderr, "    -g, --gvcf -|REF.FA               Merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n");
+    fprintf(stderr, "    -g, --gvcf -|REF.FA               Merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max -M PL:max,AD:0\n");
     fprintf(stderr, "    -i, --info-rules TAG:METHOD,..    Rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
     fprintf(stderr, "    -l, --file-list FILE              Read file names from the file\n");
     fprintf(stderr, "    -L, --local-alleles INT           EXPERIMENTAL: if more than <int> ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n");
     fprintf(stderr, "    -m, --merge STRING                Allow multiallelic records for <snps|indels|both|snp-ins-del|all|none|id>, see man page for details [both]\n");
+    fprintf(stderr, "    -M, --missing-rules TAG:METHOD    Rules for replacing missing values in numeric vectors (.,0,max) when unknown allele <*> is not present [.]\n");
     fprintf(stderr, "        --no-index                    Merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n");
     fprintf(stderr, "        --no-version                  Do not append version and command line to the header\n");
     fprintf(stderr, "    -o, --output FILE                 Write output to a file [standard output]\n");
@@ -3159,6 +3428,7 @@ static void usage(void)
     fprintf(stderr, "    -R, --regions-file FILE           Restrict to regions listed in a file\n");
     fprintf(stderr, "        --regions-overlap 0|1|2       Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
     fprintf(stderr, "        --threads INT                 Use multithreading with <int> worker threads [0]\n");
+    fprintf(stderr, "        --write-index                 Automatically index the output files [off]\n");
     fprintf(stderr, "\n");
     exit(1);
 }
@@ -3197,13 +3467,15 @@ int main_vcfmerge(int argc, char *argv[])
         {"regions-file",required_argument,NULL,'R'},
         {"regions-overlap",required_argument,NULL,4},
         {"info-rules",required_argument,NULL,'i'},
+        {"missing-rules",required_argument,NULL,'M'},
         {"no-version",no_argument,NULL,8},
         {"no-index",no_argument,NULL,10},
         {"filter-logic",required_argument,NULL,'F'},
+        {"write-index",no_argument,NULL,11},
         {NULL,0,NULL,0}
     };
     char *tmp;
-    while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0L:",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:M:l:g:F:0L:",loptions,NULL)) >= 0) {
         switch (c) {
             case 'L':
                 args->local_alleles = strtol(optarg,&tmp,10);
@@ -3227,6 +3499,7 @@ int main_vcfmerge(int argc, char *argv[])
                 break;
             case 'l': args->file_list = optarg; break;
             case 'i': args->info_rules = optarg; break;
+            case 'M': args->missing_rules_str = optarg; break;
             case 'o': args->output_fname = optarg; break;
             case 'O':
                 switch (optarg[0]) {
@@ -3254,7 +3527,7 @@ int main_vcfmerge(int argc, char *argv[])
                 else if ( !strcmp(optarg,"any") ) args->collapse |= COLLAPSE_ANY;
                 else if ( !strcmp(optarg,"all") ) args->collapse |= COLLAPSE_ANY;
                 else if ( !strcmp(optarg,"none") ) args->collapse = COLLAPSE_NONE;
-                else if ( !strcmp(optarg,"snp-ins-del") ) args->collapse = COLLAPSE_SNP_INS_DEL;
+                else if ( !strcmp(optarg,"snp-ins-del") ) args->collapse = COLLAPSE_SNP_INS_DEL|COLLAPSE_SNPS;
                 else if ( !strcmp(optarg,"id") ) { args->collapse = COLLAPSE_NONE; args->merge_by_id = 1; }
                 else error("The -m type \"%s\" is not recognised.\n", optarg);
                 break;
@@ -3271,6 +3544,7 @@ int main_vcfmerge(int argc, char *argv[])
             case  9 : args->n_threads = strtol(optarg, 0, 0); break;
             case  8 : args->record_cmd_line = 0; break;
             case 10 : args->no_index = 1; break;
+            case 11 : args->write_index = 1; break;
             case 'h':
             case '?': usage(); break;
             default: error("Unknown argument: %s\n", optarg);
diff --git a/vcfnorm.c b/vcfnorm.c
index 9538f8d01..02ad322d1 100644
--- a/vcfnorm.c
+++ b/vcfnorm.c
@@ -1,6 +1,6 @@
 /*  vcfnorm.c -- Left-align and normalize indels.
 
-    Copyright (C) 2013-2022 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -40,6 +40,8 @@ THE SOFTWARE.  */
 #include "bcftools.h"
 #include "rbuf.h"
 #include "abuf.h"
+#include "gff.h"
+#include "regidx.h"
 
 #define CHECK_REF_EXIT 1
 #define CHECK_REF_WARN 2
@@ -86,8 +88,8 @@ typedef struct
     int32_t *int32_arr;
     int ntmp_arr1, ntmp_arr2, nint32_arr;
     kstring_t *tmp_str;
-    kstring_t *tmp_als, tmp_kstr;
-    int ntmp_als;
+    kstring_t *tmp_als, *tmp_del, tmp_kstr;
+    int ntmp_als, ntmp_del;
     rbuf_t rbuf;
     int buf_win;            // maximum distance between two records to consider
     int aln_win;            // the realignment window size (maximum repeat size)
@@ -105,6 +107,13 @@ typedef struct
     int use_star_allele, ma_use_ref_allele;
     char *old_rec_tag;
     htsFile *out;
+    char *index_fn;
+    int write_index;
+    int right_align;
+    char *gff_fname;
+    gff_t *gff;
+    regidx_t *idx_tscript;
+    regitr_t *itr_tscript;
 }
 args_t;
 
@@ -344,6 +353,157 @@ static void set_old_rec_tag(args_t *args, bcf1_t *dst, bcf1_t *src, int ialt)
             error("An error occurred while updating INFO/%s\n",args->old_rec_tag);
 }
 
+static int is_left_align(args_t *args, bcf1_t *line)   // decide the alignment direction: returns 1 for left-alignment, 0 for right-alignment
+{
+    if ( args->right_align ) return 0;  // args->right_align set: always right-align
+    if ( !args->gff ) return 1;         // no GFF loaded: always left-align
+    const char *chr = bcf_seqname(args->hdr,line);
+    if ( !strncasecmp("chr",chr,3) ) chr += 3;  // strip 'chr' prefix, that's what we requested the GFF reader to do
+    if ( !regidx_overlap(args->idx_tscript,chr,line->pos,line->pos+line->rlen, args->itr_tscript) ) return 1;  // no overlapping transcript: left-align
+
+    // a single reverse-strand hit is enough to left-align, so with conflicting overlapping transcripts we go with the default left-alignment
+    int has_fwd = 0;
+    while ( regitr_overlap(args->itr_tscript) )    // iterate over all overlapping transcripts
+    {
+        gf_tscript_t *tr = regitr_payload(args->itr_tscript, gf_tscript_t*);
+        if ( tr->strand==STRAND_FWD ) has_fwd = 1;
+        if ( tr->strand==STRAND_REV ) return 1;
+    }
+    // no forward-strand transcript seen (then left-align) or everything was on fwd strand (then right-align)
+    return has_fwd ? 0 : 1;
+}
+static hts_pos_t realign_left(args_t *args, bcf1_t *line)  // left-align the allele copies in args->tmp_als in place; returns the new position
+{
+    // trim from right, padding with reference bases from the left whenever an allele becomes empty
+    char *ref = NULL;   // most recently fetched padding sequence, freed after the loop
+    int i;
+    hts_pos_t nref=0, new_pos = line->pos;  // new_pos: position of the first base currently held in the alleles
+    kstring_t *als = args->tmp_als;         // working copies of the alleles, modified in place
+    while (1)
+    {
+        // is the rightmost base identical in all alleles?
+        int min_len = als[0].l;
+        for (i=1; i<line->n_allele; i++)
+        {
+            if ( toupper(als[0].s[ als[0].l-1 ]) != toupper(als[i].s[ als[i].l-1 ]) ) break;
+            if ( als[i].l < min_len ) min_len = als[i].l;
+        }
+        if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed
+        if ( min_len<=1 && new_pos==0 ) break;  // at the chromosome start there is nothing to pad with, keep the last base
+
+        int pad_from_left = 0;
+        for (i=0; i<line->n_allele; i++) // trim all alleles
+        {
+            als[i].l--;     // only the length is reduced, no terminating NUL is written here
+            if ( !als[i].l ) pad_from_left = 1;
+        }
+        if ( pad_from_left )
+        {
+            // extend all alleles to the left by aln_win bases (unless close to the chr start).
+            // Extra bases will be trimmed from the left after this loop is done
+            int npad = new_pos >= args->aln_win ? args->aln_win : new_pos;
+            free(ref);
+            ref = faidx_fetch_seq64(args->fai, bcf_seqname(args->hdr,line), new_pos-npad, new_pos-1, &nref);
+            if ( !ref ) error("faidx_fetch_seq64 failed at %s:%"PRId64"\n", bcf_seqname(args->hdr,line), (int64_t) new_pos-npad+1);
+            replace_iupac_codes(ref,nref);
+            for (i=0; i<line->n_allele; i++)
+            {
+                ks_resize(&als[i], als[i].l + npad);
+                if ( als[i].l ) memmove(als[i].s+npad,als[i].s,als[i].l);   // shift the existing bases to make room for the padding
+                memcpy(als[i].s,ref,npad);                                  // prepend the fetched reference bases
+                als[i].l += npad;
+            }
+            new_pos -= npad;    // the alleles now start npad bases further to the left
+        }
+    }
+    free(ref);
+
+    // trim from left, always leaving at least one base in the shortest allele
+    int ntrim_left = 0;     // number of shared leading bases to remove
+    while (1)
+    {
+        // is the first base identical in all alleles?
+        int min_len = als[0].l - ntrim_left;
+        for (i=1; i<line->n_allele; i++)
+        {
+            if ( toupper(als[0].s[ntrim_left]) != toupper(als[i].s[ntrim_left]) ) break;
+            if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left;
+        }
+        if ( i!=line->n_allele || min_len<=1 ) break; // there are differences, cannot be trimmed
+        ntrim_left++;
+    }
+    if ( ntrim_left )
+    {
+        for (i=0; i<line->n_allele; i++)
+        {
+            memmove(als[i].s,als[i].s+ntrim_left,als[i].l-ntrim_left);  // drop the shared prefix
+            als[i].l -= ntrim_left;
+        }
+        new_pos += ntrim_left;  // the record starts ntrim_left bases further to the right
+    }
+    return new_pos;
+}
+
+static hts_pos_t realign_right(args_t *args, bcf1_t *line) // right-align the allele copies in args->tmp_als in place; returns the new position
+{
+    char *ref = NULL;   // most recently fetched padding sequence, freed after the left-trim step
+    int i;
+    hts_pos_t new_pos = line->pos, nref = 0;
+    kstring_t *als = args->tmp_als;     // working copies of the alleles, modified in place
+
+    // trim from left, padding with reference bases on the right whenever the shortest allele runs out
+    int ntrim_left = 0, npad_right = line->rlen, has_indel = 0;    // npad_right: reference bases from line->pos already held in the alleles
+    while (1)
+    {
+        // is the leftmost base identical in all alleles?
+        int min_len = als[0].l - ntrim_left;
+        for (i=1; i<line->n_allele; i++)
+        {
+            if ( als[0].l!=als[i].l ) has_indel = 1;
+            if ( toupper(als[0].s[ntrim_left]) != toupper(als[i].s[ntrim_left]) ) break;
+            if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left;
+        }
+        if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed further
+
+        ntrim_left++;
+        if ( min_len<=1 ) // pad from the right
+        {
+            free(ref);
+            ref = faidx_fetch_seq64(args->fai, bcf_seqname(args->hdr,line), line->pos + npad_right, line->pos + npad_right + args->aln_win, &nref);
+            if ( !ref ) error("faidx_fetch_seq64 failed at %s:%"PRIhts_pos"\n",bcf_seqname(args->hdr,line), new_pos + ntrim_left);
+            npad_right += nref; // advance by what is appended below: the fetch end is inclusive, so this can be aln_win+1 bases, not aln_win
+            replace_iupac_codes(ref,nref);
+            for (i=0; i<line->n_allele; i++) kputs(ref, &als[i]);
+        }
+    }
+    ntrim_left -= has_indel;    // when allele lengths differ, keep one shared leading base
+    if ( ntrim_left > 0 )
+    {
+        for (i=0; i<line->n_allele; i++)
+        {
+            memmove(als[i].s, als[i].s + ntrim_left, als[i].l - ntrim_left);    // drop the shared prefix
+            als[i].l -= ntrim_left;
+        }
+        new_pos += ntrim_left;  // the record starts ntrim_left bases further to the right
+    }
+    free(ref);
+
+    // trim from right, always leaving at least one base in the shortest allele
+    while (1)
+    {
+        // is the last base identical in all alleles?
+        int min_len = als[0].l;
+        for (i=1; i<line->n_allele; i++)
+        {
+            if ( toupper(als[0].s[ als[0].l-1 ]) != toupper(als[i].s[ als[i].l-1 ]) ) break;
+            if ( min_len > als[i].l ) min_len = als[i].l;
+        }
+        if ( i!=line->n_allele || min_len<=1 ) break; // there are differences, cannot be trimmed more
+        for (i=0; i<line->n_allele; i++) { als[i].l--; als[i].s[als[i].l]=0; }  // shorten and keep the strings NUL-terminated
+    }
+    return new_pos;
+}
+
 #define ERR_DUP_ALLELE       -2
 #define ERR_REF_MISMATCH     -1
 #define ERR_OK                0
@@ -396,10 +556,32 @@ static int realign(args_t *args, bcf1_t *line)
 
     // make a copy of each allele for trimming
     hts_expand0(kstring_t,line->n_allele,args->ntmp_als,args->tmp_als);
+    hts_expand0(kstring_t,line->n_allele,args->ntmp_del,args->tmp_del);
     kstring_t *als = args->tmp_als;
+    kstring_t *del = args->tmp_del;
     for (i=0; i<line->n_allele; i++)
     {
-        if ( line->d.allele[i][0]=='<' ) return ERR_SYMBOLIC;  // symbolic allele
+        del[i].l = 0;
+        if ( line->d.allele[i][0]=='<' )
+        {
+            // symbolic allele, only <DEL.*> will be realigned
+            if ( strncmp("<DEL",line->d.allele[i],4) ) return ERR_SYMBOLIC;
+            if ( nref < line->rlen )
+            {
+                free(ref);
+                reflen = line->rlen;
+                ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref);
+                if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1);
+                seq_to_upper(ref,0);
+                replace_iupac_codes(ref,nref);  // any non-ACGT character in fasta ref is replaced with N
+                als[0].l = 0;
+                kputs(ref, &als[0]);
+                als[i].l = 0;
+                kputsn(ref,1,&als[i]);
+                kputs(line->d.allele[i],&del[i]);
+                continue;
+            }
+        }
         if ( line->d.allele[i][0]=='*' ) return ERR_SPANNING_DELETION;  // spanning deletion
         if ( has_non_acgtn(line->d.allele[i],line->shared.l) )
         {
@@ -416,69 +598,17 @@ static int realign(args_t *args, bcf1_t *line)
 
         if ( i>0 && als[i].l==als[0].l && !strcasecmp(als[0].s,als[i].s) ) return ERR_DUP_ALLELE;
     }
-
-    // trim from right
-    int new_pos = line->pos;
-    while (1)
-    {
-        // is the rightmost base identical in all alleles?
-        int min_len = als[0].l;
-        for (i=1; i<line->n_allele; i++)
-        {
-            if ( toupper(als[0].s[ als[0].l-1 ])!=toupper(als[i].s[ als[i].l-1 ]) ) break;
-            if ( als[i].l < min_len ) min_len = als[i].l;
-        }
-        if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed
-        if ( min_len<=1 && new_pos==0 ) break;
-
-        int pad_from_left = 0;
-        for (i=0; i<line->n_allele; i++) // trim all alleles
-        {
-            als[i].l--;
-            if ( !als[i].l ) pad_from_left = 1;
-        }
-        if ( pad_from_left )
-        {
-            int npad = new_pos >= args->aln_win ? args->aln_win : new_pos;
-            free(ref);
-            ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, new_pos-npad, new_pos-1, &nref);
-            if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) new_pos-npad+1);
-            replace_iupac_codes(ref,nref);
-            for (i=0; i<line->n_allele; i++)
-            {
-                ks_resize(&als[i], als[i].l + npad);
-                if ( als[i].l ) memmove(als[i].s+npad,als[i].s,als[i].l);
-                memcpy(als[i].s,ref,npad);
-                als[i].l += npad;
-            }
-            new_pos -= npad;
-        }
-    }
     free(ref);
+    ref = NULL;
 
-    // trim from left
-    int ntrim_left = 0;
-    while (1)
-    {
-        // is the first base identical in all alleles?
-        int min_len = als[0].l - ntrim_left;
-        for (i=1; i<line->n_allele; i++)
-        {
-            if ( als[0].s[ntrim_left]!=als[i].s[ntrim_left] ) break;
-            if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left;
-        }
-        if ( i!=line->n_allele || min_len<=1 ) break; // there are differences, cannot be trimmed
-        ntrim_left++;
-    }
-    if ( ntrim_left )
-    {
-        for (i=0; i<line->n_allele; i++)
-        {
-            memmove(als[i].s,als[i].s+ntrim_left,als[i].l-ntrim_left);
-            als[i].l -= ntrim_left;
-        }
-        new_pos += ntrim_left;
-    }
+    // which direction are we aligning?
+    int left_align = is_left_align(args, line);
+
+    hts_pos_t new_pos;
+    if ( left_align )
+        new_pos = realign_left(args, line);
+    else
+        new_pos = realign_right(args, line);
 
     // Have the alleles changed?
     als[0].s[ als[0].l ] = 0;  // in order for strcmp to work
@@ -491,7 +621,8 @@ static int realign(args_t *args, bcf1_t *line)
     for (i=0; i<line->n_allele; i++)
     {
         if (i>0) kputc(',',&args->tmp_kstr);
-        kputsn(als[i].s,als[i].l,&args->tmp_kstr);
+        if ( del[i].l ) kputs(del[i].s,&args->tmp_kstr);
+        else kputsn(als[i].s,als[i].l,&args->tmp_kstr);
     }
     args->tmp_kstr.s[ args->tmp_kstr.l ] = 0;
     bcf_update_alleles_str(args->out_hdr,line,args->tmp_kstr.s);
@@ -1281,10 +1412,12 @@ static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_
         ngts2 /= nsmpl;
         if ( ngts!=ngts2 ) error("Error at %s:%"PRId64": cannot combine diploid with haploid genotype\n", bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1);
 
-        int32_t *gt  = (int32_t*) args->tmp_arr1;
-        int32_t *gt2 = (int32_t*) args->tmp_arr2;
+        int32_t *gt  = (int32_t*) args->tmp_arr1;       // the first, destination line
+        int32_t *gt2 = (int32_t*) args->tmp_arr2;       // one of the subsequent lines, i.e. the source line
         for (j=0; j<nsmpl; j++)
         {
+            // Take each source allele and apply to the first line. We try to preserve the order and phasing and we
+            // never overwrite with ref allele
             for (k2=0; k2<ngts2; k2++)
             {
                 if ( gt2[k2]==bcf_int32_vector_end ) break;
@@ -1292,12 +1425,18 @@ static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_
                 int ial2 = bcf_gt_allele(gt2[k2]);
                 if ( ial2==0 ) continue;    // never overwrite with ref
                 if ( ial2>=args->maps[i].nals ) error("Error at %s:%"PRId64": incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1,ial2);
+
+                // The destination allele
                 int ial = args->maps[i].map[ial2];
-                for (k=0; k<ngts; k++)
-                    if ( gt[k]==bcf_int32_vector_end || bcf_gt_is_missing(gt[k]) || !bcf_gt_allele(gt[k]) ) break;
-                if ( k<ngts )
+                if ( gt[k2]==bcf_int32_vector_end || bcf_gt_is_missing(gt[k2]) || !bcf_gt_allele(gt[k2]) )
+                    gt[k2] = bcf_gt_is_phased(gt[k2]) ? bcf_gt_phased(ial) : bcf_gt_unphased(ial);
+                else
                 {
-                    gt[k] = bcf_gt_unphased(ial);
+                    // conflict: the first line already has a non-ref allele here, so fall back to the old way (use the first free slot), possibly disrupting the phasing
+                    for (k=0; k<ngts; k++)
+                        if ( gt[k]==bcf_int32_vector_end || bcf_gt_is_missing(gt[k]) || !bcf_gt_allele(gt[k]) ) break;
+                    if ( k<ngts )
+                        gt[k] = bcf_gt_unphased(ial);
                 }
             }
             gt  += ngts;
@@ -1906,10 +2045,24 @@ static void init_data(args_t *args)
             abuf_set_opt(args->abuf, const char*, INFO_TAG, args->old_rec_tag);
         abuf_set_opt(args->abuf, int, STAR_ALLELE, args->use_star_allele);
     }
+    if ( args->gff_fname )
+    {
+        args->gff = gff_init(args->gff_fname);
+        gff_set(args->gff,verbosity,1);
+        gff_set(args->gff,strip_chr_names,1);
+        gff_parse(args->gff);
+        args->idx_tscript = gff_get(args->gff,idx_tscript);
+        args->itr_tscript = regitr_init(NULL);
+    }
 }
 
 static void destroy_data(args_t *args)
 {
+    if ( args->gff )
+    {
+        gff_destroy(args->gff);
+        regitr_destroy(args->itr_tscript);
+    }
     cmpals_destroy(&args->cmpals_in);
     cmpals_destroy(&args->cmpals_out);
     int i;
@@ -1929,7 +2082,10 @@ static void destroy_data(args_t *args)
         free(args->maps[i].map);
     for (i=0; i<args->ntmp_als; i++)
         free(args->tmp_als[i].s);
+    for (i=0; i<args->ntmp_del; i++)
+        free(args->tmp_del[i].s);
     free(args->tmp_als);
+    free(args->tmp_del);
     free(args->tmp_kstr.s);
     if ( args->tmp_str )
     {
@@ -2018,6 +2174,7 @@ static void normalize_vcf(args_t *args)
         hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p);
     if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_norm");
     if ( bcf_hdr_write(args->out, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+    if ( args->write_index && init_index(args->out,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
 
     bcf1_t *line;
     int prev_rid = -1, prev_pos = -1, prev_type = 0;
@@ -2081,6 +2238,15 @@ static void normalize_vcf(args_t *args)
         if ( j>0 ) flush_buffer(args, args->out, j);
     }
     flush_buffer(args, args->out, args->rbuf.n);
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(args->out)<0 )
+        {
+            if ( hts_close(args->out)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
     if ( hts_close(args->out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
 
     fprintf(stderr,"Lines   total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped);
@@ -2104,6 +2270,7 @@ static void usage(void)
     fprintf(stderr, "    -d, --rm-dup TYPE               Remove duplicate snps|indels|both|all|exact\n");
     fprintf(stderr, "    -f, --fasta-ref FILE            Reference sequence\n");
     fprintf(stderr, "        --force                     Try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n");
+    fprintf(stderr, "    -g, --gff-annot FILE            Follow HGVS 3'rule and right-align variants in transcripts on the forward strand\n");
     fprintf(stderr, "        --keep-sum TAG,..           Keep vector sum constant when splitting multiallelics (see github issue #360)\n");
     fprintf(stderr, "    -m, --multiallelics -|+TYPE     Split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
     fprintf(stderr, "        --multi-overlaps 0|.        Fill in the reference (0) or missing (.) allele when splitting multiallelics [0]\n");
@@ -2121,6 +2288,7 @@ static void usage(void)
     fprintf(stderr, "        --targets-overlap 0|1|2     Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
     fprintf(stderr, "        --threads INT               Use multithreading with <int> worker threads [0]\n");
     fprintf(stderr, "    -w, --site-win INT              Buffer for sorting lines which changed position during realignment [1000]\n");
+    fprintf(stderr, "        --write-index               Automatically index the output files [off]\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "Examples:\n");
     fprintf(stderr, "   # normalize and left-align indels\n");
@@ -2163,6 +2331,8 @@ int main_vcfnorm(int argc, char *argv[])
         {"old-rec-tag",required_argument,NULL,12},
         {"keep-sum",required_argument,NULL,10},
         {"fasta-ref",required_argument,NULL,'f'},
+        {"gff-annot",required_argument,NULL,'g'},
+        {"right-align",no_argument,NULL,15},            // undocumented, only for debugging
         {"do-not-normalize",no_argument,NULL,'N'},
         {"multiallelics",required_argument,NULL,'m'},
         {"multi-overlaps",required_argument,NULL,13},
@@ -2181,10 +2351,11 @@ int main_vcfnorm(int argc, char *argv[])
         {"check-ref",required_argument,NULL,'c'},
         {"strict-filter",no_argument,NULL,'s'},
         {"no-version",no_argument,NULL,8},
+        {"write-index",no_argument,NULL,14},
         {NULL,0,NULL,0}
     };
     char *tmp;
-    while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNa",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNag:",loptions,NULL)) >= 0) {
         switch (c) {
             case  10:
                 // possibly generalize this also to INFO/AD and other tags
@@ -2192,6 +2363,7 @@ int main_vcfnorm(int argc, char *argv[])
                     error("Error: only --keep-sum AD is currently supported. See https://github.com/samtools/bcftools/issues/360 for more.\n");
                 args->keep_sum_ad = 1;  // this will be set to the header id or -1 in init_data
                 break;
+            case 'g': args->gff_fname = optarg; break;
             case 'a': args->atomize = SPLIT; break;
             case 11 :
                 if ( optarg[0]=='*' ) args->use_star_allele = 1;
@@ -2204,6 +2376,8 @@ int main_vcfnorm(int argc, char *argv[])
                 else if ( optarg[0]=='.' ) args->ma_use_ref_allele = 0;
                 else error("Invalid argument to --multi-overlaps\n");
                 break;
+            case 14 : args->write_index = 1; break;
+            case 15 : args->right_align = 1; break;
             case 'N': args->do_indels = 0; break;
             case 'd':
                 if ( !strcmp("snps",optarg) ) args->rmdup = BCF_SR_PAIR_SNPS;
diff --git a/vcfplugin.c b/vcfplugin.c
index 45686680a..687751961 100644
--- a/vcfplugin.c
+++ b/vcfplugin.c
@@ -1,6 +1,6 @@
 /*  vcfplugin.c -- plugin modules for operating on VCF/BCF files.
 
-    Copyright (C) 2013-2021 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -149,6 +149,8 @@ typedef struct _args_t
 
     char **argv, *output_fname, *regions_list, *targets_list;
     int argc, drop_header, verbose, record_cmd_line, plist_only;
+    char *index_fn;
+    int write_index;
 }
 args_t;
 
@@ -548,6 +550,7 @@ static void init_data(args_t *args)
         if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
         if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
         if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+        if ( args->write_index && init_index(args->out_fh,args->hdr_out,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
     }
 }
 
@@ -569,7 +572,19 @@ static void destroy_data(args_t *args)
     }
     if ( args->filter )
         filter_destroy(args->filter);
-    if (args->out_fh && hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
+    if (args->out_fh )
+    {
+        if ( args->write_index )
+        {
+            if ( bcf_idx_save(args->out_fh)<0 )
+            {
+                if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+                error("Error: cannot write to index %s\n", args->index_fn);
+            }
+            free(args->index_fn);
+        }
+        if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
+    }
 }
 
 static void usage(args_t *args)
@@ -598,6 +613,7 @@ static void usage(args_t *args)
     fprintf(stderr, "   -l, --list-plugins             List available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n");
     fprintf(stderr, "   -v, --verbose                  Print verbose information, -vv increases verbosity\n");
     fprintf(stderr, "   -V, --version                  Print version string and exit\n");
+    fprintf(stderr, "       --write-index              Automatically index the output files [off]\n");
     fprintf(stderr, "\n");
     exit(1);
 }
@@ -643,9 +659,9 @@ int main_plugin(int argc, char *argv[])
     if ( argv[1][0]!='-' )
     {
         args->verbose = is_verbose(argc, argv);
-        plugin_name = argv[1]; 
-        argc--; 
-        argv++; 
+        plugin_name = argv[1];
+        argc--;
+        argv++;
         load_plugin(args, plugin_name, 1, &args->plugin);
         if ( args->plugin.run )
         {
@@ -675,6 +691,7 @@ int main_plugin(int argc, char *argv[])
         {"targets-file",required_argument,NULL,'T'},
         {"targets-overlap",required_argument,NULL,2},
         {"no-version",no_argument,NULL,8},
+        {"write-index",no_argument,NULL,10},
         {NULL,0,NULL,0}
     };
     char *tmp;
@@ -723,6 +740,7 @@ int main_plugin(int argc, char *argv[])
                 break;
             case  9 : args->n_threads = strtol(optarg, 0, 0); break;
             case  8 : args->record_cmd_line = 0; break;
+            case 10 : args->write_index = 1; break;
             case '?':
             case 'h': usage_only = 1; break;
             default: error("Unknown argument: %s\n", optarg);
diff --git a/vcfquery.c b/vcfquery.c
index 889f36324..5f4eb07c6 100644
--- a/vcfquery.c
+++ b/vcfquery.c
@@ -1,6 +1,6 @@
 /*  vcfquery.c -- Extracts fields from VCF/BCF file.
 
-    Copyright (C) 2013-2022 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -94,6 +94,7 @@ static void init_data(args_t *args)
         smpl_ilist_destroy(ilist);
     }
     args->convert = convert_init(args->header, samples, nsamples, args->format_str);
+    convert_set_option(args->convert, force_newline, 1);
     convert_set_option(args->convert, subset_samples, &args->smpl_pass);
     if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1);
     free(samples);
diff --git a/vcfsort.c b/vcfsort.c
index 1de2b2867..3b208a0d3 100644
--- a/vcfsort.c
+++ b/vcfsort.c
@@ -1,6 +1,6 @@
 /*  vcfsort.c -- sort subcommand
 
-   Copyright (C) 2017-2022 Genome Research Ltd.
+   Copyright (C) 2017-2023 Genome Research Ltd.
 
    Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -62,6 +62,8 @@ typedef struct _args_t
     uint8_t *mem_block;
     size_t nbuf, mbuf, nblk;
     blk_t *blk;
+    char *index_fn;
+    int write_index;
 }
 args_t;
 
@@ -300,6 +302,7 @@ void merge_blocks(args_t *args)
     set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
     htsFile *out = hts_open(args->output_fname ? args->output_fname : "-", wmode);
     if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+    if ( args->write_index && init_index(out,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
     while ( bhp->ndat )
     {
         blk_t *blk = bhp->dat[0];
@@ -307,6 +310,15 @@ void merge_blocks(args_t *args)
         khp_delete(blk, bhp);
         blk_read(args, bhp, args->hdr, blk);
     }
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(out)<0 )
+        {
+            if ( hts_close(out)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
     if ( hts_close(out)!=0 ) clean_files_and_throw(args, "Close failed: %s\n", args->output_fname);
 
     clean_files(args);
@@ -333,6 +345,7 @@ static void usage(args_t *args)
 #else
     fprintf(stderr, "    -T, --temp-dir DIR             temporary files [/tmp/bcftools.XXXXXX]\n");
 #endif
+    fprintf(stderr, "        --write-index              Automatically index the output files [off]\n");
     fprintf(stderr, "\n");
     exit(1);
 }
@@ -395,6 +408,7 @@ int main_sort(int argc, char *argv[])
         {"output-file",required_argument,NULL,'o'},
         {"output",required_argument,NULL,'o'},
         {"help",no_argument,NULL,'h'},
+        {"write-index",no_argument,NULL,1},
         {0,0,0,0}
     };
     char *tmp;
@@ -423,6 +437,7 @@ int main_sort(int argc, char *argv[])
                           if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
                       }
                       break;
+            case  1 : args->write_index = 1; break;
             case 'h':
             case '?': usage(args); break;
             default: error("Unknown argument: %s\n", optarg);
diff --git a/vcfstats.c b/vcfstats.c
index 10189fef9..e2744ab3c 100644
--- a/vcfstats.c
+++ b/vcfstats.c
@@ -70,6 +70,13 @@ typedef struct
 }
 idist_t;
 
+// variant allele frequency (fraction of alt allele in pileup as determined from AD) collected into 0.05 bins
+typedef struct
+{
+    int snv[21], indel[21];
+}
+vaf_t;
+
 typedef struct
 {
     uint64_t n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts;
@@ -93,7 +100,8 @@ typedef struct
     int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, *smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl;
     int *smpl_hapRef, *smpl_hapAlt, *smpl_missing;
     int *smpl_ins_hets, *smpl_del_hets, *smpl_ins_homs, *smpl_del_homs;
-    int *smpl_frm_shifts; // not-applicable, in-frame, out-frame
+    int *smpl_frm_shifts;   // not-applicable, in-frame, out-frame
+    vaf_t vaf, *smpl_vaf;   // total (INFO/AD) and per-sample (FMT/AD) VAF distributions
     unsigned long int *smpl_dp;
     idist_t dp, dp_sites;
     int nusr;
@@ -141,7 +149,9 @@ typedef struct
     gtcmp_t *af_gts_snps, *af_gts_indels; // first bin of af_* stats are singletons
     bin_t *af_bins;
     float *farr;
-    int mfarr;
+    int32_t *iarr;
+    int mfarr, miarr;
+    int nref_tot, nhet_tot, nalt_tot, n_nref, i_nref;
 
     // indel context
     indel_ctx_t *indel_ctx;
@@ -447,6 +457,8 @@ static void init_stats(args_t *args)
     if ( args->af_tag && !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,bcf_hdr_id2int(hdr,BCF_DT_ID,args->af_tag)) )
         error("No such INFO tag: %s\n", args->af_tag);
 
+    int id, has_fmt_ad = ((id=bcf_hdr_id2int(hdr,BCF_DT_ID,"AD"))>=0 && bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id)) ? 1 : 0;
+
     #if QUAL_STATS
         args->m_qual = 999;
     #endif
@@ -501,6 +513,8 @@ static void init_stats(args_t *args)
             stats->smpl_dp     = (unsigned long int *) calloc(args->files->n_smpl,sizeof(unsigned long int));
             stats->smpl_ndp    = (int *) calloc(args->files->n_smpl,sizeof(int));
             stats->smpl_sngl   = (int *) calloc(args->files->n_smpl,sizeof(int));
+            if ( has_fmt_ad )
+                stats->smpl_vaf = (vaf_t*) calloc(args->files->n_smpl,sizeof(vaf_t));
             #if HWE_STATS
                 stats->af_hwe  = (int*) calloc(args->m_af*args->naf_hwe,sizeof(int));
             #endif
@@ -586,6 +600,7 @@ static void destroy_stats(args_t *args)
         free(stats->smpl_dp);
         free(stats->smpl_ndp);
         free(stats->smpl_sngl);
+        free(stats->smpl_vaf);
         idist_destroy(&stats->dp);
         idist_destroy(&stats->dp_sites);
         for (j=0; j<stats->nusr; j++)
@@ -602,6 +617,7 @@ static void destroy_stats(args_t *args)
     for (j=0; j<args->nusr; j++) free(args->usr[j].tag);
     if ( args->af_bins ) bin_destroy(args->af_bins);
     free(args->farr);
+    free(args->iarr);
     free(args->usr);
     free(args->tmp_frm);
     free(args->tmp_iaf);
@@ -615,6 +631,8 @@ static void destroy_stats(args_t *args)
     if (args->filter[1]) filter_destroy(args->filter[1]);
 }
 
+// The array tmp_iaf keeps the index of the AF bin for each allele; the first bin is for singletons.
+// The number of bins is either m_af (101) or as given by the user in --af-bins.
 static void init_iaf(args_t *args, bcf_sr_t *reader)
 {
     bcf1_t *line = reader->buffer[0];
@@ -869,205 +887,279 @@ static void do_snp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
     }
 }
 
-static inline void update_dvaf(stats_t *stats, bcf1_t *line, bcf_fmt_t *fmt, int ismpl, int ial, int jal)
+// Returns the max non-ref AD value and sets *ial to the index of that allele (0 if no non-ref AD value is positive)
+static inline int get_ad(bcf1_t *line, bcf_fmt_t *ad_fmt_ptr, int ismpl, int *ial)
 {
-    if ( !fmt ) return;
-
-    float dvaf;
+    int iv, ad = 0;
+    *ial = 0;
     #define BRANCH_INT(type_t,missing,vector_end) { \
-        type_t *p = (type_t *) (fmt->p + fmt->size*ismpl); \
-        if ( p[ial]==vector_end || p[jal]==vector_end ) return; \
-        if ( p[ial]==missing || p[jal]==missing ) return; \
-        if ( !p[ial] && !p[jal] ) return; \
-        dvaf = (float)p[ial]/(p[ial]+p[jal]); \
+        type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \
+        for (iv=1; iv<ad_fmt_ptr->n; iv++) \
+        { \
+            if ( ptr[iv]==vector_end ) break; \
+            if ( ptr[iv]==missing ) continue; \
+            if ( ad < ptr[iv] ) { ad = ptr[iv]; *ial = iv; }\
+        } \
     }
-    switch (fmt->type) {
+    switch (ad_fmt_ptr->type) {
         case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_missing, bcf_int8_vector_end); break;
         case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
         case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
-        default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, fmt->type); exit(1); break;
+        default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, ad_fmt_ptr->type); exit(1); break;
     }
     #undef BRANCH_INT
-
+    return ad;
+}
+static inline int get_iad(bcf1_t *line, bcf_fmt_t *ad_fmt_ptr, int ismpl, int ial)
+{
+    #define BRANCH_INT(type_t,missing,vector_end) { \
+        type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \
+        if ( ptr[ial]==vector_end ) return 0; \
+        if ( ptr[ial]==missing ) return 0; \
+        return ptr[ial]; \
+    }
+    switch (ad_fmt_ptr->type) {
+        case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_missing, bcf_int8_vector_end); break;
+        case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
+        case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
+        default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, ad_fmt_ptr->type); exit(1); break;
+    }
+    #undef BRANCH_INT
+}
+static inline void update_dvaf(stats_t *stats, bcf1_t *line, int ial, float vaf)
+{
     int len = line->d.var[ial].n;
     if ( len < -stats->m_indel ) len = -stats->m_indel;
     else if ( len > stats->m_indel ) len = stats->m_indel;
     int bin = stats->m_indel + len;
     stats->nvaf[bin]++;
-    stats->dvaf[bin] += dvaf;
+    stats->dvaf[bin] += vaf;
+}
+#define vaf2bin(vaf) ((int)nearbyintf((vaf)/0.05))
+static inline void update_vaf(vaf_t *smpl_vaf, bcf1_t *line, int ial, float vaf)
+{
+    int idx = vaf2bin(vaf);
+    if ( bcf_get_variant_type(line,ial)==VCF_SNP ) smpl_vaf->snv[idx]++;
+    else smpl_vaf->indel[idx]++;
 }
 
-static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int matched)
+static inline int calc_sample_depth(args_t *args, int ismpl, bcf_fmt_t *ad_fmt_ptr, bcf_fmt_t *dp_fmt_ptr)
 {
-    bcf_srs_t *files = args->files;
-    bcf1_t *line = reader->buffer[0];
-    bcf_fmt_t *fmt_ptr;
-    int nref_tot = 0, nhet_tot = 0, nalt_tot = 0;
-    int line_type = bcf_get_variant_types(line);
+    if ( dp_fmt_ptr )
+    {
+        #define BRANCH_INT(type_t,missing,vector_end) { \
+            type_t *ptr = (type_t *) (dp_fmt_ptr->p + dp_fmt_ptr->size*ismpl); \
+            if ( *ptr==missing || *ptr==vector_end ) return -1; \
+            return *ptr; \
+        }
+        switch (dp_fmt_ptr->type) {
+            case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_missing, bcf_int8_vector_end); break;
+            case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
+            case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
+            default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, dp_fmt_ptr->type); exit(1); break;
+        }
+        #undef BRANCH_INT
+    }
+    if ( ad_fmt_ptr )
+    {
+        int iv, dp = 0, has_value = 0;
+        #define BRANCH_INT(type_t,missing,vector_end) { \
+            type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \
+            for (iv=0; iv<ad_fmt_ptr->n; iv++) \
+            { \
+                if ( ptr[iv]==vector_end ) break; \
+                if ( ptr[iv]==missing ) continue; \
+                has_value = 1; \
+                dp += ptr[iv]; \
+            } \
+        }
+        switch (ad_fmt_ptr->type) {
+            case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_missing, bcf_int8_vector_end); break;
+            case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
+            case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
+            default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, ad_fmt_ptr->type); exit(1); break;
+        }
+        #undef BRANCH_INT
+        if ( !has_value ) return -1;
+        return dp;
+    }
+    return -1;
+}
+static inline void sample_gt_stats(args_t *args, stats_t *stats, bcf1_t *line, int ismpl, int gt, int ial, int jal)
+{
+    if ( gt==GT_UNKN )
+    {
+        stats->smpl_missing[ismpl]++;
+        return;
+    }
 
-    if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"GT")) )
+    int var_type = 0;
+    if ( ial>0 ) var_type |= bcf_get_variant_type(line,ial);
+    if ( jal>0 ) var_type |= bcf_get_variant_type(line,jal);
+    if ( gt==GT_HAPL_R || gt==GT_HAPL_A )
     {
-        bcf_fmt_t *ad_fmt_ptr = bcf_get_variant_types(line)&VCF_INDEL ? bcf_get_fmt(reader->header,reader->buffer[0],"AD") : NULL;
+        if ( var_type&VCF_INDEL && stats->smpl_frm_shifts )
+        {
+            assert( ial<line->n_allele );
+            stats->smpl_frm_shifts[ismpl*3 + args->tmp_frm[ial]]++;
+        }
+        if ( gt == GT_HAPL_R ) stats->smpl_hapRef[ismpl]++;
+        if ( gt == GT_HAPL_A ) stats->smpl_hapAlt[ismpl]++;
+        return;
+    }
+    if ( gt != GT_HOM_RR ) { args->n_nref++; args->i_nref = ismpl; }
+    #if HWE_STATS
+        switch (gt)
+        {
+            case GT_HOM_RR: args->nref_tot++; break;
+            case GT_HET_RA: args->nhet_tot++; break;
+            case GT_HET_AA:
+            case GT_HOM_AA: args->nalt_tot++; break;
+        }
+    #endif
 
-        int ref = bcf_acgt2int(*line->d.allele[0]);
-        int is, n_nref = 0, i_nref = 0;
-        for (is=0; is<args->files->n_smpl; is++)
+    if ( var_type&VCF_SNP || var_type==VCF_REF )  // count ALT=. as SNP
+    {
+        if ( gt == GT_HET_RA ) stats->smpl_hets[ismpl]++;
+        else if ( gt == GT_HET_AA ) stats->smpl_hets[ismpl]++;
+        else if ( gt == GT_HOM_RR ) stats->smpl_homRR[ismpl]++;
+        else if ( gt == GT_HOM_AA ) stats->smpl_homAA[ismpl]++;
+        if ( gt != GT_HOM_RR && line->d.var[ial].type&VCF_SNP ) // this is safe, bcf_get_variant_types has been already called
         {
-            int ial, jal;
-            int gt = bcf_gt_type(fmt_ptr, reader->samples[is], &ial, &jal);
-            if ( gt==GT_UNKN )
-            {
-                stats->smpl_missing[is]++;
-                continue;
-            }
-            if ( gt==GT_HAPL_R || gt==GT_HAPL_A )
+            int ref = bcf_acgt2int(*line->d.allele[0]);
+            int alt = bcf_acgt2int(*line->d.allele[ial]);
+            if ( alt<0 ) return;
+            if ( abs(ref-alt)==2 )
+                stats->smpl_ts[ismpl]++;
+            else
+                stats->smpl_tv[ismpl]++;
+        }
+        if ( gt != GT_HOM_RR && line->d.var[jal].type&VCF_SNP && ial!=jal )
+        {
+            int ref = bcf_acgt2int(*line->d.allele[0]);
+            int alt = bcf_acgt2int(*line->d.allele[jal]);
+            if ( alt<0 ) return;
+            if ( abs(ref-alt)==2 )
+                stats->smpl_ts[ismpl]++;
+            else
+                stats->smpl_tv[ismpl]++;
+        }
+    }
+    if ( var_type&VCF_INDEL )
+    {
+        if ( gt != GT_HOM_RR )
+        {
+            stats->smpl_indels[ismpl]++;
+            if ( gt==GT_HET_RA || gt==GT_HET_AA )
             {
-                if ( line_type&VCF_INDEL && stats->smpl_frm_shifts )
+                int is_ins = 0, is_del = 0;
+                if ( bcf_get_variant_type(line,ial)&VCF_INDEL )
                 {
-                    assert( ial<line->n_allele );
-                    stats->smpl_frm_shifts[is*3 + args->tmp_frm[ial]]++;
+                    if ( line->d.var[ial].n < 0 ) is_del = 1;
+                    else is_ins = 1;
                 }
-                if ( gt == GT_HAPL_R ) stats->smpl_hapRef[is]++;
-                if ( gt == GT_HAPL_A ) stats->smpl_hapAlt[is]++;
-                continue;
-            }
-            if ( gt != GT_HOM_RR ) { n_nref++; i_nref = is; }
-            #if HWE_STATS
-                switch (gt)
+                if ( bcf_get_variant_type(line,jal)&VCF_INDEL )
                 {
-                    case GT_HOM_RR: nref_tot++; break;
-                    case GT_HET_RA: nhet_tot++; break;
-                    case GT_HET_AA:
-                    case GT_HOM_AA: nalt_tot++; break;
-                }
-            #endif
-            int var_type = 0;
-            if ( ial>0 ) var_type |= bcf_get_variant_type(line,ial);
-            if ( jal>0 ) var_type |= bcf_get_variant_type(line,jal);
-            if ( var_type&VCF_SNP || var_type==VCF_REF )  // count ALT=. as SNP
-            {
-                if ( gt == GT_HET_RA ) stats->smpl_hets[is]++;
-                else if ( gt == GT_HET_AA ) stats->smpl_hets[is]++;
-                else if ( gt == GT_HOM_RR ) stats->smpl_homRR[is]++;
-                else if ( gt == GT_HOM_AA ) stats->smpl_homAA[is]++;
-                if ( gt != GT_HOM_RR && line->d.var[ial].type&VCF_SNP ) // this is safe, bcf_get_variant_types has been already called
-                {
-                    int alt = bcf_acgt2int(*line->d.allele[ial]);
-                    if ( alt<0 ) continue;
-                    if ( abs(ref-alt)==2 )
-                        stats->smpl_ts[is]++;
-                    else
-                        stats->smpl_tv[is]++;
+                    if ( line->d.var[jal].n < 0 ) is_del = 1;
+                    else is_ins = 1;
                 }
+                // Note that alt-het genotypes with both ins and del allele are counted twice!!
+                if ( is_del ) stats->smpl_del_hets[ismpl]++;
+                if ( is_ins ) stats->smpl_ins_hets[ismpl]++;
             }
-            if ( var_type&VCF_INDEL )
+            else if ( gt==GT_HOM_AA )
             {
-                if ( gt != GT_HOM_RR )
-                {
-                    stats->smpl_indels[is]++;
-
-                    if ( gt==GT_HET_RA || gt==GT_HET_AA )
-                    {
-                        int is_ins = 0, is_del = 0;
-                        if ( bcf_get_variant_type(line,ial)&VCF_INDEL )
-                        {
-                            if ( line->d.var[ial].n < 0 ) is_del = 1;
-                            else is_ins = 1;
-                            update_dvaf(stats,line,ad_fmt_ptr,is,ial,jal);
-                        }
-                        if ( bcf_get_variant_type(line,jal)&VCF_INDEL )
-                        {
-                            if ( line->d.var[jal].n < 0 ) is_del = 1;
-                            else is_ins = 1;
-                            update_dvaf(stats,line,ad_fmt_ptr,is,jal,ial);
-                        }
-                        // Note that alt-het genotypes with both ins and del allele are counted twice!!
-                        if ( is_del ) stats->smpl_del_hets[is]++;
-                        if ( is_ins ) stats->smpl_ins_hets[is]++;
-                    }
-                    else if ( gt==GT_HOM_AA )
-                    {
-                        if ( line->d.var[ial].n < 0 ) stats->smpl_del_homs[is]++;
-                        else stats->smpl_ins_homs[is]++;
-                    }
-                }
-                if ( stats->smpl_frm_shifts )
-                {
-                    assert( ial<line->n_allele && jal<line->n_allele );
-                    stats->smpl_frm_shifts[is*3 + args->tmp_frm[ial]]++;
-                    stats->smpl_frm_shifts[is*3 + args->tmp_frm[jal]]++;
-                }
+                if ( line->d.var[ial].n < 0 ) stats->smpl_del_homs[ismpl]++;
+                else stats->smpl_ins_homs[ismpl]++;
             }
         }
-        if ( n_nref==1 ) stats->smpl_sngl[i_nref]++;
+        if ( stats->smpl_frm_shifts )
+        {
+            assert( ial<line->n_allele && jal<line->n_allele );
+            stats->smpl_frm_shifts[ismpl*3 + args->tmp_frm[ial]]++;
+            stats->smpl_frm_shifts[ismpl*3 + args->tmp_frm[jal]]++;
+        }
     }
+}
+static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int matched)
+{
+    bcf_srs_t *files = args->files;
+    bcf1_t *line = reader->buffer[0];
 
-    #if HWE_STATS
-        if ( nhet_tot + nref_tot + nalt_tot )
+    args->nref_tot = 0;
+    args->nhet_tot = 0;
+    args->nalt_tot = 0;
+    args->n_nref   = 0;
+    args->i_nref   = 0;
+
+    bcf_fmt_t *gt_fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"GT");
+    bcf_fmt_t *ad_fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"AD");
+    bcf_fmt_t *dp_fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"DP");
+
+    int is;
+    for (is=0; is<args->files->n_smpl; is++)
+    {
+        // Determine depth
+        int dp = calc_sample_depth(args,is,ad_fmt_ptr,dp_fmt_ptr);
+        if ( dp>0 )
         {
-            float het_frac = (float)nhet_tot/(nhet_tot + nref_tot + nalt_tot);
-            int idx = het_frac*(args->naf_hwe - 1);
-//check me: what is this?
-            if ( line->n_allele>1 ) idx += args->naf_hwe*args->tmp_iaf[1];
-            stats->af_hwe[idx]++;
+            (*idist(&stats->dp, dp))++;
+            stats->smpl_ndp[is]++;
+            stats->smpl_dp[is] += dp;
         }
-    #endif
 
-    if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"DP")) )
-    {
-        #define BRANCH_INT(type_t,missing,vector_end) { \
-            int is; \
-            for (is=0; is<args->files->n_smpl; is++) \
-            { \
-                type_t *p = (type_t *) (fmt_ptr->p + fmt_ptr->size*is); \
-                if ( *p==vector_end ) continue; \
-                if ( *p!=missing ) \
-                { \
-                    (*idist(&stats->dp, *p))++; \
-                    stats->smpl_ndp[is]++; \
-                    stats->smpl_dp[is] += *p; \
-                } \
-            } \
+        // Determine genotype
+        int ial, jal, gt=GT_UNKN;
+        if ( gt_fmt_ptr )
+        {
+            gt = bcf_gt_type(gt_fmt_ptr, reader->samples[is], &ial, &jal);
+            sample_gt_stats(args,stats,line,is,gt,ial,jal);
         }
-        switch (fmt_ptr->type) {
-            case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_missing, bcf_int8_vector_end); break;
-            case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
-            case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
-            default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); exit(1); break;
+
+        // Determine variant allele frequency
+        if ( dp>0 && ad_fmt_ptr )
+        {
+            float iad = 0, jad = 0;
+            if ( gt==GT_UNKN )    // GT not available
+            {
+                iad = get_ad(line,ad_fmt_ptr,is,&ial);
+            }
+            else if ( gt!=GT_UNKN )
+            {
+                iad = ial==0 ? 0 : get_iad(line,ad_fmt_ptr,is,ial);
+                jad = jal==0 ? 0 : get_iad(line,ad_fmt_ptr,is,jal);
+            }
+            if ( iad )
+            {
+                update_dvaf(stats,line,ial,(float)iad/dp);
+                update_vaf(&stats->smpl_vaf[is],line,ial,(float)iad/dp);
+            }
+            if ( jad && iad!=jad )
+            {
+                update_dvaf(stats,line,jal,(float)jad/dp);
+                update_vaf(&stats->smpl_vaf[is],line,jal,(float)jad/dp);
+            }
         }
-        #undef BRANCH_INT
     }
-    else if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"AD")) )
+    if ( args->n_nref==1 ) stats->smpl_sngl[args->i_nref]++;
+
+#if HWE_STATS
+    if ( gt_fmt_ptr && line->n_allele > 1 && (args->nref_tot || args->nhet_tot || args->nalt_tot) )
     {
-        #define BRANCH_INT(type_t,missing,vector_end) { \
-            int is,iv; \
-            for (is=0; is<args->files->n_smpl; is++) \
-            { \
-                type_t *p = (type_t *) (fmt_ptr->p + fmt_ptr->size*is); \
-                int dp = 0, has_value = 0; \
-                for (iv=0; iv<fmt_ptr->n; iv++) \
-                { \
-                    if ( p[iv]==vector_end ) break; \
-                    if ( p[iv]==missing ) continue; \
-                    has_value = 1; \
-                    dp += p[iv]; \
-                } \
-                if ( has_value ) \
-                { \
-                    (*idist(&stats->dp, dp))++; \
-                    stats->smpl_ndp[is]++; \
-                    stats->smpl_dp[is] += dp; \
-                } \
-            } \
-        }
-        switch (fmt_ptr->type) {
-            case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_missing, bcf_int8_vector_end); break;
-            case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
-            case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
-            default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); exit(1); break;
-        }
-        #undef BRANCH_INT
+        // Number of heterozygous genotypes observed for any given allele frequency. This is used
+        // by plot-vcfstats to show the observed vs expected number of hets. There the expected number
+        // of hets is calculated from the probability P(het) = 2*AF*(1-AF).
+        // The array af_hwe is organized as follows
+        //      m_af     .. number of allele frequency bins
+        //      naf_hwe  .. the number of het genotype frequency bins
+        //      iallele_freq*naf_hwe + ihet_freq
+        //
+        float het_frac = (float)args->nhet_tot / (args->nref_tot + args->nhet_tot + args->nalt_tot);
+        int ihet_freq = het_frac * (args->naf_hwe - 1);
+        int idx = ihet_freq + args->tmp_iaf[1] * args->naf_hwe;
+        stats->af_hwe[idx]++;
     }
+#endif
 
     if ( matched==3 )
     {
@@ -1200,8 +1292,8 @@ static void do_vcf_stats(args_t *args)
         if ( files->n_smpl )
             do_sample_stats(args, stats, reader, ret);
 
-        if ( bcf_get_info_int32(reader->header,line,"DP",&args->tmp_iaf,&args->ntmp_iaf)==1 )
-            (*idist(&stats->dp_sites, args->tmp_iaf[0]))++;
+        if ( bcf_get_info_int32(reader->header,line,"DP",&args->iarr,&args->miarr)==1 )
+            (*idist(&stats->dp_sites, args->iarr[0]))++;
     }
 }
 
@@ -1736,6 +1828,24 @@ static void print_stats(args_t *args)
         }
         #endif
     }
+
+    if ( args->stats[0].smpl_vaf )
+    {
+        printf("# VAF, Variant Allele Frequency determined as fraction of alternate reads in FORMAT/AD\n");
+        printf("# VAF\t[2]id\t[3]sample\t[4]SNV VAF distribution\t[5]indel VAF distribution\n");
+        for (id=0; id<args->nstats; id++)
+        {
+            stats_t *stats = &args->stats[id];
+            for (i=0; i<args->files->n_smpl; i++)
+            {
+                printf("VAF\t%d\t%s\t", id,args->files->samples[i]);
+                for (j=0; j<21; j++) printf("%s%d",j?",":"",stats->smpl_vaf[i].snv[j]);
+                printf("\t");
+                for (j=0; j<21; j++) printf("%s%d",j?",":"",stats->smpl_vaf[i].indel[j]);
+                printf("\n");
+            }
+        }
+    }
 }
 
 static void usage(void)
diff --git a/vcfview.c b/vcfview.c
index 96dcbc7b5..e09efa0bc 100644
--- a/vcfview.c
+++ b/vcfview.c
@@ -1,6 +1,6 @@
 /*  vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files.
 
-    Copyright (C) 2013-2022 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
 
     Author: Shane McCarthy <sm15@sanger.ac.uk>
 
@@ -76,6 +76,8 @@ typedef struct _args_t
     char *include_types, *exclude_types;
     int include, exclude;
     int record_cmd_line;
+    char *index_fn;
+    int write_index;
     htsFile *out;
 }
 args_t;
@@ -532,6 +534,7 @@ static void usage(args_t *args)
     fprintf(stderr, "    -u/U, --uncalled/--exclude-uncalled    Select/exclude sites without a called genotype\n");
     fprintf(stderr, "    -v/V, --types/--exclude-types LIST     Select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n");
     fprintf(stderr, "    -x/X, --private/--exclude-private      Select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n");
+    fprintf(stderr, "          --write-index                    Automatically index the output files [off]\n");
     fprintf(stderr, "\n");
     exit(1);
 }
@@ -548,6 +551,7 @@ int main_vcfview(int argc, char *argv[])
     args->output_type = FT_VCF;
     args->n_threads = 0;
     args->record_cmd_line = 1;
+    args->write_index = 0;
     args->min_ac = args->max_ac = args->min_af = args->max_af = -1;
     args->regions_overlap = 1;
     args->targets_overlap = 0;
@@ -596,6 +600,7 @@ int main_vcfview(int argc, char *argv[])
         {"phased",no_argument,NULL,'p'},
         {"exclude-phased",no_argument,NULL,'P'},
         {"no-version",no_argument,NULL,8},
+        {"write-index",no_argument,NULL,10},
         {NULL,0,NULL,0}
     };
     char *tmp;
@@ -727,6 +732,7 @@ int main_vcfview(int argc, char *argv[])
                 break;
             case  9 : args->n_threads = strtol(optarg, 0, 0); break;
             case  8 : args->record_cmd_line = 0; break;
+            case 10 : args->write_index = 1; break;
             case '?': usage(args); break;
             default: error("Unknown argument: %s\n", optarg);
         }
@@ -783,6 +789,8 @@ int main_vcfview(int argc, char *argv[])
     else if ( args->output_type & FT_BCF )
         error("BCF output requires header, cannot proceed with -H\n");
 
+    if ( args->write_index && init_index(args->out,out_hdr,args->fn_out,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->fn_out);
+
     int ret = 0;
     if (!args->header_only)
     {
@@ -795,7 +803,18 @@ int main_vcfview(int argc, char *argv[])
         ret = args->files->errnum;
         if ( ret ) fprintf(stderr,"Error: %s\n", bcf_sr_strerror(args->files->errnum));
     }
-    hts_close(args->out);
+
+    if (args->write_index)
+    {
+        if (bcf_idx_save(args->out) < 0)
+        {
+            if ( hts_close(args->out)!=0 ) error("Error: close failed %s\n", args->fn_out?args->fn_out:"stdout");
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
+
+    if ( hts_close(args->out)!=0 ) error("Error: close failed %s\n", args->fn_out?args->fn_out:"stdout");
     destroy_data(args);
     bcf_sr_destroy(args->files);
     free(args);
diff --git a/version.c b/version.c
index 4306d4011..38417a78b 100644
--- a/version.c
+++ b/version.c
@@ -1,6 +1,6 @@
 /*  version.c -- report version numbers for plugins.
 
-    Copyright (C) 2014-2021 Genome Research Ltd.
+    Copyright (C) 2014-2023 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -72,22 +72,26 @@ const char *hts_bcf_wmode(int file_type)
 const char *hts_bcf_wmode2(int file_type, const char *fname)
 {
     if ( !fname ) return hts_bcf_wmode(file_type);
-    int len = strlen(fname);
-    if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) return hts_bcf_wmode(FT_BCF|FT_GZ);
-    if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) return hts_bcf_wmode(FT_VCF);
-    if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) return hts_bcf_wmode(FT_VCF|FT_GZ);
-    if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) return hts_bcf_wmode(FT_VCF|FT_GZ);
+    const char *end = fname ? strstr(fname, HTS_IDX_DELIM) : NULL;  // first HTS_IDX_DELIM in fname, if any
+    if ( !end ) end = fname ? fname + strlen(fname) : fname;  // no delimiter: end of the string (fname is non-NULL here, checked above)
+    int len = end - fname;  // length of the part of fname that precedes the delimiter
+    if ( len >= 4 && !strncasecmp(".bcf",fname+len-4,4) ) return hts_bcf_wmode(FT_BCF|FT_GZ);  // extensions are compared case-insensitively, ending at 'end'
+    if ( len >= 4 && !strncasecmp(".vcf",fname+len-4,4) ) return hts_bcf_wmode(FT_VCF);
+    if ( len >= 7 && !strncasecmp(".vcf.gz",fname+len-7,7) ) return hts_bcf_wmode(FT_VCF|FT_GZ);
+    if ( len >= 8 && !strncasecmp(".vcf.bgz",fname+len-8,8) ) return hts_bcf_wmode(FT_VCF|FT_GZ);
     return hts_bcf_wmode(file_type);
 }
 
 void set_wmode(char dst[8], int file_type, const char *fname, int clevel)
 {
     const char *ret = NULL;
-    int len = fname ? strlen(fname) : 0;
-    if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) ret = hts_bcf_wmode(FT_BCF|FT_GZ);
-    else if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) ret = hts_bcf_wmode(FT_VCF);
-    else if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ);
-    else if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ);
+    const char *end = fname ? strstr(fname, HTS_IDX_DELIM) : NULL;
+    if ( !end ) end = fname ? fname + strlen(fname) : fname;
+    int len = end - fname;
+    if ( len >= 4 && !strncasecmp(".bcf",fname+len-4,4) ) ret = hts_bcf_wmode(FT_BCF|FT_GZ);
+    else if ( len >= 4 && !strncasecmp(".vcf",fname+len-4,4) ) ret = hts_bcf_wmode(FT_VCF);
+    else if ( len >= 7 && !strncasecmp(".vcf.gz",fname+len-7,7) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ);
+    else if ( len >= 8 && !strncasecmp(".vcf.bgz",fname+len-8,8) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ);
     else ret = hts_bcf_wmode(file_type);
     if ( clevel>=0 && clevel<=9 )
     {
@@ -107,3 +111,33 @@ int parse_overlap_option(const char *arg)
     else if ( strcasecmp(arg, "variant") == 0 || strcmp(arg, "2") == 0 ) return 2;
     else return -1;
 }
+
+// Initialise indexing of fh via bcf_idx_init(); see also samtools/sam_utils.c auto_index().
+// Index name: the text after HTS_IDX_DELIM in fname if present, else fname + ".csi".
+// min_shift passed on is 0 for index names ending in ".tbi" and 14 (CSI) otherwise.
+// Returns 0 with *idx_fname set to a heap-allocated name that the caller must free.
+// Returns -1 if fname is NULL, empty or "-" (*idx_fname untouched), if the name cannot
+// be allocated, or if bcf_idx_init() fails; then *idx_fname is NULL and nothing leaks.
+int init_index(htsFile *fh, bcf_hdr_t *hdr, char *fname, char **idx_fname)
+{
+    int min_shift = 14; // CSI
+
+    if ( !fname || !*fname || !strcmp(fname, "-") ) return -1;
+
+    char *delim = strstr(fname, HTS_IDX_DELIM);
+    if (delim)
+    {
+        if ( !(*idx_fname = strdup(delim + strlen(HTS_IDX_DELIM))) ) return -1;
+        size_t l = strlen(*idx_fname);
+        if ( l >= 4 && strcmp(*idx_fname + l - 4, ".tbi")==0 ) min_shift = 0;
+    }
+    else
+    {
+        if ( !(*idx_fname = malloc(strlen(fname)+6)) ) return -1;
+        sprintf(*idx_fname, "%s.csi", fname);
+    }
+
+    if ( bcf_idx_init(fh, hdr, min_shift, *idx_fname) >= 0 ) return 0;
+    free(*idx_fname); *idx_fname = NULL;    // init failed: release the name instead of leaking it
+    return -1;
+}
diff --git a/version.sh b/version.sh
index 55d804296..69bf963de 100755
--- a/version.sh
+++ b/version.sh
@@ -24,7 +24,7 @@
 # DEALINGS IN THE SOFTWARE.
 
 # Master version, for use in tarballs or non-git source copies
-VERSION=1.17
+VERSION=1.18
 
 # If we have a git clone, then check against the current tag
 if [ -e .git ]