diff --git a/INSTALL b/INSTALL
index bcdd2f4e3..379c5aa7e 100644
--- a/INSTALL
+++ b/INSTALL
@@ -232,8 +232,10 @@ Alpine Linux
Note: To install gsl-dev, it may be necessary to enable the "community"
repository in /etc/apk/repositories.
+Note: some older Alpine versions use libressl-dev rather than openssl-dev.
+
doas apk update # Ensure the package list is up to date
-doas apk add autoconf automake make gcc musl-dev perl bash zlib-dev bzip2-dev xz-dev curl-dev libressl-dev gsl-dev perl-dev
+doas apk add autoconf automake make gcc musl-dev perl bash zlib-dev bzip2-dev xz-dev curl-dev openssl-dev gsl-dev perl-dev
OpenSUSE
--------
diff --git a/LICENSE b/LICENSE
index 6d40ae2d1..46dc0e0e3 100644
--- a/LICENSE
+++ b/LICENSE
@@ -723,11 +723,12 @@ Public License instead of this License. But first, please read
-----------------------------------------------------------------------------
-LICENSE FOR VariantKey (https://github.com/Genomicsplc/variantkey)
+LICENSE FOR VariantKey (https://github.com/tecnickcom/variantkey)
The MIT License
Copyright (c) 2017-2018 GENOMICS plc
+Copyright (c) 2018-2023 Nicola Asuni - Tecnick.com
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
diff --git a/Makefile b/Makefile
index b0cd99ead..7013cd594 100644
--- a/Makefile
+++ b/Makefile
@@ -42,7 +42,7 @@ OBJS = main.o vcfindex.o tabix.o \
regidx.o smpl_ilist.o csq.o vcfbuf.o \
mpileup.o bam2bcf.o bam2bcf_indel.o bam2bcf_iaux.o read_consensus.o bam_sample.o \
vcfsort.o cols.o extsort.o dist.o abuf.o \
- ccall.o em.o prob1.o kmin.o str_finder.o
+ ccall.o em.o prob1.o kmin.o str_finder.o gff.o
PLUGIN_OBJS = vcfplugin.o
prefix = /usr/local
@@ -104,7 +104,7 @@ endif
include config.mk
-PACKAGE_VERSION = 1.17
+PACKAGE_VERSION = 1.18
# If building from a Git repository, replace $(PACKAGE_VERSION) with the Git
# description of the working tree: either a release tag with the same value
@@ -246,7 +246,7 @@ vcfgtcheck.o: vcfgtcheck.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htsli
vcfindex.o: vcfindex.c $(htslib_vcf_h) $(htslib_tbx_h) $(htslib_kstring_h) $(htslib_bgzf_h) $(bcftools_h)
vcfisec.o: vcfisec.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcftools_h) $(filter_h)
vcfmerge.o: vcfmerge.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) regidx.h $(bcftools_h) vcmp.h $(htslib_khash_h)
-vcfnorm.o: vcfnorm.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_faidx_h) $(htslib_khash_str2int_h) $(bcftools_h) rbuf.h abuf.h
+vcfnorm.o: vcfnorm.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_faidx_h) $(htslib_khash_str2int_h) $(bcftools_h) rbuf.h abuf.h gff.h
vcfquery.o: vcfquery.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_str2int_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(convert_h) $(smpl_ilist_h)
vcfroh.o: vcfroh.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(bcftools_h) HMM.h $(smpl_ilist_h) $(filter_h)
vcfcnv.o: vcfcnv.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(htslib_khash_str2int_h) $(bcftools_h) HMM.h rbuf.h
@@ -289,6 +289,7 @@ vcfbuf.o: vcfbuf.c $(htslib_vcf_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcf
abuf.o: abuf.c $(htslib_vcf_h) $(bcftools_h) rbuf.h abuf.h
extsort.o: extsort.c $(bcftools_h) extsort.h kheap.h
smpl_ilist.o: smpl_ilist.c $(bcftools_h) $(smpl_ilist_h)
+gff.o: gff.c gff.h regidx.h
csq.o: csq.c $(htslib_hts_h) $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_h) $(htslib_khash_str2int_h) $(htslib_kseq_h) $(htslib_faidx_h) $(bcftools_h) $(filter_h) regidx.h kheap.h $(smpl_ilist_h) rbuf.h
# test programs
diff --git a/NEWS b/NEWS
index 06c0593ca..62c4699ac 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,121 @@
+## Release 1.18 (25th July 2023)
+
+
+Changes affecting the whole of bcftools, or multiple commands:
+
+* Support auto indexing during writing BCF and VCF.gz via new `--write-index` option
+
+
+Changes affecting specific commands:
+
+* bcftools annotate
+
+ - The `-m, --mark-sites` option can be now used to mark all sites without the
+ need to provide the `-a` file (#1861)
+
+ - Fix a bug where the `-m` function did not respect the `--min-overlap` option (#1869)
+
+ - Fix a bug when update of INFO/END results in assertion error (#1957)
+
+* bcftools concat
+
+ - New option `--drop-genotypes`
+
+* bcftools consensus
+
+ - Support higher-ploidy genotypes with `-H, --haplotype` (#1892)
+
+ - Allow `--mark-ins` and `--mark-snv` with a character, similarly to `--mark-del`
+
+* bcftools convert
+
+ - Support for conversion from tab-delimited files (CHROM,POS,REF,ALT) to sites-only VCFs
+
+* bcftools csq
+
+ - New `--unify-chr-names` option to automatically unify different chromosome
+ naming conventions in the input GFF, fasta and VCF files (e.g. "chrX" vs "X")
+
+ - More versatility in parsing various flavors of GFF
+
+ - A new `--dump-gff` option to help with debugging and investigating the internals
+ of hGFF parsing
+
+ - When printing consequences in nonsense mediated decay transcripts, include 'NMD_transcript'
+ in the consequence part of the annotation. This is to make filtering easier and analogous to
+ VEP annotations. For example the consequence annotation
+ 3_prime_utr|PCGF3|ENST00000430644|NMD
+ is newly printed as
+ 3_prime_utr&NMD_transcript|PCGF3|ENST00000430644|NMD
+
+* bcftools gtcheck
+
+ - Add stats for the number of sites matched in the GT-vs-GT, GT-vs-PL, etc modes. This
+ information is important for interpretation of the discordance score, as only the
+ GT-vs-GT matching can be interpreted as the number of mismatching genotypes.
+
+* bcftools +mendelian2
+
+ - Fix in command line argument parsing, the `-p` and `-P` options were not
+ functioning (#1906)
+
+* bcftools merge
+
+ - New `-M, --missing-rules` option to control the behavior of merging of vector tags
+ to prevent mixtures of known and missing values in tags when desired
+
+ - Use values pertaining to the unknown allele (<*> or ) when available
+ to prevent mixtures of known and missing values (#1888)
+
+ - Revamped line matching code to fix problems in gVCF merging where split gVCF blocks
+ would not update genotypes (#1891, #1164).
+
+* bcftool mpileup
+
+ - Fix a bug in --indels-v2.0 which caused an endless loop when CIGAR operator 'H' or 'P'
+ was encountered
+
+* bcftools norm
+
+ - The `-m, --multiallelics +` mode now preserves phasing (#1893)
+
+ - Symbolic alleles are now normalized too (#1919)
+
+ - New `-g, --gff-annot` option to right-align indels in forward transcripts to follow
+ HGVS 3'rule (#1929)
+
+* bcftools query
+
+ - Force newline character in formatting expression when not given explicitly
+
+ - Fix `-H` header output in formatting expressions containing newlines
+
+* bcftools reheader
+
+ - Make `-f, --fai` aware of long contigs not representable by 32-bit integer (#1959)
+
+* bcftools +split-vep
+
+ - Prevent a segfault when `-i/-e` use a VEP subfield not included in `-f` or `-c` (#1877)
+
+ - New `-X, --keep-sites` option complementing the existing `-x, --drop-sites` options
+
+ - Force newline character in formatting expression when not given explicitly
+
+ - Fix a subtle ambiguity: identical rows must be returned when `-s` is applied regardless
+ of `-f` containing the `-a` VEP tag itself or not.
+
+* bcftools stats
+
+ - Collect new VAF (variant allele frequency) statistics from FORMAT/AD field
+
+ - When counting transitions/transversions, consider also alternate het genotypes
+
+* plot-vcfstats
+
+ - Add three new VAF plots
+
+
## Release 1.17 (21st February 2023)
diff --git a/bcftools.h b/bcftools.h
index c3f7ded16..bba71e3b6 100644
--- a/bcftools.h
+++ b/bcftools.h
@@ -1,6 +1,6 @@
/* bcftools.h -- utility function declarations.
- Copyright (C) 2013-2022 Genome Research Ltd.
+ Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek
@@ -49,6 +49,9 @@ void error(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2
// newline will be added by the function.
void error_errno(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2);
+// For on the fly index creation with --write-index
+int init_index(htsFile *fh, bcf_hdr_t *hdr, char *fname, char **idx_fname);
+
void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd);
const char *hts_bcf_wmode(int file_type);
const char *hts_bcf_wmode2(int file_type, const char *fname);
diff --git a/cigar_state.h b/cigar_state.h
index a12a70995..dacac14ac 100644
--- a/cigar_state.h
+++ b/cigar_state.h
@@ -107,6 +107,12 @@ static inline int cstate_seek_fwd(cigar_state_t *cs, hts_pos_t *pos_ptr, int tri
cs->icig++;
continue;
}
+ if ( op==BAM_CHARD_CLIP || op==BAM_CPAD )
+ {
+ cs->icig++;
+ continue;
+ }
+ error("FIXME: not ready for CIGAR operator %d\n",op);
}
// the read starts after pos
if ( trim_left )
@@ -175,6 +181,12 @@ static inline int cstate_seek_op_fwd(cigar_state_t *cs, hts_pos_t pos, int seek_
cs->icig++;
continue;
}
+ if ( op==BAM_CHARD_CLIP || op==BAM_CPAD )
+ {
+ cs->icig++;
+ continue;
+ }
+ error("FIXME: not ready for CIGAR operator %d\n",op);
}
return cs->icig < cs->ncig ? -1 : -2;
}
diff --git a/consensus.c b/consensus.c
index 397d45f98..2b58670c7 100644
--- a/consensus.c
+++ b/consensus.c
@@ -54,8 +54,8 @@
#define PICK_SHORT 8
#define PICK_IUPAC 16
-#define TO_UPPER 0
-#define TO_LOWER 1
+#define TO_UPPER 1
+#define TO_LOWER 2
typedef struct
{
@@ -324,7 +324,7 @@ static void init_region(args_t *args, char *line)
{
char *ss, *se = line;
while ( *se && !isspace(*se) && *se!=':' ) se++;
- int from = 0, to = 0;
+ hts_pos_t from = 0, to = 0;
char tmp = 0, *tmp_ptr = NULL;
if ( *se )
{
@@ -356,7 +356,14 @@ static void init_region(args_t *args, char *line)
args->fa_frz_mod = -1;
args->fa_case = -1;
args->vcf_rbuf.n = 0;
- bcf_sr_seek(args->files,line,args->fa_ori_pos);
+
+ kstring_t str = {0,0,0};
+ if ( from==0 ) from = 1;
+ if ( to==0 ) to = HTS_POS_MAX;
+ ksprintf(&str,"%s:%"PRIhts_pos"-%"PRIhts_pos,line,from,to);
+ bcf_sr_set_regions(args->files,line,0);
+ free(str.s);
+
if ( tmp_ptr ) *tmp_ptr = tmp;
fprintf(args->fp_out,">%s%s\n",args->chr_prefix?args->chr_prefix:"",line);
if ( args->chain_fname )
@@ -466,25 +473,37 @@ static char *mark_del(char *ref, int rlen, char *alt, int mark)
static void mark_ins(char *ref, char *alt, char mark)
{
int i, nref = strlen(ref), nalt = strlen(alt);
- if ( mark=='l' )
+ if ( mark==TO_LOWER )
for (i=nref; imark_del = optarg[0]; break;
case 2 :
- if ( !strcasecmp(optarg,"uc") ) args->mark_ins = 'u';
- else if ( !strcasecmp(optarg,"lc") ) args->mark_ins = 'l';
+ if ( !strcasecmp(optarg,"uc") ) args->mark_ins = TO_UPPER;
+ else if ( !strcasecmp(optarg,"lc") ) args->mark_ins = TO_LOWER;
+ else if ( !optarg[1] && optarg[0]>32 && optarg[0]<127 ) args->mark_ins = optarg[0];
else error("The argument is not recognised: --mark-ins %s\n",optarg);
break;
case 3 :
- if ( !strcasecmp(optarg,"uc") ) args->mark_snv = 'u';
- else if ( !strcasecmp(optarg,"lc") ) args->mark_snv = 'l';
+ if ( !strcasecmp(optarg,"uc") ) args->mark_snv = TO_UPPER;
+ else if ( !strcasecmp(optarg,"lc") ) args->mark_snv = TO_LOWER;
+ else if ( !optarg[1] && optarg[0]>32 && optarg[0]<127 ) args->mark_snv = optarg[0];
else error("The argument is not recognised: --mark-snv %s\n",optarg);
break;
case 'p': args->chr_prefix = optarg; break;
@@ -1211,7 +1231,8 @@ int main_consensus(int argc, char *argv[])
{
char *tmp;
args->haplotype = strtol(optarg, &tmp, 10);
- if ( tmp==optarg || *tmp ) error("Error: Could not parse --haplotype %s, expected numeric argument\n", optarg);
+ if ( tmp==optarg || (*tmp && strcasecmp(tmp,"pIu")) ) error("Error: Could not parse \"--haplotype %s\", expected number of number followed with \"pIu\"\n", optarg);
+ if ( *tmp ) args->allele |= PICK_IUPAC;
if ( args->haplotype <=0 ) error("Error: Expected positive integer with --haplotype\n");
}
break;
diff --git a/convert.c b/convert.c
index 80e54747d..07ff01862 100644
--- a/convert.c
+++ b/convert.c
@@ -106,6 +106,7 @@ struct _convert_t
char **used_tags_list;
int nused_tags;
int allow_undef_tags;
+ int force_newline;
uint8_t **subset_samples;
};
@@ -648,6 +649,7 @@ static void process_type(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp
static void process_line(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
vcf_format1(convert->header, line, str);
+ if ( str->s[str->l-1]=='\n' ) str->l--;
}
static void process_chrom_pos_id(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
@@ -1560,7 +1562,6 @@ void convert_destroy(convert_t *convert)
int convert_header(convert_t *convert, kstring_t *str)
{
int i, icol = 0, l_ori = str->l;
- bcf_hdr_t *hdr = convert->header;
// Supress the header output if LINE is present
for (i=0; infmt; i++)
@@ -1568,6 +1569,12 @@ int convert_header(convert_t *convert, kstring_t *str)
if ( i!=convert->nfmt )
return str->l - l_ori;
+ // Header formatting becomes problematic when the formatting expression contains a newline.
+ // Simple cases like
+ // -f'[%CHROM %POS %SAMPLE\n]'
+ // can be handled quite easily with has_fmt_newline. Note this will not work if multiple newlines
+ // are present.
+ int has_fmt_newline = 0;
kputc('#', str);
for (i=0; infmt; i++)
{
@@ -1578,18 +1585,25 @@ int convert_header(convert_t *convert, kstring_t *str)
while ( convert->fmt[j].is_gt_field ) j++;
for (js=0; jsnsamples; js++)
{
- int ks = convert->samples[js];
for (k=i; kfmt[k].type == T_SEP )
{
- if ( convert->fmt[k].key ) kputs(convert->fmt[k].key, str);
+ if ( convert->fmt[k].key )
+ {
+ char *tmp = convert->fmt[k].key;
+ while ( *tmp )
+ {
+ if ( *tmp=='\n' ) has_fmt_newline = 1;
+ else kputc(*tmp,str);
+ tmp++;
+ }
+ }
}
- else if ( convert->fmt[k].type == T_SAMPLE )
- ksprintf(str, "[%d]%s", ++icol, convert->fmt[k].key);
else
- ksprintf(str, "[%d]%s:%s", ++icol, hdr->samples[ks], convert->fmt[k].key);
+ ksprintf(str, "[%d]%s", ++icol, convert->fmt[k].key);
}
+ if ( has_fmt_newline ) break;
}
i = j-1;
continue;
@@ -1602,6 +1616,7 @@ int convert_header(convert_t *convert, kstring_t *str)
}
ksprintf(str, "[%d]%s", ++icol, convert->fmt[i].key);
}
+ if ( has_fmt_newline ) kputc('\n',str);
return str->l - l_ori;
}
@@ -1678,6 +1693,47 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
return str->l - l_ori;
}
+static void force_newline_(convert_t *convert)
+{
+ int i, has_newline = 0;
+ for (i=0; infmt; i++)
+ {
+ if ( !convert->fmt[i].key ) continue;
+ char *tmp = convert->fmt[i].key;
+ while (*tmp)
+ {
+ if ( *tmp=='\n' ) { has_newline = 1; break; }
+ tmp++;
+ }
+ if ( has_newline ) break;
+ }
+ if ( has_newline ) return;
+
+ // A newline is not present, force it. But where to add it?
+ // Consider
+ // -f'%CHROM[ %SAMPLE]\n'
+ // vs
+ // -f'[%CHROM %SAMPLE\n]'
+ for (i=0; infmt; i++)
+ if ( !convert->fmt[i].is_gt_field && convert->fmt[i].key ) break;
+
+ if ( i < convert->nfmt )
+ register_tag(convert, "\n", 0, T_SEP); // the first case
+ else
+ {
+ // the second case
+ i = convert->nfmt - 1;
+ if ( !convert->fmt[i].key )
+ {
+ convert->fmt[i].key = strdup("\n");
+ convert->fmt[i].is_gt_field = 1;
+ register_tag(convert, NULL, 0, T_SEP);
+ }
+ else
+ register_tag(convert, "\n", 1, T_SEP);
+ }
+}
+
int convert_set_option(convert_t *convert, enum convert_option opt, ...)
{
int ret = 0;
@@ -1692,6 +1748,10 @@ int convert_set_option(convert_t *convert, enum convert_option opt, ...)
case subset_samples:
convert->subset_samples = va_arg(args, uint8_t**);
break;
+ case force_newline:
+ convert->force_newline = va_arg(args, int);
+ if ( convert->force_newline ) force_newline_(convert);
+ break;
default:
ret = -1;
}
diff --git a/convert.h b/convert.h
index 5bbbc2cde..062607093 100644
--- a/convert.h
+++ b/convert.h
@@ -1,6 +1,6 @@
/* convert.h -- functions for converting between VCF/BCF and related formats.
- Copyright (C) 2014-2021 Genome Research Ltd.
+ Copyright (C) 2014-2023 Genome Research Ltd.
Author: Petr Danecek
@@ -32,6 +32,7 @@ enum convert_option
{
allow_undef_tags,
subset_samples,
+ force_newline,
};
convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char *str);
diff --git a/csq.c b/csq.c
index 49812d4de..f619e061a 100644
--- a/csq.c
+++ b/csq.c
@@ -35,7 +35,7 @@
Read about transcript types here
http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html
http://www.ensembl.org/info/genome/variation/predicted_data.html
- http://www.gencodegenes.org/gencode_biotypes.html
+ https://www.gencodegenes.org/pages/biotypes.html
List of supported biotypes
antisense
@@ -45,6 +45,7 @@
IG_LV_gene
IG_V_gene
lincRNA
+ lncRNA .. generic term for 3prime_overlapping_ncRNA, antisense, bidirectional_promoter_lncRNA, lincRNA, macro_lncRNA, non_coding, processed_transcript, sense_intronic, sense_overlapping
macro_lncRNA
miRNA
misc_RNA
@@ -52,7 +53,7 @@
Mt_tRNA
polymorphic_pseudogene
processed_transcript
- protein_coding
+ protein_coding, mRNA
ribozyme
rRNA
sRNA
@@ -144,6 +145,7 @@
#include
#include
#include
+#include
#include
#include
#include
@@ -153,6 +155,7 @@
#include "kheap.h"
#include "smpl_ilist.h"
#include "rbuf.h"
+#include "gff.h"
#ifndef __FUNCTION__
# define __FUNCTION__ __func__
@@ -162,20 +165,8 @@
#define FLT_INCLUDE 1
#define FLT_EXCLUDE 2
-// Definition of splice_region, splice_acceptor and splice_donor
-#define N_SPLICE_DONOR 2
-#define N_SPLICE_REGION_EXON 3
-#define N_SPLICE_REGION_INTRON 8
-
#define N_REF_PAD 10 // number of bases to avoid boundary effects
-#define STRAND_REV 0
-#define STRAND_FWD 1
-
-#define TRIM_NONE 0
-#define TRIM_5PRIME 1
-#define TRIM_3PRIME 2
-
// How to treat phased/unphased genotypes
#define PHASE_REQUIRE 0 // --phase r
#define PHASE_MERGE 1 // --phase m
@@ -223,6 +214,7 @@
#define CSQ_PRN_STRAND(csq) ((csq)&CSQ_COMPOUND && !((csq)&(CSQ_SPLICE_ACCEPTOR|CSQ_SPLICE_DONOR|CSQ_SPLICE_REGION)))
#define CSQ_PRN_TSCRIPT (~(CSQ_INTRON|CSQ_NON_CODING))
+#define CSQ_PRN_NMD (~(CSQ_INTRON|CSQ_NON_CODING))
#define CSQ_PRN_BIOTYPE CSQ_NON_CODING
// see kput_vcsq()
@@ -254,119 +246,6 @@ const char *csq_strings[] =
"start_retained"
};
-
-// GFF line types
-#define GFF_UNKN_LINE 0
-#define GFF_TSCRIPT_LINE 1
-#define GFF_GENE_LINE 2
-
-
-/*
- Genomic features, for fast lookup by position to overlapping features
-*/
-#define GF_coding_bit 6
-#define GF_is_coding(x) ((x) & (1<aux)
+typedef struct
{
- uint32_t id; // transcript id
- uint32_t beg,end; // transcript's beg and end coordinate (ref strand, 0-based, inclusive)
- uint32_t strand:1, // STRAND_REV or STRAND_FWD
- ncds:31, // number of exons
- mcds;
- gf_cds_t **cds; // ordered list of exons
char *ref; // reference sequence, padded with N_REF_PAD bases on both ends
char *sref; // spliced reference sequence, padded with N_REF_PAD bases on both ends
hap_node_t *root; // root of the haplotype tree
hap_node_t **hap; // pointer to haplotype leaves, two for each sample
int nhap, nsref; // number of haplotypes and length of sref, including 2*N_REF_PAD
- uint32_t trim:2, // complete, 5' or 3' trimmed, see TRIM_* types
- type:30; // one of GF_* types
- gf_gene_t *gene;
-};
-static inline int cmp_tscript(tscript_t **a, tscript_t **b)
+}
+tscript_t;
+static inline int cmp_tscript(gf_tscript_t **a, gf_tscript_t **b)
{
return ( (*a)->end < (*b)->end ) ? 1 : 0;
}
-KHEAP_INIT(trhp, tscript_t*, cmp_tscript)
+KHEAP_INIT(trhp, gf_tscript_t*, cmp_tscript)
typedef khp_trhp_t tr_heap_t;
typedef struct
{
@@ -494,7 +366,7 @@ typedef struct
{
int mstack;
hstack_t *stack;
- tscript_t *tr; // tr->ref: spliced transcript on ref strand
+ gf_tscript_t *tr; // tr->ref: spliced transcript on ref strand
kstring_t sseq; // spliced haplotype sequence on ref strand
kstring_t tseq; // the variable part of translated haplotype transcript, coding strand
kstring_t tref; // the variable part of translated reference transcript, coding strand
@@ -503,77 +375,20 @@ typedef struct
}
hap_t;
-
-/*
- Helper structures, only for initialization
-
- ftr_t
- temporary list of all exons, CDS, UTRs
-*/
-KHASH_MAP_INIT_INT(int2tscript, tscript_t*)
-KHASH_MAP_INIT_INT(int2gene, gf_gene_t*)
-typedef struct
-{
- int type; // GF_CDS, GF_EXON, GF_5UTR, GF_3UTR
- uint32_t beg;
- uint32_t end;
- uint32_t trid;
- uint32_t strand:1; // STRAND_REV,STRAND_FWD
- uint32_t phase:2; // 0, 1, 2, or 3 for unknown
- uint32_t iseq:29;
-}
-ftr_t;
-/*
- Mapping from GFF ID string (such as ENST00000450305 or Zm00001d027230_P001)
- to integer id. To keep the memory requirements low, the original version
- relied on IDs in the form of a string prefix and a numerical id. However,
- it turns out that this assumption is not valid for some ensembl GFFs, see
- for example Zea_mays.AGPv4.36.gff3.gz
- */
-typedef struct
-{
- void *str2id; // khash_str2int
- int nstr, mstr;
- char **str; // numeric id to string
-}
-id_tbl_t;
-typedef struct
-{
- // all exons, CDS, UTRs
- ftr_t *ftr;
- int nftr, mftr;
-
- // mapping from gene id to gf_gene_t
- kh_int2gene_t *gid2gene;
-
- // mapping from transcript id to tscript, for quick CDS anchoring
- kh_int2tscript_t *id2tr;
-
- // sequences
- void *seq2int; // str2int hash
- char **seq;
- int nseq, mseq;
-
- // ignored biotypes
- void *ignored_biotypes;
-
- id_tbl_t gene_ids; // temporary table for mapping between gene id (eg. Zm00001d027245) and a numeric idx
-}
-aux_t;
-
typedef struct _args_t
{
// the main regidx lookups, from chr:beg-end to overlapping features and
// index iterator
+ gff_t *gff;
regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript;
regitr_t *itr;
- // temporary structures, deleted after initializtion
- aux_t init;
-
// text tab-delimited output (out) or vcf/bcf output (out_fh)
FILE *out;
htsFile *out_fh;
+ char *index_fn;
+ int write_index;
+ char *dump_gff;
// vcf
bcf_srs_t *sr;
@@ -597,6 +412,13 @@ typedef struct _args_t
int ncsq2_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values)
int ncsq2_small_warned;
int brief_predictions;
+ int unify_chr_names;
+ char *chr_name;
+ int mchr_name;
+ struct {
+ int unknown_chr,unknown_tscript_biotype,unknown_strand,unknown_phase,duplicate_id;
+ int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds;
+ } warned;
int rid; // current chromosome
tr_heap_t *active_tr; // heap of active transcripts for quick flushing
@@ -604,11 +426,10 @@ typedef struct _args_t
vbuf_t **vcf_buf; // buffered VCF lines to annotate with CSQ and flush
rbuf_t vcf_rbuf; // round buffer indexes to vcf_buf
kh_pos2vbuf_t *pos2vbuf; // fast lookup of buffered lines by position
- tscript_t **rm_tr; // buffer of transcripts to clean
+ gf_tscript_t **rm_tr; // buffer of transcripts to clean
int nrm_tr, mrm_tr;
csq_t *csq_buf; // pool of csq not managed by hap_node_t, i.e. non-CDS csqs
int ncsq_buf, mcsq_buf;
- id_tbl_t tscript_ids; // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx
int force; // force run under various conditions. Currently only to skip out-of-phase transcripts
int n_threads; // extra compression/decompression threads
@@ -645,818 +466,6 @@ const uint8_t cnt4[] =
#define dna2aa(x) gencode[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ]
#define cdna2aa(x) gencode[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ]
-static const char *gf_strings_noncoding[] =
-{
- "MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript",
- "antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping",
- "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene",
- "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene",
- "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene",
- "transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene",
- "translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene",
- "LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", "vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf"
-};
-static const char *gf_strings_coding[] = { "protein_coding", "polymorphic_pseudogene", "IG_C", "IG_D", "IG_J", "IG_LV", "IG_V", "TR_C", "TR_D", "TR_J", "TR_V", "NMD", "non_stop_decay"};
-static const char *gf_strings_special[] = { "CDS", "exon", "3_prime_UTR", "5_prime_UTR" };
-
-const char *gf_type2gff_string(int type)
-{
- if ( !GF_is_coding(type) )
- {
- if ( type < (1<init;
- char c = chr_end[1];
- chr_end[1] = 0;
- int iseq;
- if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 )
- {
- // check for possible mismatch in chromosome naming convention such as chrX vs X
- char *new_chr = NULL;
- if ( faidx_has_seq(args->fai,chr_beg) )
- new_chr = strdup(chr_beg); // valid chr name, the same in gff and faidx
- else
- {
- int len = strlen(chr_beg);
- if ( !strncmp("chr",chr_beg,3) && len>3 )
- new_chr = strdup(chr_beg+3); // gff has the prefix, faidx does not
- else
- {
- new_chr = malloc(len+4); // gff does not have the prefix, faidx has
- memcpy(new_chr,"chr",3);
- memcpy(new_chr+3,chr_beg,len);
- new_chr[len+3] = 0;
- }
- if ( !faidx_has_seq(args->fai,new_chr) ) // modification did not help, this sequence is not in fai
- {
- static int unkwn_chr_warned = 0;
- if ( !unkwn_chr_warned && args->verbosity>0 )
- fprintf(stderr,"Warning: GFF chromosome \"%s\" not part of the reference genome\n",chr_beg);
- unkwn_chr_warned = 1;
- free(new_chr);
- new_chr = strdup(chr_beg); // use the original sequence name
- }
- }
- if ( khash_str2int_get(aux->seq2int, new_chr, &iseq)!=0 )
- {
- hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
- aux->seq[aux->nseq] = new_chr;
- iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]);
- aux->nseq++;
- assert( aux->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq
- }
- else
- free(new_chr);
- }
- chr_end[1] = c;
- return iseq;
-}
-static inline char *gff_skip(const char *line, char *ss)
-{
- while ( *ss && *ss!='\t' ) ss++;
- if ( !*ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
- return ss+1;
-}
-static inline void gff_parse_chr(const char *line, char **chr_beg, char **chr_end)
-{
- char *se = (char*) line;
- while ( *se && *se!='\t' ) se++;
- if ( !*se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
- *chr_beg = (char*) line;
- *chr_end = se-1;
-}
-static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg, uint32_t *end)
-{
- char *se = ss;
- *beg = strtol(ss, &se, 10) - 1;
- if ( ss==se ) error("[%s:%d %s] Could not parse the line:\n\t%s\n\t%s\n",__FILE__,__LINE__,__FUNCTION__,line,ss);
- ss = se+1;
- *end = strtol(ss, &se, 10) - 1;
- if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
- return se+1;
-}
-static void gff_id_init(id_tbl_t *tbl)
-{
- memset(tbl, 0, sizeof(*tbl));
- tbl->str2id = khash_str2int_init();
-}
-static void gff_id_destroy(id_tbl_t *tbl)
-{
- khash_str2int_destroy_free(tbl->str2id);
- free(tbl->str);
-}
-// returns 0 on success, -1 on failure
-static inline int gff_id_parse(id_tbl_t *tbl, const char *needle, char *ss, uint32_t *id_ptr)
-{
- ss = strstr(ss,needle); // e.g. "ID=transcript:"
- if ( !ss ) return -1;
- ss += strlen(needle);
-
- char *se = ss;
- while ( *se && *se!=';' && !isspace(*se) ) se++;
- char tmp = *se;
- *se = 0;
-
- int id;
- if ( khash_str2int_get(tbl->str2id, ss, &id) < 0 )
- {
- id = tbl->nstr++;
- hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str);
- tbl->str[id] = strdup(ss);
- khash_str2int_set(tbl->str2id, tbl->str[id], id);
- }
- *se = tmp;
- *id_ptr = id;
- return 0;
-}
-static inline int gff_parse_type(char *line)
-{
- line = strstr(line,"ID=");
- if ( !line ) return -1;
- line += 3;
- if ( !strncmp(line,"transcript:",11) ) return GFF_TSCRIPT_LINE;
- else if ( !strncmp(line,"gene:",5) ) return GFF_GENE_LINE;
- return -1;
-}
-static inline int gff_parse_biotype(char *_line)
-{
- char *line = strstr(_line,"biotype=");
- if ( !line ) return -1;
-
- line += 8;
- switch (*line)
- {
- case 'p':
- if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING;
- else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE;
- else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT;
- else if ( !strncmp(line,"processed_pseudogene",20) ) return GF_PROCESSED_PSEUDOGENE;
- else if ( !strncmp(line,"polymorphic_pseudogene",22) ) return GF_POLYMORPHIC_PSEUDOGENE;
- break;
- case 'a':
- if ( !strncmp(line,"artifact",8) ) return GF_ARTIFACT;
- else if ( !strncmp(line,"antisense",9) ) return GF_ANTISENSE;
- else if ( !strncmp(line,"ambiguous_orf",13) ) return GF_AMBIGUOUS_ORF;
- break;
- case 'I':
- if ( !strncmp(line,"IG_C_gene",9) ) return GF_IG_C;
- else if ( !strncmp(line,"IG_D_gene",9) ) return GF_IG_D;
- else if ( !strncmp(line,"IG_J_gene",9) ) return GF_IG_J;
- else if ( !strncmp(line,"IG_LV_gene",10) ) return GF_IG_LV;
- else if ( !strncmp(line,"IG_V_gene",9) ) return GF_IG_V;
- else if ( !strncmp(line,"IG_pseudogene",13) ) return GF_IG_PSEUDOGENE;
- else if ( !strncmp(line,"IG_C_pseudogene",15) ) return GF_IG_C_PSEUDOGENE;
- else if ( !strncmp(line,"IG_J_pseudogene",15) ) return GF_IG_J_PSEUDOGENE;
- else if ( !strncmp(line,"IG_V_pseudogene",15) ) return GF_IG_V_PSEUDOGENE;
- break;
- case 'T':
- if ( !strncmp(line,"TR_C_gene",9) ) return GF_TR_C;
- else if ( !strncmp(line,"TR_D_gene",9) ) return GF_TR_D;
- else if ( !strncmp(line,"TR_J_gene",9) ) return GF_TR_J;
- else if ( !strncmp(line,"TR_V_gene",9) ) return GF_TR_V;
- else if ( !strncmp(line,"TR_V_pseudogene",15) ) return GF_TR_V_PSEUDOGENE;
- else if ( !strncmp(line,"TR_J_pseudogene",15) ) return GF_TR_J_PSEUDOGENE;
- break;
- case 'M':
- if ( !strncmp(line,"Mt_tRNA_pseudogene",18) ) return GF_MT_tRNA_PSEUDOGENE;
- else if ( !strncmp(line,"Mt_tRNA",7) ) return GF_MT_tRNA;
- else if ( !strncmp(line,"Mt_rRNA",7) ) return GF_MT_tRNA;
- break;
- case 'l':
- if ( !strncmp(line,"lincRNA",7) ) return GF_lincRNA;
- break;
- case 'm':
- if ( !strncmp(line,"macro_lncRNA",12) ) return GF_macro_lncRNA;
- else if ( !strncmp(line,"misc_RNA_pseudogene",19) ) return GF_misc_RNA_PSEUDOGENE;
- else if ( !strncmp(line,"miRNA_pseudogene",16) ) return GF_miRNA_PSEUDOGENE;
- else if ( !strncmp(line,"miRNA",5) ) return GF_miRNA;
- else if ( !strncmp(line,"misc_RNA",8) ) return GF_MISC_RNA;
- break;
- case 'r':
- if ( !strncmp(line,"rRNA",4) ) return GF_rRNA;
- else if ( !strncmp(line,"ribozyme",8) ) return GF_RIBOZYME;
- else if ( !strncmp(line,"retained_intron",15) ) return GF_RETAINED_INTRON;
- else if ( !strncmp(line,"retrotransposed",15) ) return GF_RETROTRANSPOSED;
- break;
- case 's':
- if ( !strncmp(line,"snRNA",5) ) return GF_snRNA;
- else if ( !strncmp(line,"sRNA",4) ) return GF_sRNA;
- else if ( !strncmp(line,"scRNA",5) ) return GF_scRNA;
- else if ( !strncmp(line,"scaRNA",6) ) return GF_scaRNA;
- else if ( !strncmp(line,"snoRNA",6) ) return GF_snoRNA;
- else if ( !strncmp(line,"sense_intronic",14) ) return GF_SENSE_INTRONIC;
- else if ( !strncmp(line,"sense_overlapping",17) ) return GF_SENSE_OVERLAPPING;
- break;
- case 't':
- if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE;
- else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE;
- else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE;
- else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE;
- else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE;
- else if ( !strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE;
- break;
- case 'n':
- if ( !strncmp(line,"nonsense_mediated_decay",23) ) return GF_NMD;
- else if ( !strncmp(line,"non_stop_decay",14) ) return GF_NON_STOP_DECAY;
- break;
- case 'k':
- if ( !strncmp(line,"known_ncrna",11) ) return GF_KNOWN_NCRNA;
- break;
- case 'u':
- if ( !strncmp(line,"unitary_pseudogene",18) ) return GF_UNITARY_PSEUDOGENE;
- else if ( !strncmp(line,"unprocessed_pseudogene",22) ) return GF_UNPROCESSED_PSEUDOGENE;
- break;
- case 'L':
- if ( !strncmp(line,"LRG_gene",8) ) return GF_LRG_GENE;
- break;
- case '3':
- if ( !strncmp(line,"3prime_overlapping_ncRNA",24) ) return GF_3PRIME_OVERLAPPING_ncRNA;
- break;
- case 'd':
- if ( !strncmp(line,"disrupted_domain",16) ) return GF_DISRUPTED_DOMAIN;
- break;
- case 'v':
- if ( !strncmp(line,"vaultRNA",8) ) return GF_vaultRNA;
- break;
- case 'b':
- if ( !strncmp(line,"bidirectional_promoter_lncRNA",29) ) return GF_BIDIRECTIONAL_PROMOTER_lncRNA;
- break;
- }
- return 0;
-}
-static inline int gff_ignored_biotype(args_t *args, char *ss)
-{
- ss = strstr(ss,"biotype=");
- if ( !ss ) return 0;
-
- ss += 8;
- char *se = ss, tmp;
- while ( *se && *se!=';' ) se++;
- tmp = *se;
- *se = 0;
-
- char *key = ss;
- int n = 0;
- if ( khash_str2int_get(args->init.ignored_biotypes, ss, &n)!=0 ) key = strdup(ss);
- khash_str2int_set(args->init.ignored_biotypes, key, n+1);
-
- *se = tmp;
- return 1;
-}
-gf_gene_t *gene_init(aux_t *aux, uint32_t gene_id)
-{
- khint_t k = kh_get(int2gene, aux->gid2gene, (int)gene_id);
- gf_gene_t *gene = (k == kh_end(aux->gid2gene)) ? NULL : kh_val(aux->gid2gene, k);
- if ( !gene )
- {
- gene = (gf_gene_t*) calloc(1,sizeof(gf_gene_t));
- int ret;
- k = kh_put(int2gene, aux->gid2gene, (int)gene_id, &ret);
- kh_val(aux->gid2gene,k) = gene;
- }
- return gene;
-}
-void gff_parse_transcript(args_t *args, const char *line, char *ss, ftr_t *ftr)
-{
- aux_t *aux = &args->init;
- int biotype = gff_parse_biotype(ss);
- if ( biotype <= 0 )
- {
- if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(stderr,"ignored transcript, unknown biotype: %s\n",line);
- return;
- }
-
- // create a mapping from transcript_id to gene_id
- uint32_t trid, gene_id;
- if ( gff_id_parse(&args->tscript_ids, "ID=transcript:", ss, &trid) )
- {
- if ( gff_id_parse(&args->tscript_ids, "ID=", ss, &trid) )
- error("[%s:%d %s] Could not parse the line, neither \"ID=transcript:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
- static int warned = 0;
- if ( !warned && args->verbosity > 0 )
- {
- fprintf(stderr,"Warning: non-standard transcript ID notation in the GFF, expected \"ID=transcript:XXX\", found %s\n",line);
- warned = 1;
- }
- }
- if ( gff_id_parse(&args->init.gene_ids, "Parent=gene:", ss, &gene_id) )
- {
- if ( gff_id_parse(&args->init.gene_ids, "Parent=", ss, &gene_id) )
- error("[%s:%d %s] Could not parse the line, neither \"Parent=gene:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
- static int warned = 0;
- if ( !warned && args->verbosity > 0 )
- {
- fprintf(stderr,"Warning: non-standard transcript Parent notation in the GFF, expected \"Parent=gene:XXX\", found %s\n",line);
- warned = 1;
- }
- }
-
- tscript_t *tr = (tscript_t*) calloc(1,sizeof(tscript_t));
- tr->id = trid;
- tr->strand = ftr->strand;
- tr->gene = gene_init(aux, gene_id);
- tr->type = biotype;
- tr->beg = ftr->beg;
- tr->end = ftr->end;
-
- khint_t k;
- int ret;
- k = kh_put(int2tscript, aux->id2tr, (int)trid, &ret);
- kh_val(aux->id2tr,k) = tr;
-}
-void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, char *chr_end, ftr_t *ftr)
-{
- int biotype = gff_parse_biotype(ss);
- if ( biotype <= 0 )
- {
- if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(stderr,"ignored gene, unknown biotype: %s\n",line);
- return;
- }
-
- aux_t *aux = &args->init;
-
- // substring search for "ID=gene:ENSG00000437963"
- uint32_t gene_id;
- if ( gff_id_parse(&aux->gene_ids, "ID=gene:", ss, &gene_id) )
- {
- if ( gff_id_parse(&aux->gene_ids, "ID=", ss, &gene_id) )
- error("[%s:%d %s] Could not parse the line, neither \"ID=gene:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
- static int warned = 0;
- if ( !warned && args->verbosity > 0 )
- {
- fprintf(stderr,"Warning: non-standard gene ID notation in the GFF, expected \"ID=gene:XXX\", found %s\n",line);
- warned = 1;
- }
- }
-
- gf_gene_t *gene = gene_init(aux, gene_id);
- assert( !gene->name ); // the gene_id should be unique
-
- gene->iseq = feature_set_seq(args, chr_beg,chr_end);
-
- // substring search for "Name=OR4F5"
- ss = strstr(chr_end+2,"Name=");
- if ( ss )
- {
- ss += 5;
- char *se = ss;
- while ( *se && *se!=';' && !isspace(*se) ) se++;
- gene->name = (char*) malloc(se-ss+1);
- memcpy(gene->name,ss,se-ss);
- gene->name[se-ss] = 0;
- }
- else
- gene->name = strdup(aux->gene_ids.str[gene_id]); // Name= field is not present, use the gene ID instead
-}
-int gff_parse(args_t *args, char *line, ftr_t *ftr)
-{
- // - skip empty lines and commented lines
- // - columns
- // 1. chr
- // 2.
- // 3. CDS, transcript, gene, ...
- // 4-5. beg,end
- // 6.
- // 7. strand
- // 8. phase
- // 9. Parent=transcript:ENST(\d+);ID=... etc
-
- char *ss = line;
- if ( !*ss ) return -1; // skip blank lines
- if ( *ss=='#' ) return -1; // skip comments
-
- char *chr_beg, *chr_end;
- gff_parse_chr(line, &chr_beg, &chr_end);
- ss = gff_skip(line, chr_end + 2);
-
- // 3. column: is this a CDS, transcript, gene, etc.
- if ( !strncmp("exon\t",ss,5) ) { ftr->type = GF_EXON; ss += 5; }
- else if ( !strncmp("CDS\t",ss,4) ) { ftr->type = GF_CDS; ss += 4; }
- else if ( !strncmp("three_prime_UTR\t",ss,16) ) { ftr->type = GF_UTR3; ss += 16; }
- else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; }
- else
- {
- int type = GFF_UNKN_LINE;
- if ( !strncmp("gene\t",ss,4) ) type = GFF_GENE_LINE;
- else if ( !strncmp("transcript\t",ss,4) ) type = GFF_TSCRIPT_LINE;
- ss = gff_skip(line, ss);
- ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
- ss = gff_skip(line, ss);
- if ( type==GFF_UNKN_LINE ) type = gff_parse_type(ss); // determine type from ID=transcript: or ID=gene:
- if ( type!=GFF_TSCRIPT_LINE && type!=GFF_GENE_LINE )
- {
- // we ignore these, debug print to see new types:
- ss = strstr(ss,"ID=");
- if ( !ss ) return -1; // no ID, ignore the line
- if ( !strncmp("chromosome",ss+3,10) ) return -1;
- if ( !strncmp("supercontig",ss+3,11) ) return -1;
- if ( args->verbosity > 0 ) fprintf(stderr,"ignored: %s\n", line);
- return -1;
- }
-
- // 7. column: strand
- if ( *ss == '+' ) ftr->strand = STRAND_FWD;
- else if ( *ss == '-' ) ftr->strand = STRAND_REV;
- else error("Unknown strand: %c .. %s\n", *ss,ss);
-
- if ( type==GFF_TSCRIPT_LINE )
- gff_parse_transcript(args, line, ss, ftr);
- else
- gff_parse_gene(args, line, ss, chr_beg, chr_end, ftr);
-
- return -1;
- }
- ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
- ss = gff_skip(line, ss);
-
- // 7. column: strand
- if ( *ss == '+' ) ftr->strand = STRAND_FWD;
- else if ( *ss == '-' ) ftr->strand = STRAND_REV;
- else { if ( args->verbosity > 0 ) fprintf(stderr,"Skipping unknown strand: %c\n", *ss); return -1; }
- ss += 2;
-
- // 8. column: phase (codon offset)
- if ( *ss == '0' ) ftr->phase = 0;
- else if ( *ss == '1' ) ftr->phase = 1;
- else if ( *ss == '2' ) ftr->phase = 2;
- else if ( *ss == '.' ) ftr->phase = CDS_PHASE_UNKN; // exons and even CDS in some GFFs do not have phase
- else { if ( args->verbosity > 0 ) fprintf(stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; }
- ss += 2;
-
- // substring search for "Parent=transcript:ENST00000437963"
- if ( gff_id_parse(&args->tscript_ids, "Parent=transcript:", ss, &ftr->trid) )
- {
- if ( gff_id_parse(&args->tscript_ids, "Parent=", ss, &ftr->trid) )
- error("[%s:%d %s] Could not parse the line, neither \"Parent=transcript:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
- static int warned = 0;
- if ( !warned && args->verbosity > 0 )
- {
- fprintf(stderr,"Warning: non-standard gene Parent notation in the GFF, expected \"Parent=transcript:XXX\", found %s\n",line);
- warned = 1;
- }
- }
-
- ftr->iseq = feature_set_seq(args, chr_beg,chr_end);
- return 0;
-}
-
-static int cmp_cds_ptr(const void *a, const void *b)
-{
- // comparison function for qsort of transcripts's CDS
- if ( (*((gf_cds_t**)a))->beg < (*((gf_cds_t**)b))->beg ) return -1;
- if ( (*((gf_cds_t**)a))->beg > (*((gf_cds_t**)b))->beg ) return 1;
- return 0;
-}
-
-static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char **chr_end)
-{
- *chr_beg = *chr_end = aux->seq[iseq];
- while ( (*chr_end)[1] ) (*chr_end)++;
-}
-tscript_t *tscript_init(aux_t *aux, uint32_t trid)
-{
- khint_t k = kh_get(int2tscript, aux->id2tr, (int)trid);
- tscript_t *tr = (k == kh_end(aux->id2tr)) ? NULL : kh_val(aux->id2tr, k);
- assert( tr );
- return tr;
-}
-void register_cds(args_t *args, ftr_t *ftr)
-{
- // Make the CDS searchable via idx_cds. Note we do not malloc tr->cds just yet.
- // ftr is the result of parsing a gff CDS line
- aux_t *aux = &args->init;
-
- tscript_t *tr = tscript_init(aux, ftr->trid);
- if ( tr->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. %d vs %d\n",ftr->trid,tr->strand,ftr->strand);
-
- gf_cds_t *cds = (gf_cds_t*) malloc(sizeof(gf_cds_t));
- cds->tr = tr;
- cds->beg = ftr->beg;
- cds->len = ftr->end - ftr->beg + 1;
- cds->icds = 0; // to keep valgrind on mac happy
- cds->phase = ftr->phase;
-
- hts_expand(gf_cds_t*,tr->ncds+1,tr->mcds,tr->cds);
- tr->cds[tr->ncds++] = cds;
-}
-void register_utr(args_t *args, ftr_t *ftr)
-{
- aux_t *aux = &args->init;
- gf_utr_t *utr = (gf_utr_t*) malloc(sizeof(gf_utr_t));
- utr->which = ftr->type==GF_UTR3 ? prime3 : prime5;
- utr->beg = ftr->beg;
- utr->end = ftr->end;
- utr->tr = tscript_init(aux, ftr->trid);
-
- char *chr_beg, *chr_end;
- chr_beg_end(&args->init, utr->tr->gene->iseq, &chr_beg, &chr_end);
- regidx_push(args->idx_utr, chr_beg,chr_end, utr->beg,utr->end, &utr);
-}
-void register_exon(args_t *args, ftr_t *ftr)
-{
- aux_t *aux = &args->init;
- gf_exon_t *exon = (gf_exon_t*) malloc(sizeof(gf_exon_t));
- exon->beg = ftr->beg;
- exon->end = ftr->end;
- exon->tr = tscript_init(aux, ftr->trid);
-
- char *chr_beg, *chr_end;
- chr_beg_end(&args->init, exon->tr->gene->iseq, &chr_beg, &chr_end);
- regidx_push(args->idx_exon, chr_beg,chr_end, exon->beg - N_SPLICE_REGION_INTRON, exon->end + N_SPLICE_REGION_INTRON, &exon);
-}
-
-void tscript_init_cds(args_t *args)
-{
- aux_t *aux = &args->init;
-
- // Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds)
- khint_t k;
- int warn_phase_unkn = 0;
- for (k=0; kid2tr); k++)
- {
- if ( !kh_exist(aux->id2tr, k) ) continue;
- tscript_t *tr = (tscript_t*) kh_val(aux->id2tr, k);
-
- // position-to-tscript lookup
- char *chr_beg, *chr_end;
- chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end);
- regidx_push(args->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr);
-
- if ( !tr->ncds ) continue; // transcript with no CDS
-
- // sort CDs
- qsort(tr->cds, tr->ncds, sizeof(gf_cds_t*), cmp_cds_ptr);
-
- // trim non-coding start
- int i, len = 0;
- if ( tr->strand==STRAND_FWD )
- {
- if ( tr->cds[0]->phase != CDS_PHASE_UNKN )
- {
- if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME;
- tr->cds[0]->beg += tr->cds[0]->phase;
- tr->cds[0]->len -= tr->cds[0]->phase;
- tr->cds[0]->phase = 0;
- }
-
- // sanity check phase; the phase number in gff tells us how many bases to skip in this
- // feature to reach the first base of the next codon
- int tscript_ok = 1;
- for (i=0; incds; i++)
- {
- if ( tr->cds[i]->phase == CDS_PHASE_UNKN )
- {
- warn_phase_unkn = 1;
- len += tr->cds[i]->len;
- continue;
- }
- int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
- if ( phase!=len%3 )
- {
- if ( args->force )
- {
- if ( args->verbosity > 0 )
- fprintf(stderr,"Warning: the GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n",
- args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
- tscript_ok = 0;
- break;
- }
- error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
- args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
- }
- len += tr->cds[i]->len;
- }
- if ( !tscript_ok ) continue; // skip this transcript
- }
- else
- {
- if ( tr->cds[tr->ncds-1]->phase != CDS_PHASE_UNKN )
- {
- // Check that the phase is not bigger than CDS length. Curiously, this can really happen,
- // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141
- // todo: the same for the fwd strand
- i = tr->ncds - 1;
- int phase = tr->cds[i]->phase;
- if ( phase ) tr->trim |= TRIM_5PRIME;
- while ( i>=0 && phase > tr->cds[i]->len )
- {
- phase -= tr->cds[i]->len;
- tr->cds[i]->phase = 0;
- tr->cds[i]->len = 0;
- i--;
- }
- tr->cds[i]->len -= tr->cds[i]->phase;
- tr->cds[i]->phase = 0;
- }
-
- // sanity check phase
- int tscript_ok = 1;
- for (i=tr->ncds-1; i>=0; i--)
- {
- if ( tr->cds[i]->phase == CDS_PHASE_UNKN )
- {
- warn_phase_unkn = 1;
- len += tr->cds[i]->len;
- continue;
- }
- int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
- if ( phase!=len%3)
- {
- if ( args->force )
- {
- if ( args->verbosity > 0 )
- fprintf(stderr,"Warning: the GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n",
- args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
- tscript_ok = 0;
- break;
- }
- error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
- args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
- }
- len += tr->cds[i]->len;
- }
- if ( !tscript_ok ) continue; // skip this transcript
- }
-
- // set len. At the same check that CDS within a transcript do not overlap
- len = 0;
- for (i=0; incds; i++)
- {
- tr->cds[i]->icds = i;
- len += tr->cds[i]->len;
- if ( !i ) continue;
-
- gf_cds_t *a = tr->cds[i-1];
- gf_cds_t *b = tr->cds[i];
- if ( a->beg + a->len - 1 >= b->beg )
- {
- if ( args->force )
- {
- fprintf(stderr,"Warning: GFF contains overlapping CDS %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32".\n",
- args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
- }
- else
- error("Error: CDS overlap in the transcript %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32", is this intended (e.g. ribosomal slippage)?\n"
- " Use the --force option to override (at your own risk).\n",
- args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
- }
- }
- if ( len%3 != 0 )
- {
- // There are 13k transcripts with incomplete 3' CDS. See for example ENST00000524289
- // http://sep2015.archive.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000155868;r=5:157138846-157159019;t=ENST00000524289
- // Also, the incomplete CDS can be too short (1 or 2bp), so it is not enough to trim the last one.
-
- tr->trim |= TRIM_3PRIME;
- if ( tr->strand==STRAND_FWD )
- {
- i = tr->ncds - 1;
- while ( i>=0 && len%3 )
- {
- int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
- tr->cds[i]->len -= dlen;
- len -= dlen;
- i--;
- }
- }
- else
- {
- i = 0;
- while ( incds && len%3 )
- {
- int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
- tr->cds[i]->len -= dlen;
- tr->cds[i]->beg += dlen;
- len -= dlen;
- i++;
- }
- }
- }
-
- // set CDS offsets and insert into regidx
- len=0;
- for (i=0; incds; i++)
- {
- tr->cds[i]->pos = len;
- len += tr->cds[i]->len;
- regidx_push(args->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]);
- }
- }
- if ( warn_phase_unkn && args->verbosity > 0 )
- fprintf(stderr,"Warning: encountered CDS with phase column unset, could not verify reading frame\n");
-}
-
-void regidx_free_gf(void *payload) { free(*((gf_cds_t**)payload)); }
-void regidx_free_tscript(void *payload) { tscript_t *tr = *((tscript_t**)payload); free(tr->cds); free(tr); }
-
-void init_gff(args_t *args)
-{
- aux_t *aux = &args->init;
- aux->seq2int = khash_str2int_init(); // chrom's numeric id
- aux->gid2gene = kh_init(int2gene); // gene id to gf_gene_t, for idx_gene
- aux->id2tr = kh_init(int2tscript); // transcript id to tscript_t
- args->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(tscript_t*), NULL);
- aux->ignored_biotypes = khash_str2int_init();
- gff_id_init(&aux->gene_ids);
- gff_id_init(&args->tscript_ids);
-
- // parse gff
- kstring_t str = {0,0,0};
- htsFile *fp = hts_open(args->gff_fname,"r");
- if ( !fp ) error("Failed to read %s\n", args->gff_fname);
- while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
- {
- hts_expand(ftr_t, aux->nftr+1, aux->mftr, aux->ftr);
- int ret = gff_parse(args, str.s, aux->ftr + aux->nftr);
- if ( !ret ) aux->nftr++;
- }
- free(str.s);
- if ( hts_close(fp)!=0 ) error("Close failed: %s\n", args->gff_fname);
-
-
- // process gff information: connect CDS and exons to transcripts
- args->idx_cds = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_cds_t*), NULL);
- args->idx_utr = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_utr_t*), NULL);
- args->idx_exon = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_exon_t*), NULL);
- args->itr = regitr_init(NULL);
-
- int i;
- for (i=0; inftr; i++)
- {
- ftr_t *ftr = &aux->ftr[i];
-
- // check whether to keep this feature: is there a mapping trid -> gene_id -> gene?
- khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid);
- if ( k==kh_end(aux->id2tr) ) continue; // no such transcript
-
- tscript_t *tr = kh_val(aux->id2tr,k);
- if ( !tr->gene->name )
- {
- // not a supported biotype (e.g. gene:pseudogene, transcript:processed_transcript)
- regidx_free_tscript(&tr);
- kh_del(int2tscript, aux->id2tr,k);
- continue;
- }
-
- // populate regidx by category:
- // ftr->type .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5
- // gene->type .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ...
- if ( ftr->type==GF_CDS ) register_cds(args, ftr);
- else if ( ftr->type==GF_EXON ) register_exon(args, ftr);
- else if ( ftr->type==GF_UTR5 ) register_utr(args, ftr);
- else if ( ftr->type==GF_UTR3 ) register_utr(args, ftr);
- else
- error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,args->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type));
- }
- tscript_init_cds(args);
-
- if ( args->verbosity > 0 )
- {
- fprintf(stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n",
- regidx_nregs(args->idx_tscript),
- regidx_nregs(args->idx_exon),
- regidx_nregs(args->idx_cds),
- regidx_nregs(args->idx_utr));
- }
- if ( !regidx_nregs(args->idx_tscript) )
- fprintf(stderr,
- "Warning: No usable transcripts found, likely a failure to parse a non-standard GFF file. Please check if the misc/gff2gff\n"
- " or misc/gff2gff.py script can fix the problem (both do different things). See also the man page for the description\n"
- " of the expected format http://samtools.github.io/bcftools/bcftools-man.html#csq\n");
-
- free(aux->ftr);
- khash_str2int_destroy_free(aux->seq2int);
- // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene);
- kh_destroy(int2tscript,aux->id2tr);
- free(aux->seq);
- gff_id_destroy(&aux->gene_ids);
-
- if ( args->verbosity > 0 && khash_str2int_size(aux->ignored_biotypes) )
- {
- khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes;
- fprintf(stderr,"Ignored the following biotypes:\n");
- for (i = kh_begin(ign); i < kh_end(ign); i++)
- {
- if ( !kh_exist(ign,i)) continue;
- const char *biotype = kh_key(ign,i);
- if ( !strcmp(biotype,"TCE") ) biotype = "TCE (\"To be Experimentally Confirmed\")";
- fprintf(stderr,"\t%dx\t.. %s\n", kh_value(ign,i), biotype);
- }
- }
- khash_str2int_destroy_free(aux->ignored_biotypes);
-}
-
static inline int ncsq2_to_nfmt(int ncsq2)
{
return 1 + (ncsq2 - 1) / 30;
@@ -1474,8 +483,17 @@ void init_data(args_t *args)
args->fai = fai_load(args->fa_fname);
if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname);
- if ( args->verbosity > 0 ) fprintf(stderr,"Parsing %s ...\n", args->gff_fname);
- init_gff(args);
+ args->gff = gff_init(args->gff_fname);
+ gff_set(args->gff,verbosity,args->verbosity);
+ gff_set(args->gff,strip_chr_names,args->unify_chr_names);
+ gff_set(args->gff,force_out_of_phase,args->force);
+ gff_set(args->gff,dump_fname,args->dump_gff);
+ gff_parse(args->gff);
+ args->idx_cds = gff_get(args->gff,idx_cds);
+ args->idx_utr = gff_get(args->gff,idx_utr);
+ args->idx_exon = gff_get(args->gff,idx_exon);
+ args->idx_tscript = gff_get(args->gff,idx_tscript);
+ args->itr = regitr_init(NULL);
args->rid = -1;
@@ -1536,6 +554,7 @@ void init_data(args_t *args)
if ( args->hdr_nsmpl )
bcf_hdr_printf(args->hdr,"##FORMAT=",args->bcsq_tag);
if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
+ if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
}
if ( args->verbosity > 0 ) fprintf(stderr,"Calling...\n");
}
@@ -1547,21 +566,8 @@ void destroy_data(args_t *args)
"Note: Some samples had too many consequences to be represented in %d bytes. If you need to record them all,\n"
" the limit can be increased by running with `--ncsq %d`.\n",ncsq2_to_nfmt(args->ncsq2_max)/8,1+args->ncsq2_small_warned/2);
- regidx_destroy(args->idx_cds);
- regidx_destroy(args->idx_utr);
- regidx_destroy(args->idx_exon);
- regidx_destroy(args->idx_tscript);
regitr_destroy(args->itr);
-
- khint_t k,i,j;
- for (k=0; kinit.gid2gene); k++)
- {
- if ( !kh_exist(args->init.gid2gene, k) ) continue;
- gf_gene_t *gene = (gf_gene_t*) kh_val(args->init.gid2gene, k);
- free(gene->name);
- free(gene);
- }
- kh_destroy(int2gene,args->init.gid2gene);
+ gff_destroy(args->gff);
if ( args->filter )
filter_destroy(args->filter);
@@ -1569,9 +575,20 @@ void destroy_data(args_t *args)
khp_destroy(trhp,args->active_tr);
kh_destroy(pos2vbuf,args->pos2vbuf);
if ( args->smpl ) smpl_ilist_destroy(args->smpl);
- int ret;
+ int i,j,ret;
if ( args->out_fh )
+ {
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(args->out_fh)<0 )
+ {
+ if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
ret = hts_close(args->out_fh);
+ }
else
ret = fclose(args->out);
if ( ret ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
@@ -1602,7 +619,7 @@ void destroy_data(args_t *args)
free(args->gt_arr);
free(args->str.s);
free(args->str2.s);
- gff_id_destroy(&args->tscript_ids);
+ free(args->chr_name);
}
/*
@@ -1614,7 +631,7 @@ void destroy_data(args_t *args)
#define SPLICE_OVERLAP 3 // indel overlaps region boundary, csq set but could not determine csq
typedef struct
{
- tscript_t *tr;
+ gf_tscript_t *tr;
struct {
int32_t pos, rlen, alen, ial;
char *ref, *alt;
@@ -1678,7 +695,7 @@ fprintf(stderr,"build_hap: rbeg=%d + %d abeg=%d \n",rbeg,rlen,abeg);
if ( rbeg < splice->vcf.pos )
{
assert( splice->tr->beg <= rbeg ); // this can be extended thanks to N_REF_PAD
- kputsn(splice->tr->ref + N_REF_PAD + rbeg - splice->tr->beg, splice->vcf.pos - rbeg, &splice->kref);
+ kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + rbeg - splice->tr->beg, splice->vcf.pos - rbeg, &splice->kref);
roff = 0;
}
else
@@ -1703,7 +720,7 @@ fprintf(stderr,"r2: %s\n",splice->kref.s);
if ( end + rlen - splice->kref.l - 1 > splice->tr->end ) // trim, the requested sequence is too long (could be extended, see N_REF_PAD)
rlen -= end + rlen - splice->kref.l - 1 - splice->tr->end;
if ( splice->kref.l < rlen )
- kputsn(splice->tr->ref + N_REF_PAD + end - splice->tr->beg, rlen - splice->kref.l, &splice->kref);
+ kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + end - splice->tr->beg, rlen - splice->kref.l, &splice->kref);
}
#if XDBG
fprintf(stderr,"r3: %s\n",splice->kref.s);
@@ -1714,7 +731,7 @@ fprintf(stderr,"r3: %s\n",splice->kref.s);
if ( abeg < splice->vcf.pos )
{
assert( splice->tr->beg <= abeg );
- kputsn(splice->tr->ref + N_REF_PAD + abeg - splice->tr->beg, splice->vcf.pos - abeg, &splice->kalt);
+ kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + abeg - splice->tr->beg, splice->vcf.pos - abeg, &splice->kalt);
aoff = 0;
}
else
@@ -1742,7 +759,7 @@ fprintf(stderr,"a2: %s aoff=%d\n",splice->kalt.s,aoff);
if ( end + alen + aoff - splice->kalt.l - 1 > splice->tr->end ) // trim, the requested sequence is too long
alen -= end + alen + aoff - splice->kalt.l - 1 - splice->tr->end;
if ( alen > 0 && alen > splice->kalt.l )
- kputsn(splice->tr->ref + aoff + N_REF_PAD + end - splice->tr->beg, alen - splice->kalt.l, &splice->kalt);
+ kputsn(TSCRIPT_AUX(splice->tr)->ref + aoff + N_REF_PAD + end - splice->tr->beg, alen - splice->kalt.l, &splice->kalt);
}
#if XDBG
fprintf(stderr,"a3: %s\n",splice->kalt.s);
@@ -1755,7 +772,7 @@ static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32
while ( regitr_overlap(itr) )
{
gf_utr_t *utr = regitr_payload(itr, gf_utr_t*);
- tscript_t *tr = utr->tr;
+ gf_tscript_t *tr = utr->tr;
if ( tr->id != trid ) continue;
csq_t csq;
memset(&csq, 0, sizeof(csq_t));
@@ -1771,7 +788,7 @@ static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32
}
return 0;
}
-static inline void csq_stage_splice(args_t *args, bcf1_t *rec, tscript_t *tr, uint32_t type, int ial)
+static inline void csq_stage_splice(args_t *args, bcf1_t *rec, gf_tscript_t *tr, uint32_t type, int ial)
{
#if XDBG
fprintf(stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type);
@@ -1788,6 +805,21 @@ fprintf(stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type);
csq.type.gene = tr->gene->name;
csq_stage(args, &csq, rec);
}
+static inline const char *drop_chr_prefix(args_t *args, const char *chr)
+{
+ if ( !args->unify_chr_names ) return chr;
+ if ( !strncasecmp("chr",chr,3) ) return chr+3;
+ return chr;
+}
+static inline const char *add_chr_prefix(args_t *args, const char *chr)
+{
+ if ( !args->unify_chr_names ) return chr;
+ int len = strlen(chr);
+ hts_expand(char,len+4,args->mchr_name,args->chr_name);
+ memcpy(args->chr_name,"chr",3);
+ memcpy(args->chr_name+3,chr,len+1);
+ return args->chr_name;
+}
static inline int splice_csq_ins(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
{
// coordinates that matter for consequences, eg AC>ACG trimmed to C>CG, 1bp
@@ -1813,7 +845,7 @@ fprintf(stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr
{
ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
@@ -1851,7 +883,7 @@ fprintf(stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) ) // adjacent utr
{
ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
@@ -1924,7 +956,7 @@ fprintf(stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut
int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
{
static int small_ref_padding_warned = 0;
- tscript_t *tr = splice->tr;
+ gf_tscript_t *tr = splice->tr;
// We know the VCF record overlaps the exon, but does it overlap the start codon?
if ( tr->strand==STRAND_REV && splice->vcf.pos + splice->vcf.rlen + 2 <= ex_end ) return 0;
@@ -1956,7 +988,7 @@ int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint
}
char *ptr_vcf = splice->vcf.ref + alt_len; // the first deleted base in the VCF REF allele
- char *ptr_ref = splice->tr->ref + N_REF_PAD + (vcf_ref_end + 1 - splice->tr->beg); // the first ref base after the ndel bases deleted
+ char *ptr_ref = TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + (vcf_ref_end + 1 - splice->tr->beg); // the first ref base after the ndel bases deleted
#if XDBG
fprintf(stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref);
#endif
@@ -1985,7 +1017,7 @@ int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint
}
char *ptr_vcf = splice->vcf.ref + alt_len; // the first deleted base in the VCF REF allele
- char *ptr_ref = splice->tr->ref + N_REF_PAD + vcf_block_beg - splice->tr->beg; // the replacement ref block
+ char *ptr_ref = TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + vcf_block_beg - splice->tr->beg; // the replacement ref block
#if XDBG
fprintf(stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref);
#endif
@@ -2030,7 +1062,7 @@ fprintf(stderr,"splice_csq_del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr
csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
regitr_destroy(itr);
@@ -2086,7 +1118,7 @@ fprintf(stderr,"splice_csq_del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr
csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
regitr_destroy(itr);
@@ -2175,7 +1207,7 @@ fprintf(stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr
csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
regitr_destroy(itr);
@@ -2205,7 +1237,7 @@ fprintf(stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr
csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
regitr_destroy(itr);
@@ -2291,7 +1323,7 @@ int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds,
{
int i;
kstring_t str = {0,0,0};
- tscript_t *tr = cds->tr;
+ gf_tscript_t *tr = cds->tr;
child->icds = cds->icds; // index of cds in the tscript's list of exons
child->vcf_ial = ial;
@@ -2313,8 +1345,8 @@ int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds,
}
if ( splice.check_start ) // do not check starts in incomplete CDS, defined as not starting with M
{
- if ( tr->strand==STRAND_FWD ) { if ( dna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; }
- else { if ( cdna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; }
+ if ( tr->strand==STRAND_FWD ) { if ( dna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; }
+ else { if ( cdna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; }
}
if ( child->icds!=0 ) splice.check_region_beg = 1;
if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1;
@@ -2373,12 +1405,12 @@ fprintf(stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, csq=%d\n\n
// the variant is on a new exon, finish up the previous
int len = tr->cds[i]->len - parent->rbeg - parent->rlen + tr->cds[i]->beg;
if ( len > 0 )
- kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
+ kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
}
// append any skipped non-variant exons
while ( ++i < cds->icds )
- kputsn_(tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len, &str);
+ kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len, &str);
if ( parent->icds==child->icds )
{
@@ -2390,10 +1422,10 @@ fprintf(stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, csq=%d\n\n
free(splice.kalt.s);
return 1;
}
- kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
+ kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
}
else
- kputsn_(tr->ref + N_REF_PAD + cds->beg - tr->beg, splice.ref_beg - cds->beg, &str);
+ kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + cds->beg - tr->beg, splice.ref_beg - cds->beg, &str);
}
kputs(splice.kalt.s + dbeg, &str);
@@ -2645,28 +1677,28 @@ fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,
#endif
}
-void tscript_splice_ref(tscript_t *tr)
+void tscript_splice_ref(gf_tscript_t *tr)
{
int i, len = 0;
for (i=0; incds; i++)
len += tr->cds[i]->len;
- tr->nsref = len + 2*N_REF_PAD;
- tr->sref = (char*) malloc(len + 1 + 2*N_REF_PAD);
+ TSCRIPT_AUX(tr)->nsref = len + 2*N_REF_PAD;
+ TSCRIPT_AUX(tr)->sref = (char*) malloc(len + 1 + 2*N_REF_PAD);
len = 0;
- memcpy(tr->sref, tr->ref + tr->cds[0]->beg - tr->beg, N_REF_PAD);
+ memcpy(TSCRIPT_AUX(tr)->sref, TSCRIPT_AUX(tr)->ref + tr->cds[0]->beg - tr->beg, N_REF_PAD);
len += N_REF_PAD;
for (i=0; incds; i++)
{
- memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len);
+ memcpy(TSCRIPT_AUX(tr)->sref + len, TSCRIPT_AUX(tr)->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len);
len += tr->cds[i]->len;
}
- memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[tr->ncds-1]->beg - tr->beg, N_REF_PAD);
+ memcpy(TSCRIPT_AUX(tr)->sref + len, TSCRIPT_AUX(tr)->ref + N_REF_PAD + tr->cds[tr->ncds-1]->beg - tr->beg, N_REF_PAD);
len += N_REF_PAD;
- tr->sref[len] = 0;
+ TSCRIPT_AUX(tr)->sref[len] = 0;
}
// returns: 0 if consequence was added, 1 if it already exists or could not be added
@@ -2800,18 +1832,25 @@ void kput_vcsq(args_t *args, vcsq_t *csq, kstring_t *str)
if ( csq->type & CSQ_UPSTREAM_STOP )
kputc_('*',str);
- int i, n = sizeof(csq_strings)/sizeof(char*);
+ int has_csq = 0, i, n = sizeof(csq_strings)/sizeof(char*);
for (i=1; itype&(1<type&(1<type&(1<type&(1<biotype==GF_NMD) && (csq->type & CSQ_PRN_NMD) )
+ {
+ if ( has_csq ) kputc_('&',str); // just in case, this should always be true
+ kputs("NMD_transcript",str);
+ }
kputc_('|', str);
if ( csq->gene ) kputs(csq->gene , str);
kputc_('|', str);
- if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(args->tscript_ids.str[csq->trid], str);
+// if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(args->tscript_ids.str[csq->trid], str);
+ if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(gff_id2string(args->gff,transcript,csq->trid), str);
kputc_('|', str);
kputs(gf_type2gff_string(csq->biotype), str);
@@ -2840,7 +1879,7 @@ void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *str)
void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, int iend, int dlen, int indel)
{
int i;
- tscript_t *tr = hap->tr;
+ gf_tscript_t *tr = hap->tr;
int ref_node = tr->strand==STRAND_FWD ? ibeg : iend;
int icsq = node->ncsq_list++;
hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
@@ -2954,7 +1993,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
str.l = 0;
// create the aa variant string
- int aa_rbeg = tr->strand==STRAND_FWD ? node2rbeg(ibeg)/3+1 : (hap->tr->nsref - 2*N_REF_PAD - node2rend(iend))/3+1;
+ int aa_rbeg = tr->strand==STRAND_FWD ? node2rbeg(ibeg)/3+1 : (TSCRIPT_AUX(hap->tr)->nsref - 2*N_REF_PAD - node2rend(iend))/3+1;
int aa_sbeg = tr->strand==STRAND_FWD ? node2sbeg(ibeg)/3+1 : (tlen - node2send(iend))/3+1;
kputc_('|', &str);
kputw(aa_rbeg, &str);
@@ -3020,13 +2059,13 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
void hap_finalize(args_t *args, hap_t *hap)
{
- tscript_t *tr = hap->tr;
- if ( !tr->sref )
+ gf_tscript_t *tr = hap->tr;
+ if ( !TSCRIPT_AUX(tr)->sref )
tscript_splice_ref(tr);
kstring_t sref;
- sref.s = tr->sref;
- sref.l = tr->nsref;
+ sref.s = TSCRIPT_AUX(tr)->sref;
+ sref.l = TSCRIPT_AUX(tr)->nsref;
sref.m = sref.l;
int istack = 0;
@@ -3034,7 +2073,7 @@ void hap_finalize(args_t *args, hap_t *hap)
hap->sseq.l = 0;
hap->tseq.l = 0;
- hap->stack[0].node = tr->root;
+ hap->stack[0].node = TSCRIPT_AUX(tr)->root;
hap->stack[0].ichild = -1;
hap->stack[0].slen = 0;
hap->stack[0].dlen = 0;
@@ -3214,7 +2253,7 @@ static inline void csq_print_text(args_t *args, csq_t *csq, int ismpl, int ihap)
kput_vcsq(args, &csq->type, &args->str);
fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s);
}
-static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
+static inline void hap_print_text(args_t *args, gf_tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
{
if ( !node || !node->ncsq_list ) return;
@@ -3240,7 +2279,7 @@ static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ih
}
}
-static inline void hap_stage_vcf(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
+static inline void hap_stage_vcf(args_t *args, gf_tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
{
if ( !node || !node->ncsq_list || ismpl<0 ) return;
@@ -3276,23 +2315,23 @@ void hap_flush(args_t *args, uint32_t pos)
tr_heap_t *heap = args->active_tr;
while ( heap->ndat && heap->dat[0]->end<=pos )
{
- tscript_t *tr = heap->dat[0];
+ gf_tscript_t *tr = heap->dat[0];
khp_delete(trhp, heap);
args->hap->tr = tr;
- if ( tr->root && tr->root->nchild ) // normal, non-localized calling
+ if ( TSCRIPT_AUX(tr)->root && TSCRIPT_AUX(tr)->root->nchild ) // normal, non-localized calling
{
hap_finalize(args, args->hap);
if ( args->output_type==FT_TAB_TEXT ) // plain text output, not a vcf
{
if ( args->phase==PHASE_DROP_GT )
- hap_print_text(args, tr, -1,0, tr->hap[0]);
+ hap_print_text(args, tr, -1,0, TSCRIPT_AUX(tr)->hap[0]);
else
{
for (i=0; ismpl->n; i++)
{
for (j=0; j<2; j++)
- hap_print_text(args, tr, args->smpl->idx[i],j+1, tr->hap[i*2+j]);
+ hap_print_text(args, tr, args->smpl->idx[i],j+1, TSCRIPT_AUX(tr)->hap[i*2+j]);
}
}
}
@@ -3301,7 +2340,7 @@ void hap_flush(args_t *args, uint32_t pos)
for (i=0; ismpl->n; i++)
{
for (j=0; j<2; j++)
- hap_stage_vcf(args, tr, args->smpl->idx[i],j, tr->hap[i*2+j]);
+ hap_stage_vcf(args, tr, args->smpl->idx[i],j, TSCRIPT_AUX(tr)->hap[i*2+j]);
}
}
}
@@ -3309,7 +2348,7 @@ void hap_flush(args_t *args, uint32_t pos)
// mark the transcript for deletion. Cannot delete it immediately because
// by-position VCF output will need them when flushed by vcf_buf_push
args->nrm_tr++;
- hts_expand(tscript_t*,args->nrm_tr,args->mrm_tr,args->rm_tr);
+ hts_expand(gf_tscript_t*,args->nrm_tr,args->mrm_tr,args->rm_tr);
args->rm_tr[args->nrm_tr-1] = tr;
}
}
@@ -3424,24 +2463,33 @@ void vbuf_flush(args_t *args, uint32_t pos)
for (i=0; inrm_tr; i++)
{
- tscript_t *tr = args->rm_tr[i];
- if ( tr->root ) hap_destroy(tr->root);
- tr->root = NULL;
- free(tr->hap);
- free(tr->ref);
- free(tr->sref);
+ gf_tscript_t *tr = args->rm_tr[i];
+ tscript_t *aux = TSCRIPT_AUX(tr);
+ if ( aux->root ) hap_destroy(aux->root);
+ aux->root = NULL;
+ free(aux->hap);
+ free(aux->ref);
+ free(aux->sref);
+ free(aux);
+ tr->aux = NULL;
}
args->nrm_tr = 0;
args->ncsq_buf = 0;
}
-void tscript_init_ref(args_t *args, tscript_t *tr, const char *chr)
+void tscript_init_ref(args_t *args, gf_tscript_t *tr, const char *chr)
{
int i, len;
int pad_beg = tr->beg >= N_REF_PAD ? N_REF_PAD : tr->beg;
- tr->ref = faidx_fetch_seq(args->fai, chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len);
- if ( !tr->ref )
+ const char *tmp_chr = chr;
+ if ( !faidx_has_seq(args->fai,tmp_chr) )
+ {
+ tmp_chr = drop_chr_prefix(args,chr);
+ if ( !faidx_has_seq(args->fai,tmp_chr) ) tmp_chr = add_chr_prefix(args,chr);
+ }
+ TSCRIPT_AUX(tr)->ref = faidx_fetch_seq(args->fai, tmp_chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len);
+ if ( !TSCRIPT_AUX(tr)->ref )
error("faidx_fetch_seq failed %s:%d-%d\n", chr,tr->beg+1,tr->end+1);
int pad_end = len - (tr->end - tr->beg + 1 + pad_beg);
@@ -3449,23 +2497,23 @@ void tscript_init_ref(args_t *args, tscript_t *tr, const char *chr)
{
char *ref = (char*) malloc(tr->end - tr->beg + 1 + 2*N_REF_PAD + 1);
for (i=0; i < N_REF_PAD - pad_beg; i++) ref[i] = 'N';
- memcpy(ref+i, tr->ref, len);
+ memcpy(ref+i, TSCRIPT_AUX(tr)->ref, len);
len += i;
for (i=0; i < N_REF_PAD - pad_end; i++) ref[i+len] = 'N';
ref[i+len] = 0;
- free(tr->ref);
- tr->ref = ref;
+ free(TSCRIPT_AUX(tr)->ref);
+ TSCRIPT_AUX(tr)->ref = ref;
}
}
-static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec)
+static void sanity_check_ref(args_t *args, gf_tscript_t *tr, bcf1_t *rec)
{
int vbeg = 0;
int rbeg = rec->pos - tr->beg + N_REF_PAD;
if ( rbeg < 0 ) { vbeg += abs(rbeg); rbeg = 0; }
- char *ref = tr->ref + rbeg;
+ char *ref = TSCRIPT_AUX(tr)->ref + rbeg;
char *vcf = rec->d.allele[0] + vbeg;
- assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) && ref - tr->ref < tr->end - tr->beg + 2*N_REF_PAD );
+ assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) && ref - TSCRIPT_AUX(tr)->ref < tr->end - tr->beg + 2*N_REF_PAD );
int i = 0;
while ( ref[i] && vcf[i] )
{
@@ -3479,7 +2527,7 @@ static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec)
int test_cds_local(args_t *args, bcf1_t *rec)
{
int i,j, ret = 0;
- const char *chr = bcf_seqname(args->hdr,rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
// note that the off-by-one extension of rlen is deliberate to account for insertions
if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
@@ -3491,12 +2539,13 @@ int test_cds_local(args_t *args, bcf1_t *rec)
while ( regitr_overlap(args->itr) )
{
gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
- tscript_t *tr = cds->tr;
+ gf_tscript_t *tr = cds->tr;
if ( !GF_is_coding(tr->type) ) continue;
ret = 1;
- if ( !tr->ref )
+ if ( !TSCRIPT_AUX(tr) )
{
+ tr->aux = calloc(sizeof(tscript_t),1);
tscript_init_ref(args, tr, chr);
tscript_splice_ref(tr);
khp_insert(trhp, args->active_tr, &tr); // only to clean the reference afterwards
@@ -3505,8 +2554,8 @@ int test_cds_local(args_t *args, bcf1_t *rec)
sanity_check_ref(args, tr, rec);
kstring_t sref;
- sref.s = tr->sref;
- sref.l = tr->nsref;
+ sref.s = TSCRIPT_AUX(tr)->sref;
+ sref.l = TSCRIPT_AUX(tr)->nsref;
sref.m = sref.l;
for (i=1; in_allele; i++)
@@ -3614,8 +2663,8 @@ int test_cds_local(args_t *args, bcf1_t *rec)
{
// create the aa variant string
kstring_t str = {0,0,0};
- int aa_rbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD - node.sbeg - node.rlen)/3+1;
- int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1;
+ int aa_rbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (TSCRIPT_AUX(tr)->nsref - 2*N_REF_PAD - node.sbeg - node.rlen)/3+1;
+ int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (TSCRIPT_AUX(tr)->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1;
kputc_('|', &str);
kputw(aa_rbeg, &str);
kprint_aa_prediction(args,aa_rbeg,tref,&str);
@@ -3633,11 +2682,11 @@ int test_cds_local(args_t *args, bcf1_t *rec)
csq_stage(args, &csq, rec);
// all this only to clean vstr when vrec is flushed
- if ( !tr->root )
- tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
- tr->root->ncsq_list++;
- hts_expand0(csq_t,tr->root->ncsq_list,tr->root->mcsq_list,tr->root->csq_list);
- csq_t *rm_csq = tr->root->csq_list + tr->root->ncsq_list - 1;
+ if ( !TSCRIPT_AUX(tr)->root )
+ TSCRIPT_AUX(tr)->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
+ TSCRIPT_AUX(tr)->root->ncsq_list++;
+ hts_expand0(csq_t,TSCRIPT_AUX(tr)->root->ncsq_list,TSCRIPT_AUX(tr)->root->mcsq_list,TSCRIPT_AUX(tr)->root->csq_list);
+ csq_t *rm_csq = TSCRIPT_AUX(tr)->root->csq_list + TSCRIPT_AUX(tr)->root->ncsq_list - 1;
rm_csq->type.vstr = str;
}
if ( csq_type & ~CSQ_COMPOUND )
@@ -3659,27 +2708,28 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
static int overlaps_warned = 0, multiploid_warned = 0;
int i, ret = 0, hap_ret;
- const char *chr = bcf_seqname(args->hdr,rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
// note that the off-by-one extension of rlen is deliberate to account for insertions
if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
while ( regitr_overlap(args->itr) )
{
gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
- tscript_t *tr = cds->tr;
+ gf_tscript_t *tr = cds->tr;
if ( !GF_is_coding(tr->type) ) continue;
if ( vbuf->keep_until < tr->end ) vbuf->keep_until = tr->end;
ret = 1;
- if ( !tr->root )
+ if ( !TSCRIPT_AUX(tr) )
{
// initialize the transcript and its haplotype tree, fetch the reference sequence
+ tr->aux = calloc(sizeof(tscript_t),1);
tscript_init_ref(args, tr, chr);
- tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
- tr->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n; // maximum ploidy = diploid
- tr->hap = (hap_node_t**) malloc(tr->nhap*sizeof(hap_node_t*));
- for (i=0; inhap; i++) tr->hap[i] = NULL;
- tr->root->nend = tr->nhap;
- tr->root->type = HAP_ROOT;
+ TSCRIPT_AUX(tr)->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
+ TSCRIPT_AUX(tr)->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n; // maximum ploidy = diploid
+ TSCRIPT_AUX(tr)->hap = (hap_node_t**) malloc(TSCRIPT_AUX(tr)->nhap*sizeof(hap_node_t*));
+ for (i=0; inhap; i++) TSCRIPT_AUX(tr)->hap[i] = NULL;
+ TSCRIPT_AUX(tr)->root->nend = TSCRIPT_AUX(tr)->nhap;
+ TSCRIPT_AUX(tr)->root->type = HAP_ROOT;
khp_insert(trhp, args->active_tr, &tr);
}
@@ -3689,7 +2739,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
if ( args->phase==PHASE_DROP_GT )
{
if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
- hap_node_t *parent = tr->hap[0] ? tr->hap[0] : tr->root;
+ hap_node_t *parent = TSCRIPT_AUX(tr)->hap[0] ? TSCRIPT_AUX(tr)->hap[0] : TSCRIPT_AUX(tr)->root;
hap_node_t *child = (hap_node_t*)calloc(1,sizeof(hap_node_t));
hap_ret = hap_init(args, parent, child, cds, rec, 1);
if ( hap_ret!=0 )
@@ -3734,8 +2784,8 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
parent->mchild = 1;
parent->child = (hap_node_t**) malloc(sizeof(hap_node_t*));
parent->child[0] = child;
- tr->hap[0] = child;
- tr->hap[0]->nend = 1;
+ TSCRIPT_AUX(tr)->hap[0] = child;
+ TSCRIPT_AUX(tr)->hap[0]->nend = 1;
continue;
}
@@ -3793,12 +2843,12 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
assert( ial < rec->n_allele );
if ( rec->d.allele[ial][0]=='<' || rec->d.allele[ial][0]=='*' ) { continue; }
- hap_node_t *parent = tr->hap[i] ? tr->hap[i] : tr->root;
+ hap_node_t *parent = TSCRIPT_AUX(tr)->hap[i] ? TSCRIPT_AUX(tr)->hap[i] : TSCRIPT_AUX(tr)->root;
if ( parent->cur_rec==rec && parent->cur_child[ial]>=0 )
{
// this haplotype has been seen in another sample
- tr->hap[i] = parent->child[ parent->cur_child[ial] ];
- tr->hap[i]->nend++;
+ TSCRIPT_AUX(tr)->hap[i] = parent->child[ parent->cur_child[ial] ];
+ TSCRIPT_AUX(tr)->hap[i]->nend++;
parent->nend--;
continue;
}
@@ -3852,8 +2902,8 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
hts_expand0(hap_node_t*,parent->nchild,parent->mchild,parent->child);
parent->cur_child[ial] = j;
parent->child[j] = child;
- tr->hap[i] = child;
- tr->hap[i]->nend++;
+ TSCRIPT_AUX(tr)->hap[i] = child;
+ TSCRIPT_AUX(tr)->hap[i]->nend++;
parent->nend--;
}
}
@@ -3933,7 +2983,7 @@ void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec)
}
int test_utr(args_t *args, bcf1_t *rec)
{
- const char *chr = bcf_seqname(args->hdr,rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
// note that the off-by-one extension of rlen is deliberate to account for insertions
if ( !regidx_overlap(args->idx_utr,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
@@ -3944,7 +2994,7 @@ int test_utr(args_t *args, bcf1_t *rec)
while ( regitr_overlap(args->itr) )
{
gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*);
- tscript_t *tr = splice.tr = utr->tr;
+ gf_tscript_t *tr = splice.tr = utr->tr;
for (i=1; in_allele; i++)
{
if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; }
@@ -3971,7 +3021,7 @@ int test_utr(args_t *args, bcf1_t *rec)
}
int test_splice(args_t *args, bcf1_t *rec)
{
- const char *chr = bcf_seqname(args->hdr,rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
if ( !regidx_overlap(args->idx_exon,chr,rec->pos,rec->pos + rec->rlen, args->itr) ) return 0;
splice_t splice;
@@ -4003,7 +3053,7 @@ int test_splice(args_t *args, bcf1_t *rec)
}
int test_tscript(args_t *args, bcf1_t *rec)
{
- const char *chr = bcf_seqname(args->hdr,rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
if ( !regidx_overlap(args->idx_tscript,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
splice_t splice;
@@ -4012,7 +3062,7 @@ int test_tscript(args_t *args, bcf1_t *rec)
int i, ret = 0;
while ( regitr_overlap(args->itr) )
{
- tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*);
+ gf_tscript_t *tr = splice.tr = regitr_payload(args->itr, gf_tscript_t*);
for (i=1; in_allele; i++)
{
if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; }
@@ -4046,7 +3096,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
warned = 1;
}
- const char *chr = bcf_seqname(args->hdr,rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
// only insertions atm
int beg = rec->pos + 1;
@@ -4061,7 +3111,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
csq_t csq;
memset(&csq, 0, sizeof(csq_t));
gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
- tscript_t *tr = cds->tr;
+ gf_tscript_t *tr = cds->tr;
csq.type.type = (GF_is_coding(tr->type) ? CSQ_CODING_SEQUENCE : CSQ_NON_CODING) | csq_class;
csq.pos = rec->pos;
csq.type.biotype = tr->type;
@@ -4079,7 +3129,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
csq_t csq;
memset(&csq, 0, sizeof(csq_t));
gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*);
- tscript_t *tr = utr->tr;
+ gf_tscript_t *tr = utr->tr;
csq.type.type = (utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3) | csq_class;
csq.pos = rec->pos;
csq.type.biotype = tr->type;
@@ -4118,7 +3168,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
{
csq_t csq;
memset(&csq, 0, sizeof(csq_t));
- tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*);
+ gf_tscript_t *tr = splice.tr = regitr_payload(args->itr, gf_tscript_t*);
splice.vcf.alt = rec->d.allele[1];
splice.csq = csq_class;
int splice_ret = splice_csq(args, &splice, tr->beg, tr->end);
@@ -4179,7 +3229,10 @@ static void process(args_t *args, bcf1_t **rec_ptr)
// Perform a simple sanity check (that does not catch much), the chromosome must be present in the
// reference file
if ( !faidx_has_seq(args->fai,bcf_seqname(args->hdr,rec)) )
- error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname);
+ {
+ if ( !faidx_has_seq(args->fai,drop_chr_prefix(args,bcf_seqname(args->hdr,rec))) && !faidx_has_seq(args->fai,add_chr_prefix(args,bcf_seqname(args->hdr,rec))) )
+ error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname);
+ }
}
if ( prev_pos > rec->pos )
error("Error: The file is not sorted, %s:%d comes before %s:%"PRId64"\n",bcf_seqname(args->hdr,rec),prev_pos+1,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
@@ -4254,9 +3307,12 @@ static const char *usage(void)
" r: require phased GTs, throw an error on unphased het GTs\n"
" R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
" s: skip unphased hets\n"
- "Options:\n"
- " -e, --exclude EXPR Exclude sites for which the expression is true\n"
+ "GFF options:\n"
+ " --dump-gff FILE.gz Dump the parsed GFF file (for debugging purposes)\n"
" --force Run even if some sanity checks fail\n"
+ " --unify-chr-names 1|0 Automatically unify chromosome naming (e.g. chrX vs X) in GFF, fasta, and VCF [1]\n"
+ "General options:\n"
+ " -e, --exclude EXPR Exclude sites for which the expression is true\n"
" -i, --include EXPR Select sites for which the expression is true\n"
" --no-version Do not append version and command line to the header\n"
" -o, --output FILE Write output to a file [standard output]\n"
@@ -4272,6 +3328,7 @@ static const char *usage(void)
" --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"
" --threads INT Use multithreading with worker threads [0]\n"
" -v, --verbose INT Verbosity level 0-2 [1]\n"
+ " --write-index Automatically index the output files [off]\n"
"\n"
"Example:\n"
" bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n"
@@ -4292,6 +3349,7 @@ int main_csq(int argc, char *argv[])
args->verbosity = 1;
args->record_cmd_line = 1;
args->clevel = -1;
+ args->unify_chr_names = 1;
static struct option loptions[] =
{
@@ -4321,6 +3379,9 @@ int main_csq(int argc, char *argv[])
{"targets-file",1,0,'T'},
{"targets-overlap",required_argument,NULL,5},
{"no-version",no_argument,NULL,3},
+ {"write-index",no_argument,NULL,6},
+ {"dump-gff",required_argument,NULL,7},
+ {"unify-chr-names",required_argument,NULL,8},
{0,0,0,0}
};
int c, targets_is_file = 0, regions_is_file = 0;
@@ -4339,7 +3400,7 @@ int main_csq(int argc, char *argv[])
case 3 : args->record_cmd_line = 0; break;
case 'b':
args->brief_predictions = 1;
- fprintf(stderr,"Warning: the -b option will be removed in future versions. Please use -B 1 instead.\n");
+ fprintf(stderr,"Warning: The -b option will be removed in future versions. Please use -B 1 instead.\n");
break;
case 'B':
args->brief_predictions = strtol(optarg,&tmp,10);
@@ -4409,6 +3470,13 @@ int main_csq(int argc, char *argv[])
targets_overlap = parse_overlap_option(optarg);
if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg);
break;
+ case 6 : args->write_index = 1; break;
+ case 7 : args->dump_gff = optarg; break;
+ case 8 :
+ if ( !strcmp(optarg,"0") ) args->unify_chr_names = 0;
+ else if ( !strcmp(optarg,"1") ) args->unify_chr_names = 1;
+ else error("Could not parse: --unify-chr-names %s\n",optarg);
+ break;
case 'h':
case '?': error("%s",usage());
default: error("The option not recognised: %s\n\n", optarg); break;
diff --git a/doc/bcftools.1 b/doc/bcftools.1
index 0e3d5290e..c940065fb 100644
--- a/doc/bcftools.1
+++ b/doc/bcftools.1
@@ -2,12 +2,12 @@
.\" Title: bcftools
.\" Author: [see the "AUTHOR(S)" section]
.\" Generator: Asciidoctor 2.0.16.dev
-.\" Date: 2023-02-21
+.\" Date: 2023-07-25
.\" Manual: \ \&
.\" Source: \ \&
.\" Language: English
.\"
-.TH "BCFTOOLS" "1" "2023-02-21" "\ \&" "\ \&"
+.TH "BCFTOOLS" "1" "2023-07-25" "\ \&" "\ \&"
.ie \n(.g .ds Aq \(aq
.el .ds Aq '
.ss \n[.ss] 0
@@ -51,10 +51,10 @@ standard input (stdin) and outputs to the standard output (stdout). Several
commands can thus be combined with Unix pipes.
.SS "VERSION"
.sp
-This manual page was last updated \fB2023\-02\-21\fP and refers to bcftools git version \fB1.17\fP.
+This manual page was last updated \fB2023\-07\-25\fP and refers to bcftools git version \fB1.18\fP.
.SS "BCF1"
.sp
-The BCF1 format output by versions of samtools <= 0.1.19 is \fBnot\fP
+The obsolete BCF1 format output by versions of samtools <= 0.1.19 is \fBnot\fP
compatible with this version of bcftools. To read BCF1 files one can use
the view command from old versions of bcftools packaged with samtools
versions <= 0.1.19 to convert to VCF, which can then be read by
@@ -75,6 +75,9 @@ done with \fIbcftools view\fP. Users are now required to choose between the old
samtools calling model (\fI\-c/\-\-consensus\-caller\fP) and the new multiallelic
calling model (\fI\-m/\-\-multiallelic\-caller\fP). The multiallelic calling model
is recommended for most tasks.
+.SS "FILTERING EXPRESSIONS"
+.sp
+See \fBEXPRESSIONS\fP
.SH "LIST OF COMMANDS"
.sp
For a full list of available commands, run \fBbcftools\fP without arguments. For a full
@@ -344,6 +347,17 @@ Some helper scripts are bundled with the bcftools code.
. sp -1
. IP \(bu 2.3
.\}
+\fBgff2gff\fP .. converts a GFF file to the format required by \fBcsq\fP
+.RE
+.sp
+.RS 4
+.ie n \{\
+\h'-04'\(bu\h'+03'\c
+.\}
+.el \{\
+. sp -1
+. IP \(bu 2.3
+.\}
\fBplot\-vcfstats\fP .. plots the output of \fBstats\fP
.RE
.SH "COMMANDS AND OPTIONS"
@@ -597,6 +611,11 @@ Same as \fB\-\-regions\-overlap\fP but for \fB\-t/\-T\fP.
Use multithreading with \fIINT\fP worker threads. The option is currently used only for the compression of the
output stream, only when \fI\-\-output\-type\fP is \fIb\fP or \fIz\fP. Default: 0.
.RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output files. Can be used only for compressed BCF and VCF output.
+.RE
.SS "bcftools annotate \fI[OPTIONS]\fP \fIFILE\fP"
.sp
Add or remove annotations.
@@ -881,6 +900,11 @@ except GT. To remove all INFO tags except "FOO" and "BAR", use
"INFO" can be abbreviated to "INF" and "FORMAT" to "FMT".
.RE
.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
+.sp
\fBExamples:\fP
.sp
.if n .RS 4
@@ -1017,6 +1041,11 @@ see \fBCommon Options\fP
.RS 4
see \fBCommon Options\fP
.RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
.SS "Input/output options:"
.sp
\fB\-A, \-\-keep\-alts\fP
@@ -1401,6 +1430,11 @@ see \fBCommon Options\fP
.RS 4
see \fBCommon Options\fP
.RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
.SS "bcftools consensus \fI[OPTIONS]\fP \fIFILE\fP"
.sp
Create consensus sequence by applying VCF variants to a reference fasta file.
@@ -1432,18 +1466,13 @@ exclude sites for which \fIEXPRESSION\fP is true. For valid expressions see
reference sequence in fasta format
.RE
.sp
-\fB\-H, \-\-haplotype\fP \fI1\fP|\fI2\fP|\fIR\fP|\fIA\fP|\fII\fP|\fILR\fP|\fILA\fP|\fISR\fP|\fISA\fP|\fI1pIu\fP|\fI2pIu\fP
+\fB\-H, \-\-haplotype\fP N|\fIR\fP|\fIA\fP|\fII\fP|\fILR\fP|\fILA\fP|\fISR\fP|\fISA\fP|\fINpIu\fP
.RS 4
choose which allele from the FORMAT/GT field to use (the codes are case\-insensitive):
.sp
-\fI1\fP
-.RS 4
-the first allele, regardless of phasing
-.RE
-.sp
-\fI2\fP
+\fIN\fP
.RS 4
-the second allele, regardless of phasing
+N={1,2,3,...}, the allele index within the genotype, regardless of phasing
.RE
.sp
\fIR\fP
@@ -1471,20 +1500,15 @@ the longer allele. If both have the same length, use the REF allele (LR), or the
the shorter allele. If both have the same length, use the REF allele (SR), or the ALT allele (SA)
.RE
.sp
-\fI1pIu, 2pIu\fP
+\fINpIu\fP
.RS 4
-first/second allele for phased genotypes and IUPAC code for unphased genotypes
-.sp
-.if n .RS 4
-.nf
-.fam C
-This option requires *\-s*, unless exactly one sample is present in the VCF
-.fam
-.fi
-.if n .RE
+N={1,2,3,...}, the allele index within genotype for phased genotypes and IUPAC code for unphased genotypes.
+For example, \fI1pIu\fP or \fI2pIu\fP
.RE
.RE
.sp
+Note that the \fB\-H, \-\-haplotype\fP option requires the \fB\-s, \-\-samples\fP option, unless exactly one sample is present in the VCF
+.sp
\fB\-i, \-\-include\fP \fIEXPRESSION\fP
.RS 4
include only sites for which \fIEXPRESSION\fP is true. For valid expressions see
@@ -1494,24 +1518,24 @@ include only sites for which \fIEXPRESSION\fP is true. For valid expressions see
\fB\-I, \-\-iupac\-codes\fP
.RS 4
output variants in the form of IUPAC ambiguity codes determined from FORMAT/GT fields. By default all
-samples are used and can be subset with \f(CR\-s, \-\-samples\fP and \f(CR\-S, \-\-samples\-file\fP. Use \f(CR\-s \-\fP to ignore
+samples are used and can be subset with \fB\-s, \-\-samples\fP and \fB\-S, \-\-samples\-file\fP. Use \fB\-s \-\fP to ignore
samples and use only the REF and ALT columns. NOTE: prior to version 1.17 the IUPAC codes were determined solely
from REF,ALT columns and sample genotypes were not considered.
.RE
.sp
\fB\-\-mark\-del\fP \fICHAR\fP
.RS 4
-instead of removing sequence, insert CHAR for deletions
+instead of removing sequence, insert character CHAR for deletions
.RE
.sp
-\fB\-\-mark\-ins\fP \fIuc\fP|\fIlc\fP
+\fB\-\-mark\-ins\fP \fIuc\fP|\fIlc\fP|\fICHAR\fP
.RS 4
-highlight inserted sequence in uppercase (uc) or lowercase (lc), leaving the rest of the sequence as is
+highlight inserted sequence in uppercase (uc), lowercase (lc), or a provided character CHAR, leaving the rest of the sequence as is
.RE
.sp
\fB\-\-mark\-snv\fP \fIuc\fP|\fIlc\fP
.RS 4
-highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest of the sequence as is
+highlight substitutions in uppercase (uc), lowercase (lc), or a provided character CHAR, leaving the rest of the sequence as is
.RE
.sp
\fB\-m, \-\-mask\fP \fIFILE\fP
@@ -1539,12 +1563,12 @@ write output to a file
.sp
\fB\-s, \-\-samples\fP \fILIST\fP
.RS 4
-apply variants of the listed samples. See also the option \f(CR\-I, \-\-iupac\-codes\fP
+apply variants of the listed samples. See also the option \fB\-I, \-\-iupac\-codes\fP
.RE
.sp
\fB\-S, \-\-samples\-file\fP \fIFILE\fP
.RS 4
-apply variants of the samples listed in the file. See also the option \f(CR\-I, \-\-iupac\-codes\fP
+apply variants of the samples listed in the file. See also the option \fB\-I, \-\-iupac\-codes\fP
.RE
.sp
\fBExamples:\fP
@@ -1563,6 +1587,44 @@ apply variants of the samples listed in the file. See also the option \f(CR\-I,
.fam
.fi
.if n .RE
+.sp
+\fBNotes:\fP
+.RS 4
+Masking options are applied in the following order
+.sp
+.RS 4
+.ie n \{\
+\h'-04' 1.\h'+01'\c
+.\}
+.el \{\
+. sp -1
+. IP " 1." 4.2
+.\}
+mask regions with \fB\-\-mask\-with\fP character if \fB\-\-mask\fP is given. All overlapping VCF variants are ignored
+.RE
+.sp
+.RS 4
+.ie n \{\
+\h'-04' 2.\h'+01'\c
+.\}
+.el \{\
+. sp -1
+. IP " 2." 4.2
+.\}
+replace sequence not mentioned in the VCF with the requested character if \fB\-\-absent\fP is given
+.RE
+.sp
+.RS 4
+.ie n \{\
+\h'-04' 3.\h'+01'\c
+.\}
+.el \{\
+. sp -1
+. IP " 3." 4.2
+.\}
+finally apply \fB\-\-mark\-del\fP, \fB\-\-mark\-ins\fP, \fB\-\-mark\-snv\fP masks
+.RE
+.RE
.SS "bcftools convert \fI[OPTIONS]\fP \fIFILE\fP"
.SS "VCF input options:"
.sp
@@ -1617,6 +1679,11 @@ see \fBCommon Options\fP
.RS 4
see \fBCommon Options\fP
.RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
.SS "VCF output options:"
.sp
\fB\-\-no\-version\fP
@@ -1887,13 +1954,13 @@ convert from TSV (tab\-separated values) format (such as generated by
\fB\-c, \-\-columns\fP \fIlist\fP
.RS 4
comma\-separated list of fields in the input file. In the current
-version, the fields CHROM, POS, ID, and AA are expected and
-can appear in arbitrary order, columns which should be ignored in the input
+version, the fields CHROM, POS, ID, and AA or REF, ALT are expected and
+can appear in arbitrary order. Columns which should be ignored in the input
file can be indicated by "\-".
The AA field lists alleles on the forward reference strand,
for example "CC" or "CT" for diploid genotypes or "C"
for haploid genotypes (sex chromosomes). Insertions and deletions
-are not supported yet, missing data can be indicated with "\-\-".
+are supported only with REF and ALT but not with AA. Missing data can be indicated with "\-\-" or ".".
.RE
.sp
\fB\-f, \-\-fasta\-ref\fP \fIfile\fP
@@ -1917,7 +1984,10 @@ file of sample names. See \fBCommon Options\fP
.nf
.fam C
# Convert 23andme results into VCF
-bcftools convert \-c ID,CHROM,POS,AA \-s SampleName \-f 23andme\-ref.fa \-\-tsv2vcf 23andme.txt \-Oz \-o out.vcf.gz
+bcftools convert \-c ID,CHROM,POS,AA \-s SampleName \-f 23andme\-ref.fa \-\-tsv2vcf 23andme.txt \-o out.vcf.gz
+
+# Convert tab\-delimited file into a sites\-only VCF (no genotypes), in this example first column to be ignored
+bcftools convert \-c \-,CHROM,POS,REF,ALT \-f ref.fa \-\-tsv2vcf calls.txt \-o out.bcf
.fam
.fi
.if n .RE
@@ -1966,6 +2036,12 @@ aminoacids, with \fB\-B 1\fP only an abbreviated version such as \fI25E..329>25G
written.
.RE
.sp
+\fB\-\-dump\-gff\fP \fIFILE\fP
+.RS 4
+dump the parsed GFF into a gzipped FILE. Intended for debugging purposes,
+shows how is the input GFF viewed by the program.
+.RE
+.sp
\fB\-e, \-\-exclude\fP \fIEXPRESSION\fP
.RS 4
exclude sites for which \fIEXPRESSION\fP is true. For valid expressions see
@@ -1987,6 +2063,7 @@ transcripts in malformatted GFFs with incorrect phase
.RS 4
GFF3 annotation file (required), such as \c
.URL "ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens" "" "."
+The script \fBgff2gff\fP can help with conversion from non\-standard GFF formats.
An example of a minimal working GFF file:
.RE
.sp
@@ -1998,6 +2075,17 @@ An example of a minimal working GFF file:
# the gene (determined from the transcript\*(Aqs "Parent=gene:" attribute), and the biotype
# (the most interesting is "protein_coding").
#
+ # Empty and commented lines are skipped, the following GFF columns are required
+ # 1. chromosome
+ # 2. IGNORED
+ # 3. type (CDS, exon, three_prime_UTR, five_prime_UTR, gene, transcript, etc.)
+ # 4. start of the feature (1\-based)
+ # 5. end of the feature (1\-based)
+ # 6. IGNORED
+ # 7. strand (+ or \-)
+ # 8. phase (0, 1, 2 or .)
+ # 9. semicolon\-separated attributes (see below)
+ #
# Attributes required for
# gene lines:
# \- ID=gene:
@@ -2137,6 +2225,18 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-unify\-chr\-names\fP \fI0\fP|\fI1\fP
+.RS 4
+Automatically detect and unify chromosome naming conventions in the GFF, fasta
+and VCF, such as "chrX" vs "X". The chromosome names in the output VCF will match
+that of the input VCF. The default is to attempt the automatic translation.
+.RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
+.sp
\fBExamples:\fP
.sp
.if n .RS 4
@@ -2366,6 +2466,11 @@ see \fBCommon Options\fP
.RS 4
see \fBCommon Options\fP
.RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
.SS "bcftools gtcheck [\fIOPTIONS\fP] [\fB\-g\fP \fIgenotypes.vcf.gz\fP] \fIquery.vcf.gz\fP"
.sp
Checks sample identity. The program can operate in two modes. If the \fB\-g\fP
@@ -2373,6 +2478,10 @@ option is given, the identity of samples from \fIquery.vcf.gz\fP
is checked against the samples in the \fB\-g\fP file.
Without the \fB\-g\fP option, multi\-sample cross\-check of samples in \fIquery.vcf.gz\fP is performed.
.sp
+Note that the interpretation of the discordance score depends on the options provided (specifically \fB\-e\fP and
+\fB\-u\fP) and on the available annotations (FORMAT/PL vs FORMAT/GT).
+The discordance score can be interpreted as the number of mismatching genotypes if only GT\-vs\-GT matching is performed.
+.sp
\fB\-\-distinctive\-sites\fP \fINUM[,MEM[,DIR]]\fP
.RS 4
Find sites that can distinguish between at least NUM sample pairs. If the number is smaller or equal to 1,
@@ -2391,11 +2500,18 @@ Stop after first record to estimate required time.
Interpret genotypes and genotype likelihoods probabilistically. The value of \fIINT\fP
represents genotype quality when GT tag is used (e.g. Q=30 represents one error in 1,000 genotypes and
Q=40 one error in 10,000 genotypes) and is ignored when PL tag is used (in that case an arbitrary
-non\-zero integer can be provided). See also the \fB\-u, \-\-use\fP option below. If set to 0,
-the discordance equals to the number of mismatching genotypes when GT vs GT is compared.
-Note that the values with and without \fB\-e\fP are not comparable, only values generated
-with \fB\-e 0\fP correspond to mismatching genotypes.
-If performance is an issue, set to 0 for faster run but less accurate results.
+non\-zero integer can be provided).
+\~
+.br
+\~
+.br
+If \fB\-e\fP is set to 0, the discordance score can be interpreted as the number of mismatching genotypes,
+but only in the GT\-vs\-GT matching mode. See the \fB\-u, \-\-use\fP option below for additional notes and caveats.
+\~
+.br
+\~
+.br
+If performance is an issue, set \fB\-e 0\fP for faster run times but less accurate results.
.RE
.sp
\fB\-g, \-\-genotypes\fP \fIFILE\fP
@@ -2476,8 +2592,15 @@ see \fBCommon Options\fP
\fB\-u, \-\-use\fP \fITAG1\fP[,\fITAG2\fP]
.RS 4
specifies which tag to use in the query file (\fITAG1\fP) and the \fB\-g\fP (\fITAG2\fP) file.
-By default, the PL tag is used in the query file and GT in the \fB\-g\fP file when
-available.
+By default, the PL tag is used in the query file and, when available, the GT tags in the
+\fB\-g\fP file.
+\~
+.br
+\~
+.br
+Note that when the requested tag is not available, the program will attempt to use
+the other tag. The output includes the number of sites that were matched by the four
+possible mode (for example GT\-vs\-GT or GT\-vs\-PL).
.RE
.sp
\fBExamples:\fP
@@ -2676,6 +2799,11 @@ see \fBCommon Options\fP
list of input files to output given as 1\-based indices. With \fB\-p\fP and no
\fB\-w\fP, all files are written.
.RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file. This is done automatically with the \fB\-p\fP option.
+.RE
.SS "Examples:"
.sp
Create intersection and complements of two sets saving the output in dir/*
@@ -2785,7 +2913,8 @@ merge gVCF blocks, INFO/END tag is expected. If the reference fasta
file \fIFILE\fP is not given and the dash (\fI\-\fP) is given, unknown reference
bases generated at gVCF block splits will be substituted with N\(cqs.
The \fB\-\-gvcf\fP option uses the following default INFO rules:
-\fB\-i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\fP.
+\fB\-i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\fP and the following missing
+rules: \fB\-M PL:max,AD:0\fP.
.RE
.sp
\fB\-i, \-\-info\-rules\fP \fI\-\fP|\fITAG:METHOD\fP[,...]
@@ -2835,6 +2964,17 @@ The option controls what types of multiallelic records can be created:
.fi
.if n .RE
.sp
+\fB\-M, \-\-missing\-rules\fP \fI\-\fP|\fITAG:METHOD\fP[,...]
+.RS 4
+Rules for merging vector tags at multiallelic sites. When input files have different alternate
+alleles, vector fields pertaining to unobserved alleles are set to missing (\fI.\fP) by default.
+The \fIMETHOD\fP is one of \fI.\fP (the default, use missing values), \fINUMBER\fP (use a constant value, e.g. 0),
+\fImax\fP (the maximum value observed for other alleles in the sample). When \fB\-\-gvcf\fP option is set,
+the rule \fB\-M PL:max,AD:0\fP is implied. This can be overriden with providing \fB\-M \-\fP or \fB\-M PL:.,AD:.\fP.
+Note that if the unobserved allele is explicitly present as \fI<*>\fP or \fI\fP, then its corresponding
+value will be used regardless of \fB\-M\fP settings.
+.RE
+.sp
\fB\-\-no\-index\fP
.RS 4
the option allows to merge files without indexing them first. In order for this
@@ -2876,6 +3016,11 @@ see \fBCommon Options\fP
.RS 4
see \fBCommon Options\fP
.RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
.SS "bcftools mpileup [\fIOPTIONS\fP] \fB\-f\fP \fIref.fa\fP \fIin.bam\fP [\fIin2.bam\fP [...]]"
.sp
Generate VCF or BCF containing genotype likelihoods for one or multiple
@@ -3209,6 +3354,11 @@ BQB.
.fi
.if n .RE
.RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
.SS "Options for SNP/INDEL genotype likelihood computation"
.sp
\fB\-X, \-\-config\fP \fISTR\fP
@@ -3431,6 +3581,13 @@ try to proceed with \fB\-m\-\fP even if malformed tags with incorrect number of
are encountered, discarding such tags. (Experimental, use at your own risk.)
.RE
.sp
+\fB\-g, \-\-gff\-annot\fP \fIFILE\fP
+.RS 4
+when a GFF file is provided, follow HGVS 3\(cqrule and right\-align variants in transcripts on the forward
+strand. In case of overlapping transcripts, the default mode is to left\-align the variant. For a
+description of the supported GFF3 file format see \fBbcftools csq\fP.
+.RE
+.sp
\fB\-\-keep\-sum\fP \fITAG\fP[,...]
.RS 4
keep vector sum constant when splitting multiallelic sites. Only AD tag
@@ -3528,6 +3685,11 @@ see \fBCommon Options\fP
maximum distance between two records to consider when locally
sorting variants which changed position during the realignment
.RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
.SS "bcftools [plugin \fINAME\fP|+\fINAME\fP] \fI[OPTIONS]\fP \fIFILE\fP \(em \fI[PLUGIN OPTIONS]\fP"
.sp
A common framework for various utilities. The plugins can be used
@@ -3601,6 +3763,11 @@ see \fBCommon Options\fP
.RS 4
see \fBCommon Options\fP
.RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
.SS "Plugin options:"
.sp
\fB\-h, \-\-help\fP
@@ -4723,7 +4890,13 @@ see \fBCommon Options\fP
.sp
\fB\-T, \-\-temp\-dir\fP \fIDIR\fP
.RS 4
-Use this directory to store temporary files
+Use this directory to store temporary files. If the last six characters of the string DIR are XXXXXX,
+then these are replaced with a string that makes the directory name unique.
+.RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
.RE
.SS "bcftools stats [\fIOPTIONS\fP] \fIA.vcf.gz\fP [\fIB.vcf.gz\fP]"
.sp
@@ -4943,6 +5116,11 @@ see \fBCommon Options\fP
.RS 4
see \fBCommon Options\fP
.RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
.SS "Subset options:"
.sp
\fB\-a, \-\-trim\-alt\-alleles\fP
@@ -5137,7 +5315,7 @@ important libraries used by bcftools.
.SS "bcftools [\fI\-\-version\-only\fP]"
.sp
Display the full bcftools version number in a machine\-readable format.
-.SH "EXPRESSIONS"
+.SH "FILTERING EXPRESSIONS"
.sp
These filtering expressions are accepted by most of the commands.
.sp
@@ -5919,7 +6097,18 @@ bcftools view \-i \*(Aq%ID!="." & MAF[0]<0.01\*(Aq
.if n .RE
.sp
Please refer to the documentation of your shell for details.
-.SH "SCRIPTS AND OPTIONS"
+.SH "SCRIPTS"
+.SS "gff2gff"
+.sp
+Attempts to fix a GFF file to be correctly parsed by \fBcsq\fP.
+.sp
+.if n .RS 4
+.nf
+.fam C
+zcat in.gff.gz | gff2gff | gzip \-c > out.gff.gz
+.fam
+.fi
+.if n .RE
.SS "plot\-vcfstats [\fIOPTIONS\fP] \fIfile.vchk\fP [...]"
.sp
Script for processing output of \fBbcftools stats\fP. It can merge
@@ -6013,8 +6202,10 @@ Please report any bugs you encounter on the github website: \c
.sp
Heng Li from the Sanger Institute wrote the original C version of htslib,
samtools and bcftools. Bob Handsaker from the Broad Institute implemented the
-BGZF library. Petr Danecek, Shane McCarthy and John Marshall are maintaining
-and further developing bcftools. Many other people contributed to the program
+BGZF library. Petr Danecek is maintaining and further developing bcftools, together
+with the rest of the \c
+.URL "https://www.sanger.ac.uk/tool/samtools\-bcftools\-htslib" "samtools team" "."
+Many other people contributed to the program
and to the file format specifications, both directly and indirectly by
providing patches, testing and reporting bugs. We thank them all.
.SH "RESOURCES"
diff --git a/doc/bcftools.html b/doc/bcftools.html
index 5a4f5ae51..0b4baab9e 100644
--- a/doc/bcftools.html
+++ b/doc/bcftools.html
@@ -50,13 +50,13 @@ DESCRIPTION
VERSION
-
This manual page was last updated 2023-02-21 and refers to bcftools git version 1.17.
+
This manual page was last updated 2023-07-25 and refers to bcftools git version 1.18.
BCF1
-
The BCF1 format output by versions of samtools <= 0.1.19 is not
+
The obsolete BCF1 format output by versions of samtools <= 0.1.19 is not
compatible with this version of bcftools. To read BCF1 files one can use
the view command from old versions of bcftools packaged with samtools
versions <= 0.1.19 to convert to VCF, which can then be read by
@@ -79,6 +79,12 @@
VARIANT CALLING
is recommended for most tasks.
+
+
FILTERING EXPRESSIONS
+
+
@@ -172,6 +178,9 @@
LIST OF SCRIPTS
@@ -417,6 +426,10 @@
Common Options
Use multithreading with INT worker threads. The option is currently used only for the compression of the
output stream, only when --output-type is b or z. Default: 0.
+
--write-index
+
+Automatically index the output files. Can be used only for compressed BCF and VCF output.
+
@@ -668,6 +681,10 @@ bcftools annotate [OPTIONS] FILE
"^INFO/FOO,INFO/BAR" (and similarly for FORMAT and FILTER).
"INFO" can be abbreviated to "INF" and "FORMAT" to "FMT".
+--write-index
+
+Automatically index the output file
+
@@ -797,6 +814,10 @@
see Common Options
+
--write-index
+
+Automatically index the output file
+
@@ -1161,6 +1182,10 @@ bcftools concat [OPTIONS] FILE1 FILE2
see Common Options
+--write-index
+
+Automatically index the output file
+
@@ -1194,18 +1219,14 @@ bcftools consensus [OPTIONS] FILE
reference sequence in fasta format
--H, --haplotype 1|2|R|A|I|LR|LA|SR|SA|1pIu|2pIu
+-H, --haplotype N|R|A|I|LR|LA|SR|SA|NpIu
choose which allele from the FORMAT/GT field to use (the codes are case-insensitive):
-- 1
--
-
the first allele, regardless of phasing
-
-- 2
+- N
-
-
the second allele, regardless of phasing
+N={1,2,3,…}, the allele index within the genotype, regardless of phasing
- R
-
@@ -1227,18 +1248,21 @@
bcftools consensus [OPTIONS] FILE
-
the shorter allele. If both have the same length, use the REF allele (SR), or the ALT allele (SA)
- - 1pIu, 2pIu
+- NpIu
-
-
first/second allele for phased genotypes and IUPAC code for unphased genotypes
-
-
-
This option requires *-s*, unless exactly one sample is present in the VCF
-
-
+N={1,2,3,…}, the allele index within genotype for phased genotypes and IUPAC code for unphased genotypes.
+For example, 1pIu or 2pIu
+
+
+
+
Note that the -H, --haplotype option requires the -s, --samples option, unless exactly one sample is present in the VCF
+
+
+
- -i, --include EXPRESSION
-
include only sites for which EXPRESSION is true. For valid expressions see
@@ -1247,21 +1271,21 @@
bcftools consensus [OPTIONS] FILE
- -I, --iupac-codes
-
output variants in the form of IUPAC ambiguity codes determined from FORMAT/GT fields. By default all
-samples are used and can be subset with -s, --samples
and -S, --samples-file
. Use -s -
to ignore
+samples are used and can be subset with -s, --samples and -S, --samples-file. Use -s - to ignore
samples and use only the REF and ALT columns. NOTE: prior to version 1.17 the IUPAC codes were determined solely
from REF,ALT columns and sample genotypes were not considered.
- --mark-del CHAR
-
-
instead of removing sequence, insert CHAR for deletions
+instead of removing sequence, insert character CHAR for deletions
-- --mark-ins uc|lc
+- --mark-ins uc|lc|CHAR
-
-
highlight inserted sequence in uppercase (uc) or lowercase (lc), leaving the rest of the sequence as is
+highlight inserted sequence in uppercase (uc), lowercase (lc), or a provided character CHAR, leaving the rest of the sequence as is
- --mark-snv uc|lc
-
-
highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest of the sequence as is
+highlight substitutions in uppercase (uc), lowercase (lc), or a provided character CHAR, leaving the rest of the sequence as is
- -m, --mask FILE
-
@@ -1284,11 +1308,11 @@
bcftools consensus [OPTIONS] FILE
- -s, --samples LIST
-
-
apply variants of the listed samples. See also the option -I, --iupac-codes
+apply variants of the listed samples. See also the option -I, --iupac-codes
- -S, --samples-file FILE
-
-
apply variants of the samples listed in the file. See also the option -I, --iupac-codes
+apply variants of the samples listed in the file. See also the option -I, --iupac-codes
@@ -1307,6 +1331,27 @@ bcftools consensus [OPTIONS] FILE
# For more examples see http://samtools.github.io/bcftools/howtos/consensus-sequence.html
+
+
+- Notes:
+-
+
Masking options are applied in the following order
+
+
+-
+
mask regions with --mask-with character if --mask is given. All overlapping VCF variants are ignored
+
+-
+
replace sequence not mentioned in the VCF with the requested character if --absent is given
+
+-
+
finally apply --mark-del, --mark-ins, --mark-snv masks
+
+
+
+
+
+
bcftools convert [OPTIONS] FILE
@@ -1356,6 +1401,10 @@
see Common Options
+
--write-index
+
+Automatically index the output file
+
@@ -1637,13 +1686,13 @@ TSV conversion:
-c, --columns list
comma-separated list of fields in the input file. In the current
-version, the fields CHROM, POS, ID, and AA are expected and
-can appear in arbitrary order, columns which should be ignored in the input
+version, the fields CHROM, POS, ID, and AA or REF, ALT are expected and
+can appear in arbitrary order. Columns which should be ignored in the input
file can be indicated by "-".
The AA field lists alleles on the forward reference strand,
for example "CC" or "CT" for diploid genotypes or "C"
for haploid genotypes (sex chromosomes). Insertions and deletions
-are not supported yet, missing data can be indicated with "--".
+are supported only with REF and ALT but not with AA. Missing data can be indicated with "--" or ".".
-f, --fasta-ref file
@@ -1665,7 +1714,10 @@ TSV conversion:
# Convert 23andme results into VCF
-bcftools convert -c ID,CHROM,POS,AA -s SampleName -f 23andme-ref.fa --tsv2vcf 23andme.txt -Oz -o out.vcf.gz
+bcftools convert -c ID,CHROM,POS,AA -s SampleName -f 23andme-ref.fa --tsv2vcf 23andme.txt -o out.vcf.gz
+
+# Convert tab-delimited file into a sites-only VCF (no genotypes), in this example first column to be ignored
+bcftools convert -c -,CHROM,POS,REF,ALT -f ref.fa --tsv2vcf calls.txt -o out.bcf
@@ -1721,6 +1773,11 @@ bcftools csq [OPTIONS] FILE
aminoacids, with -B 1 only an abbreviated version such as 25E..329>25G..94 will be
written.
+--dump-gff FILE
+
+dump the parsed GFF into a gzipped FILE. Intended for debugging purposes,
+shows how is the input GFF viewed by the program.
+
-e, --exclude EXPRESSION
exclude sites for which EXPRESSION is true. For valid expressions see
@@ -1738,6 +1795,7 @@
bcftools csq [OPTIONS] FILE
-g, --gff-annot FILE
GFF3 annotation file (required), such as ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens.
+The script gff2gff can help with conversion from non-standard GFF formats.
An example of a minimal working GFF file:
@@ -1749,6 +1807,17 @@ bcftools csq [OPTIONS] FILE
# the gene (determined from the transcript's "Parent=gene:" attribute), and the biotype
# (the most interesting is "protein_coding").
#
+ # Empty and commented lines are skipped, the following GFF columns are required
+ # 1. chromosome
+ # 2. IGNORED
+ # 3. type (CDS, exon, three_prime_UTR, five_prime_UTR, gene, transcript, etc.)
+ # 4. start of the feature (1-based)
+ # 5. end of the feature (1-based)
+ # 6. IGNORED
+ # 7. strand (+ or -)
+ # 8. phase (0, 1, 2 or .)
+ # 9. semicolon-separated attributes (see below)
+ #
# Attributes required for
# gene lines:
# - ID=gene:<gene_id>
@@ -1871,6 +1940,16 @@ bcftools csq [OPTIONS] FILE
see Common Options
+--unify-chr-names 0|1
+
+Automatically detect and unify chromosome naming conventions in the GFF, fasta
+and VCF, such as "chrX" vs "X". The chromosome names in the output VCF will match
+that of the input VCF. The default is to attempt the automatic translation.
+
+--write-index
+
+Automatically index the output file
+
@@ -2084,6 +2163,10 @@
bcftools filter [OPTIONS] FILE
see Common Options
+
--write-index
+
+Automatically index the output file
+
@@ -2095,6 +2178,11 @@ bcftools gtcheck [OPTIONS] [-g ge
is checked against the samples in the -g file.
Without the -g option, multi-sample cross-check of samples in query.vcf.gz is performed.
+
+
Note that the interpretation of the discordance score depends on the options provided (specifically -e and
+-u) and on the available annotations (FORMAT/PL vs FORMAT/GT).
+The discordance score can be interpreted as the number of mismatching genotypes if only GT-vs-GT matching is performed.
+
- --distinctive-sites NUM[,MEM[,DIR]]
@@ -2113,11 +2201,14 @@ bcftools gtcheck [OPTIONS] [-g ge
Interpret genotypes and genotype likelihoods probabilistically. The value of INT
represents genotype quality when GT tag is used (e.g. Q=30 represents one error in 1,000 genotypes and
Q=40 one error in 10,000 genotypes) and is ignored when PL tag is used (in that case an arbitrary
-non-zero integer can be provided). See also the -u, --use option below. If set to 0,
-the discordance equals to the number of mismatching genotypes when GT vs GT is compared.
-Note that the values with and without -e are not comparable, only values generated
-with -e 0 correspond to mismatching genotypes.
-If performance is an issue, set to 0 for faster run but less accurate results.
+non-zero integer can be provided).
+
+
+If -e is set to 0, the discordance score can be interpreted as the number of mismatching genotypes,
+but only in the GT-vs-GT matching mode. See the -u, --use option below for additional notes and caveats.
+
+
+If performance is an issue, set -e 0 for faster run times but less accurate results.
- -g, --genotypes FILE
-
@@ -2191,8 +2282,13 @@
bcftools gtcheck [OPTIONS] [-g ge
- -u, --use TAG1[,TAG2]
-
specifies which tag to use in the query file (TAG1) and the -g (TAG2) file.
-By default, the PL tag is used in the query file and GT in the -g file when
-available.
+By default, the PL tag is used in the query file and, when available, the GT tags in the
+-g file.
+
+
+Note that when the requested tag is not available, the program will attempt to use
+the other tag. The output includes the number of sites that were matched by the four
+possible mode (for example GT-vs-GT or GT-vs-PL).
@@ -2394,6 +2490,10 @@ bcftools isec [OPTIONS] A.vcf.gz B.vcf.gz
list of input files to output given as 1-based indices. With -p and no
-w, all files are written.
+--write-index
+
+Automatically index the output file. This is done automatically with the -p option.
+
@@ -2497,7 +2597,8 @@
bcftools merge [OPTIONS] A.vcf.gz B.vcf.gz<
file FILE is not given and the dash (-) is given, unknown reference
bases generated at gVCF block splits will be substituted with N’s.
The --gvcf option uses the following default INFO rules:
--i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max.
+-i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max and the following missing
+rules: -M PL:max,AD:0.
-i, --info-rules -|TAG:METHOD[,…]
@@ -2543,6 +2644,16 @@ bcftools merge [OPTIONS] A.vcf.gz B.vcf.gz<
+- -M, --missing-rules -|TAG:METHOD[,…]
+-
+
Rules for merging vector tags at multiallelic sites. When input files have different alternate
+alleles, vector fields pertaining to unobserved alleles are set to missing (.) by default.
+The METHOD is one of . (the default, use missing values), NUMBER (use a constant value, e.g. 0),
+max (the maximum value observed for other alleles in the sample). When --gvcf option is set,
+the rule -M PL:max,AD:0 is implied. This can be overriden with providing -M - or -M PL:.,AD:..
+Note that if the unobserved allele is explicitly present as <*> or <NON_REF>, then its corresponding
+value will be used regardless of -M settings.
+
- --no-index
-
the option allows to merge files without indexing them first. In order for this
@@ -2577,6 +2688,10 @@
bcftools merge [OPTIONS] A.vcf.gz B.vcf.gz<
-
see Common Options
+- --write-index
+-
+
Automatically index the output file
+
@@ -2889,6 +3004,10 @@ Output options
+--write-index
+
+Automatically index the output file
+
@@ -3099,6 +3218,12 @@ bcftools norm [OPTIONS] file.vcf.gz
try to proceed with -m- even if malformed tags with incorrect number of fields
are encountered, discarding such tags. (Experimental, use at your own risk.)
+-g, --gff-annot FILE
+
+when a GFF file is provided, follow HGVS 3’rule and right-align variants in transcripts on the forward
+strand. In case of overlapping transcripts, the default mode is to left-align the variant. For a
+description of the supported GFF3 file format see bcftools csq.
+
--keep-sum TAG[,…]
keep vector sum constant when splitting multiallelic sites. Only AD tag
@@ -3179,6 +3304,10 @@
bcftools norm [OPTIONS] file.vcf.gz
maximum distance between two records to consider when locally
sorting variants which changed position during the realignment
+--write-index
+
+Automatically index the output file
+
@@ -3254,6 +3383,10 @@ VCF output options:
see Common Options
+--write-index
+
+Automatically index the output file
+
@@ -4134,7 +4267,12 @@ bcftools sort [OPTIONS] file.bcf
-T, --temp-dir DIR
-Use this directory to store temporary files
+Use this directory to store temporary files. If the last six characters of the string DIR are XXXXXX,
+then these are replaced with a string that makes the directory name unique.
+
+--write-index
+
+Automatically index the output file
@@ -4339,6 +4477,10 @@ Output options
see Common Options
+--write-index
+
+Automatically index the output file
+
@@ -4538,7 +4680,7 @@ bcftools [--version-only]
-
EXPRESSIONS
+
FILTERING EXPRESSIONS
These filtering expressions are accepted by most of the commands.
@@ -4974,9 +5116,24 @@
EXPRESSIONS
-
SCRIPTS AND OPTIONS
+
SCRIPTS
+
gff2gff
+
+
Attempts to fix a GFF file to be correctly parsed by csq.
+
+
+
+
+
+
zcat in.gff.gz | gff2gff | gzip -c > out.gff.gz
+
+
+
+
+
+
plot-vcfstats [OPTIONS] file.vchk […]
Script for processing output of bcftools stats. It can merge
@@ -5077,8 +5234,9 @@
AUTHORS
Heng Li from the Sanger Institute wrote the original C version of htslib,
samtools and bcftools. Bob Handsaker from the Broad Institute implemented the
-BGZF library. Petr Danecek, Shane McCarthy and John Marshall are maintaining
-and further developing bcftools. Many other people contributed to the program
+BGZF library. Petr Danecek is maintaining and further developing bcftools, together
+with the rest of the samtools team.
+Many other people contributed to the program
and to the file format specifications, both directly and indirectly by
providing patches, testing and reporting bugs. We thank them all.
@@ -5119,7 +5277,7 @@
COPYING