diff --git a/Dockerfile b/Dockerfile index ad8eb21..2ddfa88 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,7 +9,8 @@ RUN apt-get update --fix-missing && \ wget \ bzip2 \ make \ - gcc && \ + gcc \ + vcftools && \ rm -rf /bar/lib/apt/lists/* WORKDIR /opt/clair diff --git a/README.md b/README.md index 09cd4ec..835e966 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,10 @@ pypy3 -m pip install blosc intervaltree pip install numpy blosc intervaltree tensorflow==1.13.2 pysam matplotlib conda install -c anaconda pigz conda install -c conda-forge parallel zstd -conda install -c bioconda samtools vcflib bcftools +conda install -c bioconda samtools vcflib + +# install vcftools +sudo apt-get install vcftools # clone Clair git clone --depth=1 https://github.com/HKU-BAL/Clair.git @@ -73,6 +76,9 @@ conda config --add channels conda-forge conda create -n clair-env -c bioconda clair conda activate clair-env +# install vcftools +sudo apt-get install vcftools + # store clair.py PATH into $CLAIR variable CLAIR=`which clair.py` @@ -239,7 +245,7 @@ export CUDA_VISIBLE_DEVICES="" cat command.sh | parallel -j4 # concatenate vcf files and sort the variants called -`vcfcat ${OUTPUT_PREFIX}*.vcf | bcftools sort -m 4G - | bgziptabix snp_and_indel.vcf.gz` +vcfcat ${OUTPUT_PREFIX}*.vcf | vcf-sort -c | bgziptabix snp_and_indel.vcf.gz ``` #### Note @@ -249,7 +255,8 @@ cat command.sh | parallel -j4 * callVarBamParallel will generate commands for chr{1..22},X,Y, to call variants on all chromosomes, please use option `--includingAllContigs`. * If you are going to call on a non-human BAM file (e.g. bacteria), please use `--includingAllContigs` option to include all contigs * `CUDA_VISIBLE_DEVICES=""` makes GPUs invisible to Clair so it will use CPU for variant calling. Please note that unless you want to run `commands.sh` in serial, you cannot use GPU because one running copy of Clair will occupy all available memory of a GPU. 
Because the bottleneck of `callVarBam` is at the `CreateTensor` script, which runs on CPU, the effect of GPU acceleration is insignificant (roughly 15% faster). But if you have multiple GPU cards in your system and you want to utilize them in variant calling, you may want to split `commands.sh` into parts and run each part after first running `export CUDA_VISIBLE_DEVICES="$i"`, where `$i` is an integer starting from 0 that identifies the GPU to be used. -* `vcfcat` and `bgziptabix` commands are from [vcflib](https://github.com/vcflib/vcflib), and are installed by default using option 2 (conda). +* `vcfcat` and `bgziptabix` commands are from [vcflib](https://github.com/vcflib/vcflib), and are installed by default using option 2 (conda) or option 3 (docker). +* The `vcf-sort` command is from [vcftools](https://github.com/vcftools/vcftools). * Please also check the notes in the above sections for other considerations. ---