Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Condaify all the tools #468

Merged
merged 39 commits into from
Aug 22, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
beae3b4
add support for installing bioconda tools
armish May 17, 2017
bc8521b
biocondaify: OptiType
armish May 17, 2017
5ece879
add support for transforming fastq_reads and re-use metadata
armish May 26, 2017
d114fec
biocondaify: seqtk
armish May 26, 2017
e369b3d
fetch tool name and versions from the machine defaults
armish May 26, 2017
417a6d8
biocondaify: seq2HLA
armish May 29, 2017
d6e0cd2
improve the way we handle HLA files/tools
armish May 30, 2017
c2a4843
biocondify and upgrade picard
armish May 29, 2017
b123074
add snpeff-based annotation support
armish May 29, 2017
b438136
name HLA/VCF extraction nodes after the original ones
armish May 30, 2017
e30ec2e
Allow tool-specific variables to be used in conditional checks
armish Jun 2, 2017
47076fd
allow overriding shell when asking for the host
armish Jun 2, 2017
48cb7d7
create bwa-mem-based optitype's workdir before attempting to write there
armish Jun 2, 2017
a1fd60b
make use of the new shell override function
armish Jun 2, 2017
99d607e
also mkdir for the new fastqs
armish Jun 2, 2017
8ab4937
fix path issues with optitype index
armish Jun 2, 2017
7ebb3c0
add forgotten optitype init to prepend the filtering
armish Jun 3, 2017
c3c4575
wip: cnvkit
armish Jun 8, 2017
5fdc61d
init optitype
armish Jun 8, 2017
255d20e
partially commented out code so that it can compile
armish Jun 8, 2017
b274ddf
minor style fix to the conda env
armish Jun 9, 2017
ff5fca7
isolate bwa and optitype work folders better and initialize optitype …
armish Jun 9, 2017
f515df8
wip: cibersort
armish Jun 9, 2017
0bc0a81
wrap mutect with a java-7 conda environment for compatibility
armish Jun 10, 2017
82a5819
comment out cibersort for now
armish Jun 10, 2017
cfc0a44
bump vaxrank
armish Jun 10, 2017
27cfcfa
just a bit more ciber
armish Jun 17, 2017
0b7b6ef
wip: cibersort
armish Jun 30, 2017
f5d604f
save vaxrank's debug output together with the reports
armish Jul 10, 2017
c875641
bump vaxrank version
armish Jul 11, 2017
f70eaf8
use '.' instead 'source' for better shell compatibility
armish Jul 12, 2017
00b02ab
add base_packages to the python tool definitions
armish Jul 12, 2017
7bc42a7
fix Strelka's subfolder mod for convenience
armish Jul 17, 2017
db459fe
Merge `master` into `conda-all`
smondet Aug 17, 2017
5c836f9
Fix naming in type `python_version_type`
smondet Aug 18, 2017
7e10776
Fix Strelka's installation
smondet Aug 21, 2017
7d8153b
Fix `Vaxrank.move_vaxrank_product`
smondet Aug 21, 2017
73d52a7
Remove commented-out/obsolete Cibersort code
smondet Aug 21, 2017
b234f5b
Remove commented-out/obsolete CnvKit code
smondet Aug 21, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 5 additions & 13 deletions src/bfx_tools/hlarp.ml
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,8 @@ type hla_result = [
]

let run ~(run_with:Machine.t)
?(edges=[])
~(hla_result:hla_result)
~output_path ~extract_alleles ()
~output_path
=
let open KEDSL in
let open Ketrew_pure.Target.Volume in
Expand All @@ -25,17 +24,10 @@ let run ~(run_with:Machine.t)
~requirements:[`Self_identification ["hlarp"]]
Program.(
Machine.Tool.init hlarp
&& shf "hlarp %s %s > %s" subcommand hla_result_directory output_path
&& sh
(if extract_alleles
then sprintf
"awk -F , '{ gsub(/^[ \t]+|[ \t]+$/,\"\", $2); print $2}' %s \
| tail -n +2 \
| sed \"s/'//\" > %s.tmp \
&& mv %s.tmp %s"
output_path output_path output_path output_path
else "")) in
let edges = hla_result_dep :: edges @ [
&& shf "hlarp %s %s > %s" subcommand hla_result_directory output_path)
in
let edges = [
hla_result_dep;
depends_on (Machine.Tool.ensure hlarp);
on_failure_activate
(Workflow_utilities.Remove.file ~run_with output_path);
Expand Down
4 changes: 1 addition & 3 deletions src/bfx_tools/isovar.ml
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,7 @@ let run ~(run_with: Machine.t)
~configuration
~reference_build ~vcf ~bam ~output_file =
let open KEDSL in
let isovar_tool =
Machine.get_tool run_with Machine.Tool.Definition.(create "isovar")
in
let isovar_tool = Machine.get_tool run_with Machine.Tool.Default.isovar in
let genome = Machine.(get_reference_genome run_with reference_build)
|> Reference_genome.name
in
Expand Down
155 changes: 143 additions & 12 deletions src/bfx_tools/optitype.ml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

open Biokepi_run_environment
open Common

Expand All @@ -24,6 +23,24 @@ let move_optitype_product ?host ~path o =
method path = path
end

(** Shell-level path (it still contains the unexpanded [${CONDA_PREFIX}]
    variable) of the [share/<name>-<version>/] folder for the default
    OptiType tool definition. When that definition carries no version,
    the version part of the folder name is the glob [*]. *)
let get_optitype_data_folder =
  let optitype_def = Machine.Tool.Default.optitype in
  let name = Machine.Tool.Definition.get_name optitype_def in
  let version_or_glob =
    match Machine.Tool.Definition.get_version optitype_def with
    | Some version -> version
    | None -> "*"
  in
  sprintf "${CONDA_PREFIX}/share/%s-%s/" name version_or_glob

(** Program that stages OptiType's inputs in the current directory: it copies
    the bundled [data] folder, copies [config.ini.example] to [config.ini],
    and then rewrites the placeholder razers3 path in that config to the
    output of [which razers3]. *)
let prepare_optidata =
  let source_dir = get_optitype_data_folder in
  (* HLA reference data *)
  let copy_reference_data = KEDSL.Program.shf "cp -r %s/data ." source_dir in
  (* sample configuration, to be adjusted below *)
  let copy_sample_config =
    KEDSL.Program.shf "cp -r %s/config.ini.example config.ini" source_dir
  in
  (* adjust the razers3 path in the copied config *)
  let point_config_at_razers3 =
    KEDSL.Program.sh
      "sed -i.bak \"s|\\/path\\/to\\/razers3|$(which razers3)|g\" config.ini"
  in
  KEDSL.Program.(
    copy_reference_data && copy_sample_config && point_config_at_razers3)


(**
Run OptiType in [`RNA] or [`DNA] mode.

Expand All @@ -43,17 +60,13 @@ let hla_type ~work_dir ~run_with ~fastq ~run_name nt
Machine.Tool.init tool
&& exec ["mkdir"; "-p"; work_dir]
&& exec ["cd"; work_dir]
&& sh "cp -r ${OPTITYPE_DATA}/data ." (* HLA reference data *)
&& (* config example *)
sh "cp -r ${OPTITYPE_DATA}/config.ini.example config.ini"
&& (* adjust config razers3 path *)
sh "sed -i.bak \"s|\\/path\\/to\\/razers3|$(which razers3)|g\" config.ini"
&&
shf "OptiTypePipeline --verbose --input %s %s %s -o %s "
(Filename.quote r1_path)
(Option.value_map ~default:"" r2_path_opt ~f:Filename.quote)
(match nt with | `DNA -> "--dna" | `RNA -> "--rna")
run_name)
&& prepare_optidata
&& shf "OptiTypePipeline.py --verbose -c ./config.ini \
--input %s %s %s -o %s"
(Filename.quote r1_path)
(Option.value_map ~default:"" r2_path_opt ~f:Filename.quote)
(match nt with | `DNA -> "--dna" | `RNA -> "--rna")
run_name)
in
let product =
let host = Machine.as_host run_with in
Expand All @@ -77,3 +90,121 @@ let hla_type ~work_dir ~run_with ~fastq ~run_name nt
(Workflow_utilities.Remove.directory ~run_with work_dir);
]
)

(*
Optitype depends on aligning reads onto the HLA locus
as a preliminary filtering step, but the default aligner, razers3,
is so memory-hungry that the run fails on a ~50G memory machine
when the number of reads per FASTQ exceeds approximately 200M.

The following variation of the optitype run makes use of `bwa mem` instead
of `razers3` to do the initial filtering; some experimentation
with real patient data showed that this doesn't affect the results,
despite their worrisome warning on the site.

This is only tested on the DNA arm of the pipeline, so it is restricted
to that use case.
*)
(** [dna_hla_type_with_bwamem ?configuration ~work_dir ~run_with ~fastq
    ~run_name] runs OptiType in [`DNA] mode on reads pre-filtered with
    [bwa mem].

    Three chained workflow nodes are built:
    - index OptiType's DNA HLA reference with [bwa index];
    - align each input FASTQ against that reference and keep only the
      mapped reads ([samtools fastq -F4]), writing them under
      [work_dir/bwamem];
    - call [hla_type] on the filtered FASTQ(s), using [work_dir/optitype]
      as its work directory.

    @param configuration supplies the gap-open, gap-extension and mismatch
    penalties passed to [bwa mem] (defaults to
    [Bwa.Configuration.Mem.default]).
    @param work_dir parent folder of the [bwamem] and [optitype] folders.
    @param run_with machine providing the tools, processor count and host.
    @param fastq node whose product's [paths] gives the first read file and
    an optional second one.
    @param run_name used in the filtering node's name and passed on to
    [hla_type]. *)
let dna_hla_type_with_bwamem
    ?(configuration = Bwa.Configuration.Mem.default)
    ~work_dir ~run_with ~fastq ~run_name
  =
  let open KEDSL in
  (* We need to pull in bwa mem and samtools to get some help *)
  let optitype = Machine.get_tool run_with Machine.Tool.Default.optitype in
  let bwa = Machine.get_tool run_with Machine.Tool.Default.bwa in
  let samtools = Machine.get_tool run_with Machine.Tool.Default.samtools in
  let bwa_wd = work_dir // "bwamem" in
  (* This path is deliberately left unquoted wherever it is used: it embeds
     ${CONDA_PREFIX} (and possibly a glob for the version), which the shell
     has to expand. *)
  let dna_hla_ref_path =
    get_optitype_data_folder // "data/hla_reference_dna.fasta"
  in
  (* Step 1: prepare hla reference indexes *)
  let index_hla_wf =
    let name = "Index OptiType's DNA-based HLA reference with bwa" in
    let edges = [
      depends_on (Machine.Tool.ensure bwa);
      depends_on (Machine.Tool.ensure optitype);
      on_failure_activate
        (Workflow_utilities.Remove.directory ~run_with bwa_wd);
    ]
    in
    let make =
      Machine.run_big_program run_with ~name
        ~self_ids:["optitype"; "hla"; "dna"; "bwa index"]
        Program.(
          (* each tool is initialized exactly once *)
          Machine.Tool.init optitype
          && Machine.Tool.init bwa
          && shf "mkdir -p %s" (Filename.quote bwa_wd)
          && shf "bwa index %s" dna_hla_ref_path
        )
    in
    let product =
      Workflow_utilities.Variable_tool_paths.single_file
        ~run_with ~tool:optitype (dna_hla_ref_path ^ ".bwt")
    in
    workflow_node product ~name ~make ~edges
  in
  (* Step 2: Map the input fastq to this pseudo reference genome via
     `bwa mem` and turn the alignment back into fastq while keeping
     only the reads that mapped to reduce OptiType's future memory
     consumption.
  *)
  let filter_hla_reads_wf =
    let name = sprintf "Filter out non-HLA-mapping reads: %s" run_name in
    let edges = [
      depends_on (Machine.Tool.ensure bwa);
      depends_on (Machine.Tool.ensure samtools);
      depends_on index_hla_wf; (* No indexing, no mapping *)
      depends_on fastq;
      on_failure_activate
        (Workflow_utilities.Remove.directory ~run_with bwa_wd);
    ]
    in
    (* Builds the shell pipeline for one FASTQ and returns it together with
       the path of the filtered FASTQ it writes. *)
    let bwa2sam2fastq fqpath =
      let outfq_path =
        let fqbase = fqpath |> Filename.basename |> Filename.chop_extension in
        bwa_wd // (sprintf "%s-hla_mapping.fastq" fqbase)
      in
      let processors = Machine.max_processors run_with in
      let bwamem_part = Bwa.(
          sprintf "bwa mem -t %d -O %d -E %d -B %d %s %s"
            processors
            configuration.Configuration.Mem.gap_open_penalty
            configuration.Configuration.Mem.gap_extension_penalty
            configuration.Configuration.Mem.mismatch_penalty
            dna_hla_ref_path
            (Filename.quote fqpath))
      in
      let samtools_part =
        (* -F4 filters *out* all reads that do not map.
           See SAM/BAM flags for more information:
           $ samtools flags
           The output path is quoted just like the input path so that a
           work directory with spaces or shell metacharacters still works.
        *)
        sprintf "samtools fastq -F4 - -0 %s" (Filename.quote outfq_path)
      in
      String.concat ~sep:" | " [bwamem_part; samtools_part],
      outfq_path
    in
    let in_r1, in_r2_opt = fastq#product#paths in
    let filter_r1, out_r1 = bwa2sam2fastq in_r1 in
    let filter_r2, out_r2 =
      match in_r2_opt with
      | None -> "echo 'Second pair is missing'", None
      | Some r2p -> let (f, o) = bwa2sam2fastq r2p in (f, Some o)
    in
    let make =
      Machine.run_big_program run_with ~name
        ~self_ids:["optitype"; "hla"; "dna"; "filtering"]
        Program.(
          Machine.Tool.init optitype
          && Machine.Tool.init bwa
          && Machine.Tool.init samtools
          && shf "mkdir -p %s" (Filename.quote bwa_wd)
          && sh filter_r1 && sh filter_r2
        )
    in
    let product = transform_fastq_reads fastq#product out_r1 out_r2 in
    workflow_node product ~name ~make ~edges
  in
  (* Step 3: Run OptiType as usual on the new filtered down FASTQ(s) *)
  let owd = work_dir // "optitype" in
  hla_type ~work_dir:owd ~run_with ~fastq:filter_hla_reads_wf ~run_name `DNA
Loading