diff --git a/src/bfx_tools/hlarp.ml b/src/bfx_tools/hlarp.ml index dbccb20..27b1980 100644 --- a/src/bfx_tools/hlarp.ml +++ b/src/bfx_tools/hlarp.ml @@ -7,9 +7,8 @@ type hla_result = [ ] let run ~(run_with:Machine.t) - ?(edges=[]) ~(hla_result:hla_result) - ~output_path ~extract_alleles () + ~output_path = let open KEDSL in let open Ketrew_pure.Target.Volume in @@ -25,17 +24,10 @@ let run ~(run_with:Machine.t) ~requirements:[`Self_identification ["hlarp"]] Program.( Machine.Tool.init hlarp - && shf "hlarp %s %s > %s" subcommand hla_result_directory output_path - && sh - (if extract_alleles - then sprintf - "awk -F , '{ gsub(/^[ \t]+|[ \t]+$/,\"\", $2); print $2}' %s \ - | tail -n +2 \ - | sed \"s/'//\" > %s.tmp \ - && mv %s.tmp %s" - output_path output_path output_path output_path - else "")) in - let edges = hla_result_dep :: edges @ [ + && shf "hlarp %s %s > %s" subcommand hla_result_directory output_path) + in + let edges = [ + hla_result_dep; depends_on (Machine.Tool.ensure hlarp); on_failure_activate (Workflow_utilities.Remove.file ~run_with output_path); diff --git a/src/bfx_tools/isovar.ml b/src/bfx_tools/isovar.ml index 92f0d79..b37d8c1 100644 --- a/src/bfx_tools/isovar.ml +++ b/src/bfx_tools/isovar.ml @@ -30,9 +30,7 @@ let run ~(run_with: Machine.t) ~configuration ~reference_build ~vcf ~bam ~output_file = let open KEDSL in - let isovar_tool = - Machine.get_tool run_with Machine.Tool.Definition.(create "isovar") - in + let isovar_tool = Machine.get_tool run_with Machine.Tool.Default.isovar in let genome = Machine.(get_reference_genome run_with reference_build) |> Reference_genome.name in diff --git a/src/bfx_tools/optitype.ml b/src/bfx_tools/optitype.ml index 4c4d5b3..f30ae9b 100644 --- a/src/bfx_tools/optitype.ml +++ b/src/bfx_tools/optitype.ml @@ -1,4 +1,3 @@ - open Biokepi_run_environment open Common @@ -24,6 +23,24 @@ let move_optitype_product ?host ~path o = method path = path end +let get_optitype_data_folder = + let tname, tversion = + let tool_def = Machine.Tool.Default.optitype in + Machine.Tool.Definition.(get_name tool_def, get_version tool_def) + in + sprintf "${CONDA_PREFIX}/share/%s-%s/" + tname (match tversion with None -> "*" | Some v -> v) + +(* copy sample config file and the required data over; + then, adjust the razers3 path in the config *) +let prepare_optidata = + let open KEDSL.Program in + let optidata_path = get_optitype_data_folder in + shf "cp -r %s/data ." optidata_path && (* HLA reference data *) + shf "cp -r %s/config.ini.example config.ini" optidata_path && + sh "sed -i.bak \"s|\\/path\\/to\\/razers3|$(which razers3)|g\" config.ini" + + (** Run OptiType in [`RNA] or [`DNA] mode. @@ -43,17 +60,13 @@ let hla_type ~work_dir ~run_with ~fastq ~run_name nt Machine.Tool.init tool && exec ["mkdir"; "-p"; work_dir] && exec ["cd"; work_dir] - && sh "cp -r ${OPTITYPE_DATA}/data ." (* HLA reference data *) - && (* config example *) - sh "cp -r ${OPTITYPE_DATA}/config.ini.example config.ini" - && (* adjust config razers3 path *) - sh "sed -i.bak \"s|\\/path\\/to\\/razers3|$(which razers3)|g\" config.ini" - && - shf "OptiTypePipeline --verbose --input %s %s %s -o %s " - (Filename.quote r1_path) - (Option.value_map ~default:"" r2_path_opt ~f:Filename.quote) - (match nt with | `DNA -> "--dna" | `RNA -> "--rna") - run_name) + && prepare_optidata + && shf "OptiTypePipeline.py --verbose -c ./config.ini \ + --input %s %s %s -o %s" + (Filename.quote r1_path) + (Option.value_map ~default:"" r2_path_opt ~f:Filename.quote) + (match nt with | `DNA -> "--dna" | `RNA -> "--rna") + run_name) in let product = let host = Machine.as_host run_with in @@ -77,3 +90,121 @@ let hla_type ~work_dir ~run_with ~fastq ~run_name nt (Workflow_utilities.Remove.directory ~run_with work_dir); ] ) + +(* + Optitype depends on alignment of reads onto the HLA-locus + as a preliminary filtering step, but the default aligner, razers3, + is so memory hungry that, the run fails on a ~50G memory machine + when the number of reads per FASTQ is approximately more than 200M. + + The following variation of optitype run makes use of `bwa mem` instead + of `razers3` to do the initial filtering and some experimentation + with real patient data proved that this doesn't affect the results + despite their worrisome warning on the site. + + This is only tested on the DNA arm of the pipeline, so is restricted + to that use case. +*) +let dna_hla_type_with_bwamem + ?(configuration = Bwa.Configuration.Mem.default) + ~work_dir ~run_with ~fastq ~run_name + = + let open KEDSL in + (* We need to pull in bwa mem and samtools to get some help *) + let optitype = Machine.get_tool run_with Machine.Tool.Default.optitype in + let bwa = Machine.get_tool run_with Machine.Tool.Default.bwa in + let samtools = Machine.get_tool run_with Machine.Tool.Default.samtools in + let bwa_wd = work_dir // "bwamem" in + let dna_hla_ref_path = + get_optitype_data_folder // "data/hla_reference_dna.fasta" + in + (* Step 1: prepare hla reference indexes *) + let index_hla_wf = + let name = "Index OptiType's DNA-based HLA reference with bwa" in + let edges = [ + depends_on (Machine.Tool.ensure bwa); + depends_on (Machine.Tool.ensure optitype); + on_failure_activate + (Workflow_utilities.Remove.directory ~run_with bwa_wd);] + in + let make = + Machine.run_big_program run_with ~name + ~self_ids:["optitype"; "hla"; "dna"; "bwa index"] + Program.( + Machine.Tool.init optitype + && Machine.Tool.init bwa + && Machine.Tool.init optitype + && shf "mkdir -p %s" bwa_wd + && shf "bwa index %s" dna_hla_ref_path + ) + in + let product = + Workflow_utilities.Variable_tool_paths.single_file + ~run_with ~tool:optitype (dna_hla_ref_path ^ ".bwt") + in + workflow_node product ~name ~make ~edges + in + (* Step 2: Map the input fastq to this pseudo reference genome via + `bwa mem` and turn the alignment back into fastq while keeping + only the reads that mapped to reduce OptiType's future memory + consumption. + *) + let filter_hla_reads_wf = + let name = sprintf "Filter out non-HLA-mapping reads: %s" run_name in + let edges = [ + depends_on (Machine.Tool.ensure bwa); + depends_on (Machine.Tool.ensure samtools); + depends_on index_hla_wf; (* No indexing, no mapping *) + depends_on fastq; + on_failure_activate + (Workflow_utilities.Remove.directory ~run_with bwa_wd);] + in + let bwa2sam2fastq fqpath = (* the whole pipeline *) + let outfq_path = + let fqbase = fqpath |> Filename.basename |> Filename.chop_extension in + bwa_wd // (sprintf "%s-hla_mapping.fastq" fqbase) + in + let processors = Machine.max_processors run_with in + let bwamem_part = Bwa.( + sprintf "bwa mem -t %d -O %d -E %d -B %d %s %s" + processors + configuration.Configuration.Mem.gap_open_penalty + configuration.Configuration.Mem.gap_extension_penalty + configuration.Configuration.Mem.mismatch_penalty + dna_hla_ref_path + (Filename.quote fqpath)) + in + let samtools_part = + (* -F4 filters *out* all reads that do not map. + See SAM/BAM flags for more information: + $ samtools flags + *) + sprintf "samtools fastq -F4 - -0 %s" outfq_path + in + String.concat ~sep:" | " [bwamem_part; samtools_part], + outfq_path + in + let in_r1, in_r2_opt = fastq#product#paths in + let filter_r1, out_r1 = bwa2sam2fastq in_r1 in + let filter_r2, out_r2 = + match in_r2_opt with + | None -> "echo 'Second pair is missing'", None + | Some r2p -> let (f, o) = bwa2sam2fastq r2p in (f, Some o) + in + let make = + Machine.run_big_program run_with ~name + ~self_ids:["optitype"; "hla"; "dna"; "filtering"] + Program.( + Machine.Tool.init optitype + && Machine.Tool.init bwa + && Machine.Tool.init samtools + && shf "mkdir -p %s" bwa_wd + && sh filter_r1 && sh filter_r2 + ) + in + let product = transform_fastq_reads fastq#product out_r1 out_r2 in + workflow_node product ~name ~make ~edges + in + (* Step 3: Run OptiType as usual on the new filtered down FASTQ(s) *) + let owd = work_dir // "optitype" in + hla_type ~work_dir:owd ~run_with ~fastq:filter_hla_reads_wf ~run_name `DNA \ No newline at end of file diff --git a/src/bfx_tools/picard.ml b/src/bfx_tools/picard.ml index 763aced..9df4b5b 100644 --- a/src/bfx_tools/picard.ml +++ b/src/bfx_tools/picard.ml @@ -7,14 +7,14 @@ module Remove = Workflow_utilities.Remove let create_dict ~(run_with:Machine.t) fasta = let open KEDSL in - let picard_create_dict = - Machine.get_tool run_with Machine.Tool.Default.picard in + let picard = Machine.get_tool run_with Machine.Tool.Default.picard in let src = fasta#product#path in let dest = sprintf "%s.%s" (Filename.chop_suffix src ".fasta") "dict" in - let program = - Program.(Machine.Tool.(init picard_create_dict) && - shf "java -jar $PICARD_JAR CreateSequenceDictionary R= %s O= %s" - (Filename.quote src) (Filename.quote dest)) in + let program = Program.( + Machine.Tool.(init picard) && + shf "picard CreateSequenceDictionary R= %s O= %s" + (Filename.quote src) (Filename.quote dest)) + in let name = sprintf "picard-create-dict-%s" Filename.(basename src) in let make = Machine.run_stream_processor run_with program ~name @@ -22,7 +22,7 @@ let create_dict ~(run_with:Machine.t) fasta = let host = Machine.(as_host run_with) in workflow_node (single_file dest ~host) ~name ~make ~edges:[ - depends_on fasta; depends_on Machine.Tool.(ensure picard_create_dict); + depends_on fasta; depends_on Machine.Tool.(ensure picard); on_failure_activate (Remove.file ~run_with dest); ] @@ -54,11 +54,10 @@ let sort_vcf ~(run_with:Machine.t) ?(sequence_dict) input_vcf = | None -> [] | Some d -> [depends_on d] in - let program = - Program.(Machine.Tool.(init picard) && - (shf "java -jar $PICARD_JAR SortVcf %s I= %s O= %s" - sequence_dict_opt - (Filename.quote src) (Filename.quote dest))) + let program = Program.( + Machine.Tool.(init picard) && + shf "picard SortVcf %s I= %s O= %s" + sequence_dict_opt (Filename.quote src) (Filename.quote dest)) in let host = Machine.(as_host run_with) in let make = @@ -101,7 +100,7 @@ module Mark_duplicates_settings = struct MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=%d \ MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=%d \ SORTING_COLLECTION_SIZE_RATIO=%f \ - java %s -Djava.io.tmpdir=%s " + picard %s -Djava.io.tmpdir=%s " tmp_dir t.max_sequences_for_disk_read_ends_map t.max_file_handles_for_read_ends_map @@ -117,24 +116,25 @@ let mark_duplicates ?(settings = Mark_duplicates_settings.default) ~(run_with: Machine.t) ~input_bam output_bam_path = let open KEDSL in - let picard_jar = Machine.get_tool run_with Machine.Tool.Default.picard in + let picard = Machine.get_tool run_with Machine.Tool.Default.picard in let metrics_path = sprintf "%s.%s" (Filename.chop_suffix output_bam_path ".bam") ".metrics" in let sorted_bam = Samtools.sort_bam_if_necessary ~run_with input_bam ~by:`Coordinate in - let program = - let java_call = - let default_tmp_dir = - Filename.chop_extension output_bam_path ^ ".tmpdir" in - Mark_duplicates_settings.to_java_shell_call ~default_tmp_dir settings in - Program.(Machine.Tool.(init picard_jar) && - shf "%s -jar $PICARD_JAR MarkDuplicates \ - VALIDATION_STRINGENCY=LENIENT \ - INPUT=%s OUTPUT=%s METRICS_FILE=%s" - java_call - (Filename.quote sorted_bam#product#path) - (Filename.quote output_bam_path) - metrics_path) in + let default_tmp_dir = Filename.chop_extension output_bam_path ^ ".tmpdir" in + let java_call = + Mark_duplicates_settings.to_java_shell_call ~default_tmp_dir settings + in + let program = Program.( + Machine.Tool.(init picard) && + shf "%s MarkDuplicates \ + VALIDATION_STRINGENCY=LENIENT \ + INPUT=%s OUTPUT=%s METRICS_FILE=%s" + java_call + (Filename.quote sorted_bam#product#path) + (Filename.quote output_bam_path) + metrics_path) + in let name = sprintf "picard-markdups-%s" Filename.(basename input_bam#product#path) in let make = @@ -145,7 +145,7 @@ let mark_duplicates ~name ~make ~edges:[ depends_on sorted_bam; - depends_on Machine.Tool.(ensure picard_jar); + depends_on Machine.Tool.(ensure picard); on_failure_activate (Remove.file ~run_with output_bam_path); on_failure_activate (Remove.file ~run_with metrics_path); ] @@ -155,7 +155,7 @@ let reorder_sam ?reference_build ~input_bam output_bam_path = let open KEDSL in - let picard_jar = Machine.get_tool run_with Machine.Tool.Default.picard in + let picard = Machine.get_tool run_with Machine.Tool.Default.picard in let tmp_dir = Filename.chop_extension output_bam_path ^ ".tmpdir" in let input_bam_path = input_bam#product#path in @@ -176,11 +176,9 @@ let reorder_sam | Some m -> sprintf "-Xmx%s" m in Program.( - (Machine.Tool.init picard_jar) && - shf "java %s \ - -Djava.io.tmpdir=%s \ - -jar $PICARD_JAR ReorderSam \ - I=%s O=%s R=%s" + (Machine.Tool.init picard) && + shf "picard %s -Djava.io.tmpdir=%s \ + ReorderSam I=%s O=%s R=%s" mem tmp_dir (Filename.quote input_bam_path) @@ -198,7 +196,7 @@ let reorder_sam ~name ~make ~edges:[ depends_on input_bam; depends_on fasta; - depends_on Machine.Tool.(ensure picard_jar); + depends_on Machine.Tool.(ensure picard); on_failure_activate (Remove.file ~run_with output_bam_path); ] @@ -221,12 +219,12 @@ let bam_to_fastq ~run_with ~sample_type ~output_prefix input_bam = let r1 = sprintf "%s.fastq" output_prefix in ([sprintf "FASTQ=%s" r1], r1, None) in - let picard_jar = Machine.get_tool run_with Machine.Tool.Default.picard in + let picard = Machine.get_tool run_with Machine.Tool.Default.picard in let program = Program.( - Machine.Tool.(init picard_jar) && + Machine.Tool.(init picard) && shf "mkdir -p %s" (r1 |> Filename.dirname |> Filename.quote) - && shf "java -jar $PICARD_JAR SamToFastq INPUT=%s %s" + && shf "picard SamToFastq INPUT=%s %s" (Filename.quote sorted_bam#product#path) (String.concat ~sep:" " fastq_output_options) ) in @@ -242,7 +240,7 @@ let bam_to_fastq ~run_with ~sample_type ~output_prefix input_bam = | None -> list) [ depends_on sorted_bam; - depends_on Machine.Tool.(ensure picard_jar); + depends_on Machine.Tool.(ensure picard); on_failure_activate (Remove.file ~run_with r1); ] in @@ -252,14 +250,14 @@ let bam_to_fastq ~run_with ~sample_type ~output_prefix input_bam = let clean_bam ~run_with input_bam output_bam_path = let open KEDSL in - let picard_jar = Machine.get_tool run_with Machine.Tool.Default.picard in + let picard = Machine.get_tool run_with Machine.Tool.Default.picard in let input_path = input_bam#product#path in let name = sprintf "picard-cleansam-%s" Filename.(basename input_path) in workflow_node (transform_bam input_bam#product output_bam_path) ~name ~edges:([ depends_on input_bam; - depends_on Machine.Tool.(ensure picard_jar); + depends_on Machine.Tool.(ensure picard); on_failure_activate (Remove.file ~run_with output_bam_path); ]) ~make:( @@ -267,8 +265,8 @@ let clean_bam ~run_with input_bam output_bam_path = ~name run_with Program.( - Machine.Tool.(init picard_jar) && - shf "java -jar $PICARD_JAR CleanSam INPUT=%s OUTPUT=%s" + Machine.Tool.(init picard) && + shf "picard CleanSam INPUT=%s OUTPUT=%s" (Filename.quote input_path) (Filename.quote output_bam_path) ) diff --git a/src/bfx_tools/pyensembl.ml b/src/bfx_tools/pyensembl.ml index 083ee0b..0e64901 100644 --- a/src/bfx_tools/pyensembl.ml +++ b/src/bfx_tools/pyensembl.ml @@ -18,9 +18,7 @@ let set_cache_dir_command ~(run_with: Machine.t) = let cache_genome ~(run_with: Machine.t) ~reference_build = let open KEDSL in - let pyensembl = - Machine.get_tool run_with Machine.Tool.Definition.(create "pyensembl") - in + let pyensembl = Machine.get_tool run_with Machine.Tool.Default.pyensembl in let genome = Machine.(get_reference_genome run_with reference_build) in let ensembl_release = genome |> Reference_genome.ensembl in let species = genome |> Reference_genome.species in diff --git a/src/bfx_tools/seq2HLA.ml b/src/bfx_tools/seq2HLA.ml index 22368a8..869349e 100644 --- a/src/bfx_tools/seq2HLA.ml +++ b/src/bfx_tools/seq2HLA.ml @@ -39,10 +39,6 @@ let move_seq2hla_product ?host ~path s = let hla_type ~work_dir ~run_with ~r1 ~r2 ~run_name : product KEDSL.workflow_node = let tool = Machine.get_tool run_with Machine.Tool.Default.seq2hla in - (* Why quote this here? Seems like it easy to create a bug, - why not enforce this at node construction ?*) - let r1pt = Filename.quote r1#product#path in - let r2pt = Filename.quote r2#product#path in let name = sprintf "seq2HLA-%s" run_name in let host = Machine.as_host run_with in let processors = Machine.max_processors run_with in @@ -52,7 +48,9 @@ let hla_type && exec ["mkdir"; "-p"; work_dir] && exec ["cd"; work_dir] && shf "seq2HLA -1 %s -2 %s -r %s -p %d" - r1pt r2pt run_name processors) + (Filename.quote r1#product#path) + (Filename.quote r2#product#path) + run_name processors) in let class1_path = work_dir // (sprintf "%s-ClassI.HLAgenotype4digits" run_name) in diff --git a/src/bfx_tools/seqtk.ml b/src/bfx_tools/seqtk.ml new file mode 100644 index 0000000..83759c2 --- /dev/null +++ b/src/bfx_tools/seqtk.ml @@ -0,0 +1,56 @@ +open Biokepi_run_environment +open Common + +module Remove = Workflow_utilities.Remove + +(* To convert (old) 64-based phred scores to (new) 33-based ones. *) +let shift_phred_scores ~(run_with:Machine.t) + ?(output_postfix=".phred33.fastq") + ~input_fastq + ~output_folder + = + let open KEDSL in + let seqtk = Machine.get_tool run_with Machine.Tool.Default.seqtk in + let output_path rpath = + let fqr_basename = + rpath |> Filename.basename |> Filename.chop_extension + in + output_folder // (sprintf "%s%s" fqr_basename output_postfix) + in + let r1_path, r2_path_opt = input_fastq#product#paths in + let cmds = + let shift_cmd rpath = + sprintf "seqtk seq -VQ64 %s > %s" rpath (output_path rpath) + in + match r2_path_opt with + | Some r2_path -> [ shift_cmd r1_path; shift_cmd r2_path; ] + | None -> [shift_cmd r1_path;] + in + let out_fastq = + transform_fastq_reads + input_fastq#product + (output_path r1_path) + (Option.map ~f:output_path r2_path_opt) + in + let name = + sprintf "PHRED 64->31: %s" input_fastq#product#sample_name + in + let make = + Machine.(run_program run_with + ~requirements:[ + `Self_identification ["seqtk:seq"; "fastq"; "phred_fix"; ] + ] + Program.( + Tool.init seqtk + && shf "mkdir -p %s" output_folder + && chain (List.map ~f:sh cmds) + ) + ) + in + workflow_node ~name ~make out_fastq + ~edges: [ + on_failure_activate (Remove.directory ~run_with output_folder); + depends_on (Machine.Tool.ensure seqtk); + depends_on input_fastq; + ] + diff --git a/src/bfx_tools/snpeff.ml b/src/bfx_tools/snpeff.ml new file mode 100644 index 0000000..9cdb9c0 --- /dev/null +++ b/src/bfx_tools/snpeff.ml @@ -0,0 +1,115 @@ +open Biokepi_run_environment +open Common + +module Remove = Workflow_utilities.Remove + +let get_snpeff_db_name ~run_with reference_build = + Machine.(get_reference_genome run_with reference_build) + |> Reference_genome.snpeff_name_exn + +let get_snpeff_data_folder ~run_with = + let snpeff = Machine.Tool.Default.snpeff in + let tname, tversion = + Machine.Tool.Definition.(get_name snpeff, get_version snpeff) + in + sprintf "${CONDA_PREFIX}/share/%s-%s/data" + tname (match tversion with None -> "*" | Some v -> v) + +(* Note the 'E' in caps in the name! + Also we are setting -Xms because the default is too small + for almost all genomes. This will make annotations faster. + *) +let call_snpeff fmt = sprintf ("%s " ^^ fmt) ("snpEff -Xms4g") + +let prepare_annotation ~(run_with:Machine.t) ~reference_build = + let open KEDSL in + let snpeff = Machine.get_tool run_with Machine.Tool.Default.snpeff in + let dbname = get_snpeff_db_name ~run_with reference_build in + let data_path = get_snpeff_data_folder ~run_with in + let genome_data_path = data_path // dbname in + let witness_path = genome_data_path // "snpEffectPredictor.bin" in + let product = + Workflow_utilities.Variable_tool_paths.single_file + ~run_with ~tool:snpeff witness_path + in + let name = sprintf "Preparing snpEFF DB for %s" reference_build in + let make = + Machine.(run_program run_with + ~requirements:[ + `Self_identification ["annotation"; "prep"; "snpeff"]; + `Internet_access; ] + Program.( + Machine.Tool.init snpeff && + sh (call_snpeff "download %s" dbname) + ) + ) + in + workflow_node ~name ~make product + ~edges: [ + on_failure_activate (Remove.directory ~run_with genome_data_path); + depends_on (Machine.Tool.ensure snpeff); + ] + +type snpeff_product = < + host : Ketrew_pure.Host.t; + is_done : Ketrew_pure.Target.Condition.t option; + csv_stats : KEDSL.single_file; + html_stats : KEDSL.single_file; + workdir_path : string; + annotated_vcf : KEDSL.vcf_file; + path : string; > + +let annotate ~(run_with:Machine.t) ~reference_build ~input_vcf ~out_folder = + let open KEDSL in + let snpeff = Machine.get_tool run_with Machine.Tool.Default.snpeff in + let host = Machine.as_host run_with in + let in_vcf_path = input_vcf#product#path in + let in_basename = (Filename.basename in_vcf_path) in + let in_simplename = (Filename.chop_extension in_basename) in + let out_path suffix = + out_folder // (sprintf "%s-snpeff%s" in_simplename suffix) + in + let annotated_vcf = transform_vcf input_vcf#product ~path:(out_path ".vcf") in + let out_vcf = annotated_vcf#as_single_file in + let out_csv = single_file ~host (out_path ".csv") in + let out_html = single_file ~host (out_path".html") in + let out_product = + let sf_products = [out_vcf; out_csv; out_html;] in + let all_done = List.filter_map ~f:(fun f -> f#is_done) sf_products in + object + method host = host + method is_done = Some (`And all_done) + method csv_stats = out_csv + method html_stats = out_html + method annotated_vcf = annotated_vcf + method workdir_path = out_folder + (* Annotated VCF will often be the only thing + that we care about, hence the path points to that *) + method path = annotated_vcf#path + end + in + let dbname = get_snpeff_db_name ~run_with reference_build in + let name = sprintf "snpEff: annotate %s against %s" in_basename dbname in + let annotate_cmd = + call_snpeff "eff -nodownload -s %s -csvStats %s %s %s > %s" + out_html#path out_csv#path dbname in_vcf_path out_vcf#path + in + let make = + Machine.(run_program run_with + ~requirements:[ + `Self_identification ["snpEff"; "annotate"; "vcf";] + ] + Program.( + Tool.init snpeff + && shf "mkdir -p %s" out_folder + && sh annotate_cmd + ) + ) + in + workflow_node ~name ~make out_product + ~edges: [ + on_failure_activate (Remove.directory ~run_with out_folder); + depends_on (Machine.Tool.ensure snpeff); + depends_on input_vcf; + depends_on (prepare_annotation ~run_with ~reference_build) + ] diff --git a/src/bfx_tools/topiary.ml b/src/bfx_tools/topiary.ml index 6817db5..9aa33aa 100644 --- a/src/bfx_tools/topiary.ml +++ b/src/bfx_tools/topiary.ml @@ -6,51 +6,6 @@ type output_type = [ | `CSV of string ] -type predictor_type = [ - | `NetMHC - | `NetMHCpan - | `NetMHCIIpan - | `NetMHCcons - | `Random - | `SMM - | `SMM_PMBEC - | `NetMHCpan_IEDB - | `NetMHCcons_IEDB - | `SMM_IEDB - | `SMM_PMBEC_IEDB -] - -let predictor_to_string = function - | `NetMHC -> "netmhc" - | `NetMHCpan -> "netmhcpan" - | `NetMHCIIpan -> "netmhciipan" - | `NetMHCcons -> "netmhccons" - | `Random -> "random" - | `SMM -> "smm" - | `SMM_PMBEC -> "smm-pmbec" - | `NetMHCpan_IEDB -> "netmhcpan-iedb" - | `NetMHCcons_IEDB -> "netmhccons-iedb" - | `SMM_IEDB -> "smm-iedb" - | `SMM_PMBEC_IEDB -> "smm-pmbec-iedb" - -let predictor_to_tool ~run_with predictor = - let get_tool t = - let tool = - Machine.get_tool - run_with - Machine.Tool.Definition.(create t) - in - let ensure = Machine.Tool.(ensure tool) in - let init = Machine.Tool.(init tool) in - (ensure, init) - in - match predictor with - | `NetMHC -> Some (get_tool "netMHC") - | `NetMHCpan -> Some (get_tool "netMHCpan") - | `NetMHCIIpan -> Some (get_tool "netMHCIIpan") - | `NetMHCcons -> Some (get_tool "netMHCcons") - | _ -> None - module Configuration = struct type t = { @@ -150,9 +105,8 @@ let run ~(run_with: Machine.t) ~alleles_file ~output = let open KEDSL in - let topiary = - Machine.get_tool run_with Machine.Tool.Definition.(create "topiary") - in + let open Hla_utilities in + let topiary = Machine.get_tool run_with Machine.Tool.Default.topiary in let predictor_tool = predictor_to_tool ~run_with predictor in let (predictor_edges, predictor_init) = match predictor_tool with diff --git a/src/bfx_tools/vaxrank.ml b/src/bfx_tools/vaxrank.ml index 52456cc..6f0bf33 100644 --- a/src/bfx_tools/vaxrank.ml +++ b/src/bfx_tools/vaxrank.ml @@ -25,6 +25,7 @@ module Configuration = struct xlsx_report: bool; pdf_report: bool; ascii_report: bool; + debug_log: string; (* the rest *) parameters: (string * string) list; } @@ -46,6 +47,7 @@ module Configuration = struct xlsx_report; pdf_report; ascii_report; + debug_log; parameters}: Yojson.Basic.json = `Assoc ([ @@ -66,6 +68,7 @@ module Configuration = struct "ascii_report", `Bool ascii_report; "pdf_report", `Bool pdf_report; "xlsx_report", `Bool xlsx_report; + "debug_log", `String debug_log; "parameters", `Assoc (List.map parameters ~f:(fun (a, b) -> a, `String b)); ] @@ -92,6 +95,7 @@ module Configuration = struct xlsx_report; pdf_report; ascii_report; + debug_log; parameters} = let soi = string_of_int in @@ -141,6 +145,7 @@ module Configuration = struct xlsx_report = false; pdf_report = false; ascii_report = true; + debug_log = "vaxrank-debug.log"; parameters = []} let name t = t.name end @@ -151,9 +156,11 @@ type product = < ascii_report_path : string option; xlsx_report_path: string option; pdf_report_path: string option; - output_folder_path: string > + debug_log_path: string; + output_folder_path: string; +> -let move_vaxrank_product ?host ~output_folder_path vp = +let move_vaxrank_product ?host ~output_folder_path (vp : product) : product = let open KEDSL in let open Option in let host = match host with @@ -169,6 +176,7 @@ let move_vaxrank_product ?host ~output_folder_path vp = return (single_file ~host (sub p)) in let pdf_product = vp#pdf_report_path >>= fun p -> return (single_file ~host (sub p)) in + let debug_log_product = single_file ~host (sub vp#debug_log_path) in let opt_path p = p >>= fun p -> return (p#path) in object method host = host @@ -177,10 +185,12 @@ let move_vaxrank_product ?host ~output_folder_path vp = (List.filter_map ~f:(fun f -> let open Option in f >>= fun f -> f#is_done) - [ascii_product; xlsx_product; pdf_product])) + [ascii_product; xlsx_product; + pdf_product; Some debug_log_product])) method ascii_report_path = opt_path ascii_product method xlsx_report_path = opt_path xlsx_product method pdf_report_path = opt_path pdf_product + method debug_log_path = debug_log_product#path method output_folder_path = output_folder_path end @@ -194,13 +204,12 @@ let run ~(run_with: Machine.t) ~output_folder = let open KEDSL in + let open Hla_utilities in let host = Machine.(as_host run_with) in - let vaxrank = - Machine.get_tool run_with Machine.Tool.Definition.(create "vaxrank") - in + let vaxrank = Machine.get_tool run_with Machine.Tool.Default.vaxrank in let sorted_bam = Samtools.sort_bam_if_necessary ~run_with ~by:`Coordinate bam in - let predictor_tool = Topiary.(predictor_to_tool ~run_with predictor) in + let predictor_tool = Hla_utilities.(predictor_to_tool ~run_with predictor) in let (predictor_edges, predictor_init) = match predictor_tool with | Some (e, i) -> ([depends_on e;], i) @@ -209,7 +218,7 @@ let run ~(run_with: Machine.t) let vcfs_arg = List.concat_map vcfs ~f:(fun v -> ["--vcf"; v#product#path]) in let bam_arg = ["--bam"; sorted_bam#product#path] in let predictor_arg = - ["--mhc-predictor"; (Topiary.predictor_to_string predictor)] in + ["--mhc-predictor"; (predictor_to_string predictor)] in let allele_arg = ["--mhc-alleles-file"; alleles_file#product#path] in let output_prefix = output_folder // "vaxrank-result" in let output_of switch kind suffix = @@ -226,6 +235,10 @@ let run ~(run_with: Machine.t) output_of configuration.Configuration.xlsx_report "xlsx" "xlsx" in let pdf_arg, pdf_product = output_of configuration.Configuration.pdf_report "pdf" "pdf" in + let debug_log_arg, debug_log_product = + let log_path = output_folder // configuration.Configuration.debug_log in + ["--log-path"; log_path], (KEDSL.single_file ~host log_path) + in let () = match ascii_product, xlsx_product, pdf_product with | None, None, None -> @@ -234,7 +247,7 @@ let run ~(run_with: Machine.t) | _, _, _ -> () in let arguments = vcfs_arg @ bam_arg @ predictor_arg @ allele_arg (* input *) - @ xlsx_arg @ pdf_arg @ ascii_arg + @ xlsx_arg @ pdf_arg @ ascii_arg @debug_log_arg @ Configuration.render configuration (* other config *) in let name = "Vaxrank run" in @@ -247,10 +260,14 @@ let run ~(run_with: Machine.t) (List.filter_map ~f:(fun f -> let open Option in f >>= fun f -> f#is_done) - [ascii_product; xlsx_product; pdf_product])) + [ ascii_product; + xlsx_product; + pdf_product; + (Some debug_log_product); ])) method ascii_report_path = path_of ascii_product method xlsx_report_path = path_of xlsx_product method pdf_report_path = path_of pdf_product + method debug_log_path = debug_log_product#path method output_folder_path = output_folder end in diff --git a/src/bfx_tools/vcfannotatepolyphen.ml b/src/bfx_tools/vcfannotatepolyphen.ml index 3b84a9e..bd88221 100644 --- a/src/bfx_tools/vcfannotatepolyphen.ml +++ b/src/bfx_tools/vcfannotatepolyphen.ml @@ -3,9 +3,9 @@ open Common let run ~(run_with: Machine.t) ~reference_build ~vcf ~output_vcf = let open KEDSL in - let vap_tool = - Machine.get_tool run_with - Machine.Tool.Definition.(create "vcf-annotate-polyphen") in + let vap_tool = + Machine.get_tool run_with Machine.Tool.Default.vcfannotatepolyphen + in let whessdb = Machine.(get_reference_genome run_with reference_build) |> Reference_genome.whess_exn in diff --git a/src/environment_setup/bioconda.ml b/src/environment_setup/bioconda.ml new file mode 100644 index 0000000..fafbf5a --- /dev/null +++ b/src/environment_setup/bioconda.ml @@ -0,0 +1,51 @@ +open Biokepi_run_environment +open Common + +let create_bioconda_tool + ~host ~(run_program : Machine.Make_fun.t) ~install_path + ?check_bin tool_def = + let open KEDSL in + let name, version = + Machine.Tool.Definition.(get_name tool_def, get_version tool_def) + in + let conda_tool_def = + match version with + | None -> name + | Some v -> sprintf "%s=%s" name v + in + let main_subdir = name ^ "_conda_dir" in + let python_version = Conda.(`Python_tool_dependency conda_tool_def) in + let conda_env = + Conda.setup_environment ~python_version ~main_subdir install_path + (name ^ Option.value_map ~default:"" version ~f:(sprintf ".%s")) + in + let single_file_check id = + single_file ~host Conda.(bin_in_conda_environment ~conda_env id) + in + let exec_check = + match check_bin with + | None -> single_file_check name + | Some s -> single_file_check s + in + let ensure = + workflow_node exec_check + ~name:("Installing Bioconda tool: " ^ name) + ~edges:[ depends_on Conda.(configured ~run_program ~host ~conda_env) ] + (* No need to install the tool, it should already be in the conda env *) + in + let init = Conda.init_env ~conda_env () in + Machine.Tool.create ~ensure ~init tool_def + +let default ~host ~run_program ~install_path () = + Machine.Tool.(Kit.of_list [ + create_bioconda_tool ~host ~run_program ~install_path + ~check_bin:"OptiTypePipeline.py" Default.optitype; + create_bioconda_tool ~host ~run_program ~install_path + Default.picard; + create_bioconda_tool ~host ~run_program ~install_path + Default.seqtk; + create_bioconda_tool ~host ~run_program ~install_path + ~check_bin:"seq2HLA" Default.seq2hla; + create_bioconda_tool ~host ~run_program ~install_path + ~check_bin:"snpEff" Default.snpeff; + ]) diff --git a/src/environment_setup/bioconda.mli b/src/environment_setup/bioconda.mli new file mode 100644 index 0000000..5c47521 --- /dev/null +++ b/src/environment_setup/bioconda.mli @@ -0,0 +1,18 @@ +open Biokepi_run_environment +open Common + +val default: + host: Common.KEDSL.Host.t -> + run_program: Machine.Make_fun.t -> + install_path: string -> + unit -> + Machine.Tool.Kit.t + + +val create_bioconda_tool: + host: Common.KEDSL.Host.t -> + run_program: Machine.Make_fun.t -> + install_path: string -> + ?check_bin: string -> + Machine.Tool.Definition.t -> + Machine.Tool.t \ No newline at end of file diff --git a/src/environment_setup/biopam.ml b/src/environment_setup/biopam.ml index be2756d..72e9fae 100644 --- a/src/environment_setup/biopam.ml +++ b/src/environment_setup/biopam.ml @@ -106,7 +106,7 @@ let get_conda_env = ] (* see https://github.com/ContinuumIO/anaconda-issues/issues/152#issuecomment-225214743 *) ~banned_packages: [ "readline"; "ncurses" ] - ~python_version:`Python2 + ~python_version:`Python_2 (* Hide the messy logic of calling opam in here. This should not be exported and use the Biopam functions directly.*) @@ -280,37 +280,11 @@ let provide ~run_program ~host ~install_path it = let test_version ~host path = KEDSL.Command.shell ~host (sprintf "%s --version" path) -let picard = - install_target - ~tool_type:(`Library "PICARD_JAR") - ~witness:"picard.jar" - (Machine.Tool.Definition.create "picard" ~version:"1.128") - let bowtie = install_target ~witness:"bowtie" ~test:test_version Machine.Tool.Default.bowtie - -let seq2hla = - install_target - ~witness:"seq2HLA" ~requires_conda:true - ~package:"seq2HLA.2.2" (* we need to uppercase HLA for opam *) - Machine.Tool.Default.seq2hla - -let optitype = - install_target ~witness:"OptiTypePipeline" Machine.Tool.Default.optitype - ~requires_conda:true - ~init_environment:KEDSL.Program.( - fun ~install_path -> - let name = Machine.Tool.(Default.optitype.Definition.name) in - let version = Machine.Tool.(Default.optitype.Definition.version) in - shf "export OPAMROOT=%s.%s" - (Opam.root_of_package name |> Opam.root ~install_path) - (match version with None -> "NOVERSION" | Some v -> v) - && shf "export OPTITYPE_DATA=$(%s config var lib)/optitype" - (Opam.bin ~install_path) - ) - + let igvxml = install_target ~witness:"igvxml" ~test:test_version @@ -336,10 +310,7 @@ let default : _ = fun ~run_program ~host ~install_path () -> Machine.Tool.Kit.of_list (List.map ~f:(provide ~run_program ~host ~install_path) [ - picard; bowtie; - seq2hla; - optitype; igvxml; hlarp; ]) diff --git a/src/environment_setup/conda.ml b/src/environment_setup/conda.ml index b0526ac..82c4aa3 100644 --- a/src/environment_setup/conda.ml +++ b/src/environment_setup/conda.ml @@ -16,9 +16,16 @@ type conda_version_type = [ | `Version of string ] +type python_version_type = [ + | `Python_2 + | `Python_3 + | `Python_custom of string + | `Python_tool_dependency of string +] + type conda_environment_type = { name: string; - python_version: [ `Python2 | `Python3 ]; + python_version: python_version_type; channels: string list; base_packages: (string * conda_version_type) list; banned_packages: string list; @@ -33,10 +40,12 @@ let setup_environment ?(banned_packages = []) ?(main_subdir = "conda_dir") ?(envs_subdir = "envs") - ?(python_version = `Python2) + ?(python_version = `Python_2) install_path name = - let channels = [ "bioconda"; "r" ] @ custom_channels in + let channels = + [ "conda-forge"; "defaults"; "r"; "bioconda" ] @ custom_channels + in {name; python_version; channels; base_packages; banned_packages; install_path; main_subdir; envs_subdir} let main_dir ~conda_env = conda_env.install_path // conda_env.main_subdir @@ -46,6 +55,8 @@ let bin ~conda_env = commands ~conda_env "conda" let activate ~conda_env = commands ~conda_env "activate" let deactivate ~conda_env = commands ~conda_env "deactivate" let environment_path ~conda_env = envs_dir ~conda_env // conda_env.name +let bin_in_conda_environment ~conda_env command = + (environment_path ~conda_env) // "bin" // command (* give a conda command. *) let com ~conda_env fmt = @@ -77,20 +88,32 @@ let installed ~(run_program : Machine.Make_fun.t) ~host ~conda_env = let configured ~conda_env ~(run_program : Machine.Make_fun.t) ~host = let open KEDSL in + let open Program in + let seed_package = + match conda_env.python_version with + | `Python_2 -> "python=2" + | `Python_3 -> "python=3" + | `Python_custom v -> sprintf "python=%s" v + | `Python_tool_dependency t -> t + in let create_env = - com ~conda_env "create -y -q --prefix %s python=%d" + com ~conda_env "create -y -q --prefix %s %s" (envs_dir ~conda_env // conda_env.name) - (match conda_env.python_version with `Python2 -> 2 | `Python3 -> 3) + seed_package in let install_package (package, version) = - Program.( - shf "conda install -y %s%s" - package - (match version with `Latest -> "" | `Version v -> "=" ^ v) - ) + shf "conda install -y %s%s" package + (match version with `Latest -> "" | `Version v -> "=" ^ v) + in + let force_rm_package package = shf "conda remove -y --force %s" package in + let configure_channels = + let config_cmd = shf "%s config --add channels %s" (bin ~conda_env) in + chain (List.map ~f:config_cmd conda_env.channels) in - let force_rm_package package = - Program.(shf "conda remove -y --force %s" package) + let activate_env = + let activate_path = activate ~conda_env in + let abs_env_path = envs_dir ~conda_env // conda_env.name in + shf ". %s %s" activate_path abs_env_path in let make = run_program @@ -99,9 +122,9 @@ let configured ~conda_env ~(run_program : Machine.Make_fun.t) ~host = `Self_identification ["conda"; "configuration"]; ] Program.( - sh create_env - && shf "source %s %s" (activate ~conda_env) (envs_dir ~conda_env // conda_env.name) - && chain (List.map ~f:(shf "conda config --add channels %s") conda_env.channels) + configure_channels + && sh create_env + && activate_env && chain (List.map ~f:install_package conda_env.base_packages) && chain (List.map ~f:force_rm_package conda_env.banned_packages) ) @@ -111,7 +134,8 @@ let configured ~conda_env ~(run_program : Machine.Make_fun.t) ~host = (single_file ~host (envs_dir ~conda_env // conda_env.name // "bin/conda") :> < is_done : Common.KEDSL.Condition.t option >) in let name = - sprintf "Configure conda: %s" conda_env.name in + sprintf "Configure conda with %s: %s" seed_package conda_env.name + in workflow_node product ~make ~name ~edges @@ -121,7 +145,7 @@ let init_env ~conda_env () = otherwise, activate the new one *) KEDSL.Program.( shf "[ ${CONDA_PREFIX-none} != \"%s\" ] \ - && source %s %s \ + && . %s %s \ || echo 'Already in conda env: %s'" prefix (activate ~conda_env) prefix prefix ) @@ -130,7 +154,7 @@ let deactivate_env ~conda_env () = let prefix = (envs_dir ~conda_env // conda_env.name) in KEDSL.Program.( shf "[ ${CONDA_PREFIX-none} == \"%s\" ] \ - && source %s \ + && . %s \ || echo 'Doing nothing. The conda env is not active: %s'" prefix (deactivate ~conda_env) prefix ) diff --git a/src/environment_setup/conda.mli b/src/environment_setup/conda.mli index 226f2b4..8615bd8 100644 --- a/src/environment_setup/conda.mli +++ b/src/environment_setup/conda.mli @@ -1,5 +1,12 @@ open Biokepi_run_environment +type python_version_type = [ + | `Python_2 + | `Python_3 + | `Python_custom of string + | `Python_tool_dependency of string +] + type conda_version_type = [ | `Latest | `Version of string @@ -7,7 +14,7 @@ type conda_version_type = [ type conda_environment_type = private { name: string; (* name of the environment *) - python_version: [ `Python2 | `Python3 ]; + python_version: python_version_type; channels: string list; (* supported installation channels *) base_packages: (string * conda_version_type) list; (* defualt installations *) banned_packages: string list; (* packages to be removed after initial setup *) @@ -23,7 +30,7 @@ val setup_environment : ?banned_packages: string list -> ?main_subdir: string -> ?envs_subdir: string -> - ?python_version: [ `Python2 | `Python3 ] -> + ?python_version: python_version_type -> string -> string -> conda_environment_type @@ -51,3 +58,10 @@ val deactivate_env : val environment_path : conda_env: conda_environment_type -> string + +(** A helper method to construct expected path of a binary (command) + given a conda environment **) +val bin_in_conda_environment : + conda_env: conda_environment_type -> + string -> + string diff --git a/src/environment_setup/download_reference_genomes.ml b/src/environment_setup/download_reference_genomes.ml index 9547228..0a3349c 100644 --- a/src/environment_setup/download_reference_genomes.ml +++ b/src/environment_setup/download_reference_genomes.ml @@ -23,6 +23,7 @@ let of_specification cdna; whess; major_contigs; + snpeff_name; } = specification in let dest_file f = destination_path // name // f in let rec compile_location filename = diff --git a/src/environment_setup/netmhc.ml b/src/environment_setup/netmhc.ml index 4dae6e7..69c9619 100644 --- a/src/environment_setup/netmhc.ml +++ b/src/environment_setup/netmhc.ml @@ -153,7 +153,7 @@ let guess_folder_name tool_file_loc = let netmhc_conda_env install_path = Conda.(setup_environment - ~python_version:`Python2 + ~python_version:`Python_2 install_path "netmhc_conda") diff --git a/src/environment_setup/python_package.ml b/src/environment_setup/python_package.ml index 7727309..38f740b 100644 --- a/src/environment_setup/python_package.ml +++ b/src/environment_setup/python_package.ml @@ -3,74 +3,97 @@ open Common type install_tool_type = Pip | Conda +type tool_def_type = Machine.Tool.Definition.t + type install_source_type = - | Package_PyPI of string - | Package_Source of string * string - | Package_Conda of string + | Package_PyPI of tool_def_type + | Package_Source of tool_def_type * string + | Package_Conda of tool_def_type + +type python_package_spec = + install_tool_type * install_source_type * (string option) -let bin_in_conda_environment ~conda_env command = - Conda.(environment_path ~conda_env) // "bin" // command let create_python_tool ~host ~(run_program : Machine.Make_fun.t) ~install_path - ?check_bin ?version ?(python_version=`Python3) - (installation:install_tool_type * install_source_type) = + ?check_bin ?(python_version=`Python_3) package_spec = let open KEDSL in - let versionize ?version ~sep name = match version with + let pkg_id ?version ~sep name = + match version with | None -> name | Some v -> name ^ sep ^ v in - let install_command, name = - match installation with - | (Pip, Package_PyPI pname) -> - ["pip"; "install"; versionize ?version ~sep:"==" pname], pname - | (Pip, Package_Source (pname, source)) -> - ["pip"; "install"; source], pname - | (Conda, Package_Conda pname) -> - ["conda"; "install"; "-y"; versionize ?version ~sep:"=" pname], pname - | (Conda, Package_PyPI pname) -> - ["conda"; "skeleton"; "pypi"; pname], pname + let describe_source ?version name = + sprintf " # source code for: %s" (pkg_id ~sep:"=" ?version name) + in + let src_cmd ~src ?version name = + ["pip"; "install"; src; describe_source ?version name] + in + let pypi_cmd ?version name = + ["pip"; "install"; pkg_id ~sep:"==" ?version name] + in + let conda_cmd ?version name = + ["conda"; "install"; pkg_id ~sep:"=" ?version name] + in + let tool, install_command, custom_bin, base_packages = + match package_spec with + | (Pip, Package_Source (tool, src), cbin, pkgs) + -> (tool, src_cmd ~src, cbin, pkgs) + | (Pip, Package_PyPI tool, cbin, pkgs) -> (tool, pypi_cmd, cbin, pkgs) + | (Conda, Package_Conda tool, cbin, pkgs) -> (tool, conda_cmd, cbin, pkgs) | _ -> failwith "Installation type not supported." in + let name, version = + Machine.Tool.Definition.(get_name tool, get_version tool) + in let main_subdir = name ^ "_conda_dir" in let conda_env = - Conda.setup_environment ~python_version ~main_subdir install_path - (name ^ Option.value_map ~default:"" version ~f:(sprintf ".%s")) + Conda.setup_environment + ~python_version ~base_packages + ~main_subdir install_path + (name ^ Option.value_map ~default:"NOVERSION" version ~f:(sprintf ".%s")) in let single_file_check id = - single_file ~host (bin_in_conda_environment ~conda_env id) + single_file ~host Conda.(bin_in_conda_environment ~conda_env id) in let exec_check = - match check_bin with + match custom_bin with | None -> single_file_check name - | Some s -> single_file_check s + | Some cb -> single_file_check cb in let ensure = workflow_node exec_check ~name:("Installing Python tool: " ^ name) ~edges:[ depends_on Conda.(configured ~run_program ~host ~conda_env) ] - ~make:(run_program - ~requirements:[ - `Internet_access; `Self_identification ["python"; "installation"] - ] - Program.( - Conda.init_env ~conda_env () - && exec install_command) - ) + ~make:( + run_program + ~requirements:[ + `Internet_access; + `Self_identification ["python"; "installation"]; + ] + Program.( + Conda.init_env ~conda_env () + && exec (install_command ?version name) + ) + ) in let init = Conda.init_env ~conda_env () in - Machine.Tool.create Machine.Tool.Definition.(create name) ~ensure ~init + Machine.Tool.create ~init ~ensure tool + + +(* Versions are part of the machine's default toolkit (see machine.ml) *) +let default_python_packages = + let open Machine.Tool.Default in + [ + (Pip, Package_PyPI pyensembl, None, []); + (Pip, Package_PyPI isovar, Some "isovar-protein-sequences.py", []); + (Pip, Package_PyPI vaxrank, None, [ ("wktmltopdf", `Latest); ]); + (Pip, Package_PyPI vcfannotatepolyphen, None, []); + (Pip, Package_PyPI topiary, None, []); + ] + let default ~host ~run_program ~install_path () = - Machine.Tool.Kit.of_list [ - create_python_tool ~host ~run_program ~install_path - ~version:"1.1.0" (Pip, Package_PyPI "pyensembl"); - create_python_tool ~host ~run_program ~install_path - ~version:"0.1.2" (Pip, Package_PyPI "vcf-annotate-polyphen"); - create_python_tool ~host ~run_program ~install_path - ~version:"0.7.0" ~check_bin:"isovar-protein-sequences.py" - (Pip, Package_PyPI "isovar"); - create_python_tool ~host ~run_program ~install_path - ~version:"1.2.1" (Pip, Package_PyPI "topiary"); - create_python_tool ~host ~run_program ~install_path - ~version:"0.6.0" (Pip, Package_PyPI "vaxrank"); - ] + let pkg_to_tool pkg_spec = + create_python_tool ~host ~run_program ~install_path pkg_spec + in + Machine.Tool.Kit.of_list (List.map ~f:pkg_to_tool default_python_packages) diff --git a/src/environment_setup/tool_providers.ml b/src/environment_setup/tool_providers.ml index e3b709d..c0623a5 100644 --- a/src/environment_setup/tool_providers.ml +++ b/src/environment_setup/tool_providers.ml @@ -389,16 +389,6 @@ let varscan = KEDSL.Program.(shf "export VARSCAN_JAR=%s/%s" path jar) in Installable_tool.make Machine.Tool.Default.varscan ~url ~init_program ~witness -let picard = - let url = - "https://github.com/broadinstitute/picard/releases/download/1.127/\ - picard-tools-1.127.zip" - in - let jar = "picard-tools-1.127" // "picard.jar" in - let init_program ~path = KEDSL.Program.(shf "export PICARD_JAR=%s/%s" path jar) in - Installable_tool.make Machine.Tool.Default.picard ~url ~init_program - ~witness:(witness_file jar) - (** Mutect (and some other tools) are behind some web-login annoying thing: c.f. @@ -416,10 +406,10 @@ let mutect_tool let open KEDSL in let install_path = install_tools_path // Tool_def.to_directory_name tool in let conda_env = (* mutect doesn't run on Java; so need to provide Java 7 *) - Conda.(setup_environment - ~python_version:`Python2 + Conda.(setup_environment + ~python_version:`Python_2 ~base_packages:[("java-jdk", `Version "7.0.91")] - install_path + install_path "mutect_env") in let conda_ensure = Conda.(configured ~run_program ~host ~conda_env) in @@ -429,9 +419,8 @@ let mutect_tool let ensure = workflow_node without_product ~name:"MuTect setup" ~edges in - let init = - Program.(conda_init && shf "export mutect_HOME=%s" install_path) - in + let init = + Program.(conda_init && shf "export mutect_HOME=%s" install_path) in Machine.Tool.create tool ~ensure ~init let gatk_tool @@ -444,6 +433,7 @@ let gatk_tool Machine.Tool.create tool ~ensure ~init:Program.(shf "export GATK_JAR=%s" ensure#product#path) + (** Strelka is built from source but does not seem to build on MacOSX. @@ -459,7 +449,9 @@ let strelka = (* C.f. ftp://ftp.illumina.com/v1-branch/v1.0.14/README *) KEDSL.Program.( shf "./configure --prefix=%s" (path // "usr") - && sh "make && make install" + && sh "make" + && sh "chmod -R a+rx ./*" (* it has weird access rights by default *) + && sh "make install" ) in let init_program ~path = @@ -554,7 +546,6 @@ let default_toolkit install bedtools; install vcftools; install strelka; - install picard; install somaticsniper; install sambamba; install varscan; @@ -576,6 +567,8 @@ let default_toolkit ~install_path:(install_tools_path // "biopam-kit") (); Python_package.default ~run_program ~host ~install_path: (install_tools_path // "python-tools") (); + Bioconda.default ~run_program ~host + ~install_path: (install_tools_path // "bioconda") (); Netmhc.default ~run_program ~host ~install_path: (install_tools_path // "netmhc-tools") ~netmhc_config: (netmhc_config ()) diff --git a/src/pipeline_edsl/optimization_framework.ml b/src/pipeline_edsl/optimization_framework.ml index 9797a21..a69813e 100644 --- a/src/pipeline_edsl/optimization_framework.ml +++ b/src/pipeline_edsl/optimization_framework.ml @@ -216,6 +216,9 @@ module Generic_optimizer let vcf_annotate_polyphen vcf = fwd (Input.vcf_annotate_polyphen (bwd vcf)) + + let snpeff vcf = fwd (Input.snpeff (bwd vcf)) + let isovar ?configuration vcf bam = fwd (Input.isovar ?configuration (bwd vcf) (bwd bam)) let topiary ?configuration vcfs predictor alleles = @@ -225,4 +228,7 @@ module Generic_optimizer fwd (Input.vaxrank ?configuration (List.map ~f:(fun v -> (bwd v)) vcfs) (bwd bam) predictor (bwd alleles)) + + let seqtk_shift_phred_scores fastq = + fwd (Input.seqtk_shift_phred_scores (bwd fastq)) end diff --git a/src/pipeline_edsl/semantics.ml b/src/pipeline_edsl/semantics.ml index 95f93a1..679a89d 100644 --- a/src/pipeline_edsl/semantics.ml +++ b/src/pipeline_edsl/semantics.ml @@ -303,6 +303,10 @@ module type Bioinformatics_base = sig [ `Vcf ] repr -> [ `Vcf ] repr + val snpeff: + [ `Vcf ] repr -> + [ `Vcf ] repr + val isovar: ?configuration: Isovar.Configuration.t -> [ `Vcf ] repr -> @@ -312,7 +316,7 @@ module type Bioinformatics_base = sig val topiary: ?configuration: Topiary.Configuration.t -> [ `Vcf ] repr list -> - Topiary.predictor_type -> + Biokepi_run_environment.Hla_utilities.predictor_type -> [ `MHC_alleles ] repr -> [ `Topiary ] repr @@ -320,9 +324,13 @@ module type Bioinformatics_base = sig ?configuration: Vaxrank.Configuration.t -> [ `Vcf ] repr list -> [ `Bam ] repr -> - Topiary.predictor_type -> + Biokepi_run_environment.Hla_utilities.predictor_type -> [ `MHC_alleles ] repr -> [ `Vaxrank ] repr + val seqtk_shift_phred_scores: + [ `Fastq ] repr -> + [ `Fastq ] repr + end diff --git a/src/pipeline_edsl/to_json.ml b/src/pipeline_edsl/to_json.ml index bb6e9ec..2f03328 100644 --- a/src/pipeline_edsl/to_json.ml +++ b/src/pipeline_edsl/to_json.ml @@ -1,5 +1,6 @@ open Nonstd module Tools = Biokepi_bfx_tools +module Hla_utils = Biokepi_run_environment.Hla_utilities type json = Yojson.Basic.json @@ -278,6 +279,9 @@ module Make_serializer (How : sig let vcf_annotate_polyphen = one_to_one "vcf_annotate_polyphen" "default" + let snpeff = + one_to_one "snpeff" "default" + let isovar ?(configuration = Tools.Isovar.Configuration.default) vcf bam = fun ~(var_count : int) -> @@ -297,7 +301,7 @@ module Make_serializer (How : sig function_call "topiary" ([ "configuration", string Tools.Topiary.Configuration.(name configuration); "alleles", alleles ~var_count; - "predictor", string Tools.Topiary.(predictor_to_string predictor); + "predictor", string Hla_utils.(predictor_to_string predictor); ] @ (List.mapi ~f:(fun i v -> (sprintf "vcf%d" i, v)) vcfs_compiled)) let vaxrank @@ -309,7 +313,7 @@ module Make_serializer (How : sig function_call "vaxrank" ([ "configuration", string Tools.Vaxrank.Configuration.(name configuration); "alleles", alleles ~var_count; - "predictor", string Tools.Topiary.(predictor_to_string predictor); + "predictor", string Hla_utils.(predictor_to_string predictor); "bam", bam_compiled; ] @ (List.mapi ~f:(fun i v -> (sprintf "vcf%d" i, v)) vcfs_compiled)) @@ -372,6 +376,9 @@ module Make_serializer (How : sig variant_caller "virmid" configuration.Tools.Virmid.Configuration.name + let seqtk_shift_phred_scores = + one_to_one "seqtk_shift_phred_scores" "default" + end include Make_serializer (struct diff --git a/src/pipeline_edsl/to_workflow.ml b/src/pipeline_edsl/to_workflow.ml index e1d08b9..723601e 100644 --- a/src/pipeline_edsl/to_workflow.ml +++ b/src/pipeline_edsl/to_workflow.ml @@ -964,7 +964,7 @@ module Make (Config : Compiler_configuration) let concat_files ~read l = let result_path = Name_file.in_directory - Config.work_dir + Config.work_dir ~readable_suffix:( sprintf "%s-Read%d-Concat.fastq" first_fastq#product#escaped_sample_name read) ( @@ -1132,7 +1132,7 @@ module Make (Config : Compiler_configuration) let picard_clean_bam bam = let input_bam = get_bam (AF.get_file bam) in let output_bam_path = - Name_file.from_path + Name_file.from_path input_bam#product#path ~readable_suffix:"cleaned.bam" [] @@ -1171,6 +1171,22 @@ module Make (Config : Compiler_configuration) ] + let seqtk_shift_phred_scores fastq = + let input_fastq = get_fastq (AF.get_file fastq) in + let output_folder = + Name_file.from_path input_fastq#product#sample_name + ~readable_suffix:"phred33-fastq" [ "seqtk"; ] + in + Fastq ( + Tools.Seqtk.shift_phred_scores + ~run_with + ~output_postfix:".fastq" + ~input_fastq + ~output_folder:(Config.work_dir // output_folder) + ) + |> AF.with_provenance "seqtk-shift-phred-scores" + ["fastq", AF.get_provenance fastq] + let seq2hla fq = let fastq = get_fastq (AF.get_file fq) in let r1 = KEDSL.read_1_file_node fastq in @@ -1206,10 +1222,8 @@ module Make (Config : Compiler_configuration) let r = get_optitype_result (AF.get_file v) in `Optitype r, out r#product#path, ("optitype", AF.get_provenance v) in - let res = - hlarp ~hla_result ~output_path ~extract_alleles:true in - MHC_alleles (res ()) - |> AF.with_provenance "hlarp" [prov_argument] + let res = hlarp ~hla_result ~output_path in + MHC_alleles res |> AF.with_provenance "hlarp" [prov_argument] let filter_to_region vcf bed = let vcff = get_vcf (AF.get_file vcf) in @@ -1267,6 +1281,25 @@ module Make (Config : Compiler_configuration) ) |> AF.with_provenance "vcf-annotate-polyphen" ["vcf", AF.get_provenance vcf] + let snpeff vcf = + let open KEDSL in + let v = get_vcf (AF.get_file vcf) in + let reference_build = v#product#reference_build in + let out_folder = + Name_file.in_directory ~readable_suffix:"snpeff" Config.work_dir + [ Filename.basename v#product#path; reference_build; ] + in + let snpeff_run = + Tools.Snpeff.annotate ~run_with ~reference_build ~input_vcf:v ~out_folder + in + Vcf ( + workflow_node + (transform_vcf v#product ~path:(snpeff_run#product#path)) + ~name:(sprintf "Fetch annotated VCF: %s" v#render#name) + ~edges:[ depends_on snpeff_run; ] + ) + |> AF.with_provenance "snpeff" ["vcf", AF.get_provenance vcf] + let isovar ?(configuration=Tools.Isovar.Configuration.default) vcf bam = @@ -1304,7 +1337,7 @@ module Make (Config : Compiler_configuration) let topiary ?(configuration=Tools.Topiary.Configuration.default) vcfs predictor alleles = let vs = List.map ~f:(fun x -> AF.get_file x |> get_vcf) vcfs in - let refs = + let refs = vs |> List.map ~f:(fun v -> v#product#reference_build) |> List.dedup in let reference_build = @@ -1317,9 +1350,9 @@ module Make (Config : Compiler_configuration) in let mhc = get_mhc_alleles (AF.get_file alleles) in let output_file = - Name_file.in_directory ~readable_suffix:"topiary.tsv" Config.work_dir + Name_file.in_directory ~readable_suffix:"topiary.tsv" Config.work_dir ([ - Tools.Topiary.predictor_to_string predictor; + Hla_utilities.predictor_to_string predictor; Tools.Topiary.Configuration.name configuration; Filename.chop_extension (Filename.basename mhc#product#path); ] @ (List.map vs ~f:(fun v -> v#product#path))) @@ -1335,7 +1368,7 @@ module Make (Config : Compiler_configuration) (("alleles", AF.get_provenance alleles) :: List.mapi vcfs ~f:(fun i v -> sprintf "vcf_%d" i, AF.get_provenance v)) ~string_arguments:[ - "predictor", Tools.Topiary.predictor_to_string predictor; + "predictor", Hla_utilities.predictor_to_string predictor; "configuration-name", Tools.Topiary.Configuration.name configuration; ] @@ -1366,7 +1399,7 @@ module Make (Config : Compiler_configuration) Name_file.in_directory ~readable_suffix:"vaxrank" Config.work_dir ([ Tools.Vaxrank.Configuration.name configuration; - Tools.Topiary.predictor_to_string predictor; + Hla_utilities.predictor_to_string predictor; (Filename.chop_extension (Filename.basename mhc#product#path)); ] @ (List.map vs ~f:(fun v -> @@ -1385,7 +1418,7 @@ module Make (Config : Compiler_configuration) :: ("bam", AF.get_provenance bam) :: List.mapi vcfs ~f:(fun i v -> sprintf "vcf_%d" i, AF.get_provenance v)) ~string_arguments:[ - "predictor", Tools.Topiary.predictor_to_string predictor; + "predictor", Hla_utilities.predictor_to_string predictor; "configuration-name", Tools.Vaxrank.Configuration.name configuration; ] @@ -1404,10 +1437,15 @@ module Make (Config : Compiler_configuration) fastq#product#fragment_id_forced; ] in + let run_name = fastq#product#escaped_sample_name in Optitype_result ( - Tools.Optitype.hla_type - ~work_dir ~run_with ~run_name:fastq#product#escaped_sample_name ~fastq - how + match how with + | `DNA -> + Tools.Optitype.dna_hla_type_with_bwamem + ~configuration:Tools.Bwa.Configuration.Mem.default + ~work_dir ~run_with ~fastq ~run_name + | `RNA -> + Tools.Optitype.hla_type ~work_dir ~run_with ~fastq ~run_name `RNA ) |> AF.with_provenance "optitype" ["fastq", AF.get_provenance fq] ~string_arguments:["input-type", intput_type] diff --git a/src/run_environment/common.ml b/src/run_environment/common.ml index 94c7afc..f7d458d 100644 --- a/src/run_environment/common.ml +++ b/src/run_environment/common.ml @@ -190,6 +190,20 @@ module KEDSL = struct | other -> '_') end + let transform_fastq_reads + ?name ?fragment_id + (fq_reads: fastq_reads) r1 r2_opt + : fastq_reads + = + fastq_reads + ~host:fq_reads#r1#host + ~name:(match name with Some n -> n | None -> fq_reads#sample_name) + ?fragment_id:( + match fragment_id with + | Some fi -> fi + | None -> fq_reads#fragment_id) + r1 r2_opt + let read_1_file_node (fq : fastq_reads workflow_node) = let product = fq#product#r1 in workflow_node product diff --git a/src/run_environment/hla_utilities.ml b/src/run_environment/hla_utilities.ml new file mode 100644 index 0000000..f7f59aa --- /dev/null +++ b/src/run_environment/hla_utilities.ml @@ -0,0 +1,97 @@ +(* Hassle-free HLA/MHC handling *) +open Common + +type predictor_type = [ + | `NetMHC + | `NetMHCpan + | `NetMHCIIpan + | `NetMHCcons + | `Random + | `SMM + | `SMM_PMBEC + | `NetMHCpan_IEDB + | `NetMHCcons_IEDB + | `SMM_IEDB + | `SMM_PMBEC_IEDB +] + +let predictor_to_string = function + | `NetMHC -> "netmhc" + | `NetMHCpan -> "netmhcpan" + | `NetMHCIIpan -> "netmhciipan" + | `NetMHCcons -> "netmhccons" + | `Random -> "random" + | `SMM -> "smm" + | `SMM_PMBEC -> "smm-pmbec" + | `NetMHCpan_IEDB -> "netmhcpan-iedb" + | `NetMHCcons_IEDB -> "netmhccons-iedb" + | `SMM_IEDB -> "smm-iedb" + | `SMM_PMBEC_IEDB -> "smm-pmbec-iedb" + +let predictor_to_tool ~run_with predictor = + let get_tool t = + let tool = + Machine.get_tool + run_with + Machine.Tool.Definition.(create t) + in + let ensure = Machine.Tool.(ensure tool) in + let init = Machine.Tool.(init tool) in + (ensure, init) + in + match predictor with + | `NetMHC -> Some (get_tool "netMHC") + | `NetMHCpan -> Some (get_tool "netMHCpan") + | `NetMHCIIpan -> Some (get_tool "netMHCIIpan") + | `NetMHCcons -> Some (get_tool "netMHCcons") + | _ -> None + + +(* + Example input (all in): + 1,A*03:01,,0.026478,samplename + 1,B*37:01',,0.000000,samplename + 1, C*06:02,,0.000086,samplename + 2,DQA1*01:02 ,,0.000000,samplename + Example output (type-I predictions out as a plain list): + A*03:01 + B*37:01 + C*06:02 + C*07:02 + + So the following "one-liner" + - extracts the second column + - trims the allele names from white-spaces around them + - gets rid of 's at the end of the allele names + - removes type-II predictions (since we don't support + type-II predictions) + to be able to feed the file into `mhctools` based utilities, + including topiary, vaxrank, and netmhctools itself. +*) + +let sanitize_hlarp_out_for_mhctools ~run_with ~hlarp_result ~output_path = + let open KEDSL in + let input_path = hlarp_result#product#path in + let name = + sprintf + "Extract and sanitize alleles: %s" + (hlarp_result#render#name) + in + let edges = [ depends_on hlarp_result; ] in + let product = single_file ~host:(Machine.as_host run_with) output_path in + let tmp_path = output_path ^ ".tmp" in + let make = Machine.( + run_program run_with + ~requirements:[ `Quick_run; ] + Program.( + shf "cat %s \ + | grep -v '^2' \ + | awk -F , '{ gsub(/^[ \t]+|[ \t]+$/,\"\", $2); print $2}' \ + | tail -n +2 \ + | sed \"s/'//\" > %s \ + && mv %s %s" + input_path tmp_path tmp_path output_path + ) + ) + in + workflow_node product ~name ~make ~edges diff --git a/src/run_environment/machine.ml b/src/run_environment/machine.ml index 2a544ec..8384fb4 100644 --- a/src/run_environment/machine.ml +++ b/src/run_environment/machine.ml @@ -28,6 +28,8 @@ module Tool = struct sprintf "%s.%s" name (Option.value ~default:"NOVERSION" version) let to_string = to_opam_name let to_directory_name = to_opam_name + let get_version t = t.version + let get_name t = t.name end module Default = struct open Definition @@ -40,7 +42,6 @@ module Tool = struct let bedtools = create "bedtools" ~version:"2.23.0" let somaticsniper = create "somaticsniper" ~version:"1.0.3" let varscan = create "varscan" ~version:"2.3.5" - let picard = create "picard" ~version:"1.127" let mutect = create "mutect" (* We don't know the versions of the users' GATKs *) let gatk = create "gatk" (* idem, because of their non-open-source licenses *) let strelka = create "strelka" ~version:"1.0.14" @@ -54,13 +55,23 @@ module Tool = struct let mosaik = create "mosaik" ~version:"2.2.3" let kallisto = create "kallisto" ~version:"0.42.3" let bowtie = create "bowtie" ~version:"1.1.2" - let optitype = create "optitype" ~version:"1.0.0" - let seq2hla = create "seq2hla" ~version:"2.2" let fastqc = create "fastqc" ~version:"0.11.5" let igvxml = create "igvxml" ~version:"0.1.0" let hlarp = create "hlarp" ~version:"biokepi-branch" let samblaster = create "samblaster" ~version:"v.0.1.22" let delly2 = create "delly2" ~version:"0.7.7" + (* Bioconda *) + let optitype = create "optitype" ~version:"1.2.1-0" + let seqtk = create "seqtk" ~version:"1.2" + let seq2hla = create "seq2hla" ~version:"2.2" + let picard = create "picard" ~version:"2.9.2" + let snpeff = create "snpeff" ~version:"4.3.1m-0" + (* PyPI packages *) + let pyensembl = create "pyensembl" ~version:"1.1.0" + let vcfannotatepolyphen = create "vcf-annotate-polyphen" ~version:"0.1.2" + let topiary = create "topiary" ~version:"1.2.1" + let vaxrank = create "vaxrank" ~version:"0.6.0" + let isovar = create "isovar" ~version:"0.7.0" end type t = { @@ -186,7 +197,23 @@ let create host; run_program; work_dir; max_processors} let name t = t.name -let as_host t = t.host + +let as_host ?with_shell t = + match with_shell with + | None -> t.host + | Some shell -> + begin + let open Ketrew_pure in + let shell_key = "shell" in + let org_uri = Host.to_uri t.host in + let uri_no_shell = Uri.remove_query_param org_uri shell_key in + let uri_with_shell = + let shell_str = sprintf "%s,-c" shell in (* as in `bash -c` *) + Uri.add_query_param uri_no_shell (shell_key, [shell_str;]) + in + KEDSL.Host.parse (Uri.to_string uri_with_shell) + end + let get_pyensembl_cache_dir t = t.pyensembl_cache_dir let get_reference_genome t = t.get_reference_genome let get_tool t tool = diff --git a/src/run_environment/reference_genome.ml b/src/run_environment/reference_genome.ml index e96314c..60408fc 100644 --- a/src/run_environment/reference_genome.ml +++ b/src/run_environment/reference_genome.ml @@ -33,6 +33,7 @@ module Specification = struct cdna: Location.t option; whess: Location.t option; major_contigs: string list option; + snpeff_name: string option; } let create @@ -47,6 +48,7 @@ module Specification = struct ?cdna ?whess ?major_contigs + ?snpeff_name name = { name; ensembl; @@ -60,6 +62,7 @@ module Specification = struct cdna; whess; major_contigs; + snpeff_name; } module Default = struct @@ -105,12 +108,10 @@ module Default = struct let b37_whess_url = "ftp://genetics.bwh.harvard.edu/pph2/whess/\ polyphen-2.2.2-whess-2011_12.sqlite.bz2" - let b37_known_indels_url = "https://storage.googleapis.com/hammerlab-biokepi-data/raw_data/\ Mills_and_1000G_gold_standard.indels.b37.vcf.gz" - let human = "homo sapiens" let mouse = "mus musculus" @@ -133,6 +134,7 @@ module Default = struct ~exome_gtf:Location.(url b37_exome_gtf_url |> gunzip) ~cdna:Location.(url b37_cdna_url |> gunzip) ~whess:Location.(url b37_whess_url |> bunzip2) + ~snpeff_name:"GRCh37.75" let b37decoy = create Name.b37decoy @@ -151,6 +153,7 @@ module Default = struct ~cosmic:Location.(url b37_cosmic_url) ~cdna:Location.(url b37_cdna_url |> gunzip) ~whess:Location.(url b37_whess_url |> bunzip2) + ~snpeff_name:"GRCh37.75" let hg38 = (* Release 87 *) @@ -168,6 +171,7 @@ module Default = struct ~fasta:Location.(url hg38_url|> gunzip) ~dbsnp:Location.(url dbsnp_hg38 |> gunzip) ~known_indels:Location.(url known_indels_hg38 |> gunzip) + ~snpeff_name:"GRCh38.86" let b38 = (* Release 87 *) @@ -191,6 +195,7 @@ module Default = struct ~exome_gtf:Location.(url gtf_b38_url |> gunzip) ~dbsnp:Location.(url dbsnp_url |> gunzip) ~cdna:Location.(url cdna_b38_url |> gunzip) + ~snpeff_name:"GRCh38.86" let hg18 = let hg18_url = @@ -226,6 +231,7 @@ module Default = struct ~dbsnp:Location.(url dbsnp_hg19_url |> gunzip) ~known_indels:Location.(url known_indels_hg19_url |> gunzip) ~whess:Location.(url b37_whess_url |> bunzip2) + ~snpeff_name:"hg19" let mm10 = let mm10_url = @@ -255,6 +261,7 @@ module Default = struct ) ~exome_gtf:Location.(url gene_annotations_gtf |> gunzip) ~cdna:Location.(url cdna_mm10_url |> gunzip) + ~snpeff_name:"mm10" end @@ -283,6 +290,10 @@ let create ?cosmic ?dbsnp ?known_indels ?gtf ?cdna ?whess specification location let name t = t.specification.Specification.name let ensembl t = t.specification.Specification.ensembl let species t = t.specification.Specification.species +let snpeff_name_exn t = + Option.value_exn + ~msg:(sprintf "%s: no snpEff name" (name t)) + t.specification.Specification.snpeff_name let path t = t.location#product#path let cosmic_path_exn t = let msg = sprintf "cosmic_path_exn of %s" (name t) in diff --git a/src/run_environment/reference_genome.mli b/src/run_environment/reference_genome.mli index 31af9a8..565b63b 100644 --- a/src/run_environment/reference_genome.mli +++ b/src/run_environment/reference_genome.mli @@ -52,6 +52,7 @@ module Specification : sig cdna : Location.t option; whess : Location.t option; major_contigs : string list option; + snpeff_name : string option; } val create : ?metadata:string -> @@ -65,6 +66,7 @@ module Specification : sig ?cdna:Location.t -> ?whess:Location.t -> ?major_contigs:string list -> + ?snpeff_name:string -> string -> t module Default : @@ -129,6 +131,7 @@ val known_indels_path_exn : t -> string val gtf_path_exn : t -> string val cdna_path_exn : t -> string val whess_path_exn : t -> string +val snpeff_name_exn: t -> string val major_contigs : t -> Region.t list (** {5 Targets} *) diff --git a/src/run_environment/workflow_utilities.ml b/src/run_environment/workflow_utilities.ml index e48c9ce..e26c989 100644 --- a/src/run_environment/workflow_utilities.ml +++ b/src/run_environment/workflow_utilities.ml @@ -385,5 +385,22 @@ module Vcftools = struct ~make_product ~host ~vcftools ~run_program ?more_edges ~vcfs:[src] ~final_vcf:dest "vcf-sort -c" +end +module Variable_tool_paths = struct + let single_file ~run_with ~tool path = + let open KEDSL in + let condition = + let init = Machine.Tool.init tool in + let host = Machine.as_host ~with_shell:"bash" run_with in + let condition_cmd = + Ketrew_pure.Program.to_single_shell_command + Program.(init && shf "test -e %s" path) + in KEDSL.Command.shell ~host condition_cmd + in + object + method is_done = Some (`Command_returns (condition, 0)) + end end + +