diff --git a/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/README.md b/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/README.md index 1bd3ff6d..01461f08 100644 --- a/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/README.md +++ b/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/README.md @@ -1,27 +1,29 @@ +# Local haplotype reconstruction benchmark + This repository contains the scripts and notebooks used to conduct the benchmark study presented in the manuscript: XXX To reproduce the benchmark study and create the figures presented in the manuscript, follow these instructions: 1. Clone the V-pipe 3.0 repository into your working directory: -``` -git clone https://github.com/cbg-ethz/V-pipe.git -``` + ```shell + git clone https://github.com/cbg-ethz/V-pipe.git + ``` 2. Go to the directory of the benchmarking study for local haplotype reconstruction: -``` -cd V-pipe/resources/auxiliary_workflows/benchmark/resources/local_haplo_setup -``` + ```shell + cd V-pipe/resources/auxiliary_workflows/benchmark/resources/local_haplo_setup + ``` 3. The parameters to reproduce the synthetic datasets are in the parameter files `config_xxx/params.csv`, together with the configuration files `config_xxx/config.yaml`, where the simulation mode, the number of replicates and the methods to be executed are defined. 4. The methods to execute must be defined in a Python script in this directory: -``` -V-pipe/resources/auxiliary_workflows/benchmark/resources/method_definitions. -``` + ```shell + V-pipe/resources/auxiliary_workflows/benchmark/resources/method_definitions + ``` 5. Now that the workflow is set up, go back to the directory: -``` -cd V-pipe/resources/auxiliary_workflows/benchmark/resources/local_haplo_setup -``` + ```shell + cd V-pipe/resources/auxiliary_workflows/benchmark/resources/local_haplo_setup + ``` 6. To install the required Conda environments, execute: -``` -snakemake --conda-create-envs-only --use-conda -c1 -``` + ```shell + snakemake --conda-create-envs-only --use-conda -c1 + ``` 7. To submit the workflow to an LSF cluster, execute `./run_workflow.sh`; otherwise, run the workflow with `snakemake --use-conda -c1`. 8. The workflow writes its results to the directory `results`. 9. Once the workflow has terminated and all result files have been generated, the figures from the manuscript can be reproduced by executing the notebooks in `workflow/notebooks/`.
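A note on the `performance_measures_*` scripts changed below: they compare predictions and ground truth as plain string sets, keying each variant by its 0-based position concatenated with the alternative base (VCF `POS` is 1-based, hence the `- 1`), and for ShoRAH/VILOCA they recompute the variant frequency from strand-specific read counts (`Fvar`, `Rvar`, `Ftot`, `Rtot`). A minimal standalone sketch of both conventions follows; the helper names `variant_key` and `strand_freq` are illustrative and not part of the workflow:

```python
# Sketch (not workflow code) of the conventions used by the
# performance_measures_* scripts below; helper names are made up.

def variant_key(pos_1based: int, alt: str) -> str:
    # VCF POS is 1-based; the scripts key variants as f"{POS - 1}{ALT}"
    return f"{pos_1based - 1}{alt}"

def strand_freq(fvar: int, rvar: int, ftot: int, rtot: int) -> float:
    # ShoRAH/VILOCA-style frequency: variant-supporting reads over total
    # reads, with forward and reverse strands pooled
    return (fvar + rvar) / (ftot + rtot)

# Example: ALT "T" at VCF position 42, supported by 3 forward and 2 reverse
# reads out of 50 and 48 total reads, respectively.
assert variant_key(42, "T") == "41T"
assert abs(strand_freq(3, 2, 50, 48) - 5 / 98) < 1e-12
```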
diff --git a/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/workflow/scripts/performance_measures_indels_freqs.py b/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/workflow/scripts/performance_measures_indels_freqs.py index 4aec704a..0646391a 100644 --- a/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/workflow/scripts/performance_measures_indels_freqs.py +++ b/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/workflow/scripts/performance_measures_indels_freqs.py @@ -27,10 +27,12 @@ def convert_vcf(fname): variant_list.add(f"{zero_based_pos}{base}") return variant_list + def convert_groundtruth(fname): df = pd.read_csv(fname, index_col=0) return set((df["position"].astype(str) + df["variant"]).tolist()) + def mutation_calls_details(vcf_list, groundtruth_list): # compute performance tmp = [] @@ -45,11 +47,16 @@ def mutation_calls_details(vcf_list, groundtruth_list): predicted_variants = convert_vcf(fname_vcf) # iter through ground truth mutations - set_true_variants = [str(gt_row["position"]) + gt_row["variant"] for iter_row, gt_row in pd.read_csv(fname_groundtruth, index_col=0).iterrows()] + set_true_variants = [ + str(gt_row["position"]) + gt_row["variant"] + for iter_row, gt_row in pd.read_csv( + fname_groundtruth, index_col=0 + ).iterrows() + ] for iter_row, gt_row in pd.read_csv(fname_groundtruth, index_col=0).iterrows(): true_variant = str(gt_row["position"]) + gt_row["variant"] mutation_type = str(gt_row["type"]) - if method.startswith(('lofreq', 'shorah', 'viloca')): + if method.startswith(("lofreq", "shorah", "viloca")): is_false_negative = True for variant in VCF(fname_vcf): for idx_base, base in enumerate(variant.ALT): @@ -59,14 +66,20 @@ def mutation_calls_details(vcf_list, groundtruth_list): if true_variant == predicted_variant: is_false_negative = False # get pred_freq - if method.startswith('lofreq'): + if method.startswith("lofreq"): posterior = 1.0 - freq = variant.INFO.get('AF')#vi.split(',')[idx_base] - elif method.startswith(('shorah', 'viloca')): + freq = variant.INFO.get("AF") # vi.split(',')[idx_base] + elif method.startswith(("shorah", "viloca")): if mutation_type != "insertion": - freq = (int(variant.INFO.get('Fvar')) + int(variant.INFO.get('Rvar'))) / (int(variant.INFO.get('Ftot')) + int(variant.INFO.get('Rtot'))) + freq = ( + int(variant.INFO.get("Fvar")) + + int(variant.INFO.get("Rvar")) + ) / ( + int(variant.INFO.get("Ftot")) + + int(variant.INFO.get("Rtot")) + ) elif mutation_type == "insertion": - freq = (float(variant.INFO.get('Freq1'))) + freq = float(variant.INFO.get("Freq1")) tmp.append( { @@ -125,6 +138,7 @@ def main(vcf_list, groundtruth_list, fname_out): df = mutation_calls_details(vcf_list, groundtruth_list) df.to_csv(fname_out) + if __name__ == "__main__": main( [Path(e) for e in snakemake.input.vcf_list], diff --git a/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/workflow/scripts/performance_measures_runtime.py b/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/workflow/scripts/performance_measures_runtime.py index bb8bcdf3..8cabd817 100644 --- a/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/workflow/scripts/performance_measures_runtime.py +++ b/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/workflow/scripts/performance_measures_runtime.py @@ -1,6 +1,7 @@ from pathlib import Path import pandas as pd + def runtime(benchmark_list, fname_out): # gather benchmark 
information tmp = [] diff --git a/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/workflow/scripts/performance_measures_snvs_freqs.py b/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/workflow/scripts/performance_measures_snvs_freqs.py index d9d0e6db..56bb4814 100644 --- a/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/workflow/scripts/performance_measures_snvs_freqs.py +++ b/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/workflow/scripts/performance_measures_snvs_freqs.py @@ -27,10 +27,12 @@ def convert_vcf(fname): variant_list.add(f"{zero_based_pos}{base}") return variant_list + def convert_groundtruth(fname): df = pd.read_csv(fname, index_col=0) return set((df["position"].astype(str) + df["variant"]).tolist()) + def mutation_calls_details(vcf_list, groundtruth_list): # compute performance tmp = [] @@ -48,7 +50,7 @@ def mutation_calls_details(vcf_list, groundtruth_list): for iter_row, gt_row in pd.read_csv(fname_groundtruth, index_col=0).iterrows(): true_variant = str(gt_row["position"]) + gt_row["variant"] mutation_type = str(gt_row["type"]) - if method.startswith(('lofreq', 'shorah')): + if method.startswith(("lofreq", "shorah")): is_false_negative = True for variant in VCF(fname_vcf): for idx_base, base in enumerate(variant.ALT): @@ -58,12 +60,18 @@ def mutation_calls_details(vcf_list, groundtruth_list): if true_variant == predicted_variant: is_false_negative = False # get pred_freq - if method.startswith('lofreq'): + if method.startswith("lofreq"): posterior = 1.0 - freq = variant.INFO.get('AF')#vi.split(',')[idx_base] - elif method.startswith('shorah'): + freq = variant.INFO.get("AF") # vi.split(',')[idx_base] + elif method.startswith("shorah"): posterior = 1.0 - freq = (int(variant.INFO.get('Fvar')) + int(variant.INFO.get('Rvar'))) / (int(variant.INFO.get('Ftot')) + int(variant.INFO.get('Rtot'))) + freq = ( + int(variant.INFO.get("Fvar")) + + int(variant.INFO.get("Rvar")) + ) / ( + int(variant.INFO.get("Ftot")) + + int(variant.INFO.get("Rtot")) + ) tmp.append( { @@ -99,23 +107,41 @@ def mutation_calls_details(vcf_list, groundtruth_list): } ) - elif method.startswith('viloca'): - - colnames=['Chromosome', 'Pos', 'Ref', 'Alt', 'Frq', 'Pst', 'Rvar', 'Fvar', 'Rtot', 'Ftot', 'Qval'] + elif method.startswith("viloca"): + + colnames = [ + "Chromosome", + "Pos", + "Ref", + "Alt", + "Frq", + "Pst", + "Rvar", + "Fvar", + "Rtot", + "Ftot", + "Qval", + ] # we want the file SNVs_0.010000.tsv as the posterior filter was not applied here - fname_SNVs_correct = str(fname_vcf).split("snvs_.vcf")[0] + "work/snv/SNVs_0.010000.tsv" - df_predicted = pd.read_csv(fname_SNVs_correct, names=colnames, header=None ,sep="\t") - df_predicted['Pst'] = pd.to_numeric(df_predicted['Pst'], errors='coerce') + fname_SNVs_correct = ( + str(fname_vcf).split("snvs_.vcf")[0] + "work/snv/SNVs_0.010000.tsv" + ) + df_predicted = pd.read_csv( + fname_SNVs_correct, names=colnames, header=None, sep="\t" + ) + df_predicted["Pst"] = pd.to_numeric( + df_predicted["Pst"], errors="coerce" + ) for iter_row_pred, pred_row in df_predicted.iterrows(): - zero_based_pos = int(pred_row['Pos']) - 1 # VCF is 1-based + zero_based_pos = int(pred_row["Pos"]) - 1 # VCF is 1-based is_false_negative = True predicted_variant = f"{zero_based_pos}{pred_row['Alt']}" if true_variant == predicted_variant: is_false_negative = False - posterior = pred_row['Pst'] - freq = pred_row['Frq'] + posterior = pred_row["Pst"] + freq = pred_row["Frq"] tmp.append( { @@ 
-158,6 +184,7 @@ def main(vcf_list, groundtruth_list, fname_out): df = mutation_calls_details(vcf_list, groundtruth_list) df.to_csv(fname_out) + if __name__ == "__main__": main( [Path(e) for e in snakemake.input.vcf_list], diff --git a/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/workflow/scripts/performance_measures_snvs_viloca.py b/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/workflow/scripts/performance_measures_snvs_viloca.py index f96a8777..a606fa04 100644 --- a/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/workflow/scripts/performance_measures_snvs_viloca.py +++ b/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/workflow/scripts/performance_measures_snvs_viloca.py @@ -18,11 +18,12 @@ def convert_vcf(df): variant_list = set() for idx_row, variant in df.iterrows(): - for base in variant['Alt']: - zero_based_pos = int(variant['Pos']) - 1 # VCF is 1-based + for base in variant["Alt"]: + zero_based_pos = int(variant["Pos"]) - 1 # VCF is 1-based variant_list.add(f"{zero_based_pos}{base}") return variant_list + def convert_groundtruth(fname): df = pd.read_csv(fname, index_col=0) return set((df["position"].astype(str) + df["variant"]).tolist()) @@ -76,20 +77,38 @@ def compute_performance(true_variants, predicted_variants): def performance_plots(vcf_list, groundtruth_list, posterior_threshold): # compute performance - colnames=['Chromosome', 'Pos', 'Ref', 'Alt', 'Frq', 'Pst', 'Rvar', 'Fvar', 'Rtot', 'Ftot', 'Qval'] + colnames = [ + "Chromosome", + "Pos", + "Ref", + "Alt", + "Frq", + "Pst", + "Rvar", + "Fvar", + "Rtot", + "Ftot", + "Qval", + ] tmp = [] fps_tmp = [] for fname_vcf, fname_groundtruth in zip(vcf_list, groundtruth_list): # we want the file SNVs_0.010000.tsv as the posterior filter was not applied here - fname_SNVs_correct = str(fname_vcf).split("snvs_.vcf")[0] + "work/snv/SNVs_0.010000.tsv" + fname_SNVs_correct = ( + str(fname_vcf).split("snvs_.vcf")[0] + "work/snv/SNVs_0.010000.tsv" + ) if os.path.isfile(fname_SNVs_correct): - df_predicted = pd.read_csv(fname_SNVs_correct, names=colnames, header=None ,sep="\t") + df_predicted = pd.read_csv( + fname_SNVs_correct, names=colnames, header=None, sep="\t" + ) else: - fname_SNVs_correct = str(fname_vcf).split("snvs_.vcf")[0] + "work/snv/SNVs_0.010000_final.csv" + fname_SNVs_correct = ( + str(fname_vcf).split("snvs_.vcf")[0] + + "work/snv/SNVs_0.010000_final.csv" + ) df_predicted = pd.read_csv(fname_SNVs_correct) - df_predicted = df_predicted.rename(columns={'Var': 'Alt', - 'Pst2': 'Pst'}) + df_predicted = df_predicted.rename(columns={"Var": "Alt", "Pst2": "Pst"}) parts = str(fname_vcf).split("/") @@ -98,11 +117,10 @@ def performance_plots(vcf_list, groundtruth_list, posterior_threshold): elif len(parts) == 8: # for multi workflow _, _, _, params, method, _, replicate, _ = parts - - #filter dataframe according to posterior_threshold - df_predicted['Pst'] = pd.to_numeric(df_predicted['Pst'], errors='coerce') - df_predicted = df_predicted.dropna(subset=['Pst']) - df_predicted = df_predicted[df_predicted['Pst']>posterior_threshold] + # filter dataframe according to posterior_threshold + df_predicted["Pst"] = pd.to_numeric(df_predicted["Pst"], errors="coerce") + df_predicted = df_predicted.dropna(subset=["Pst"]) + df_predicted = df_predicted[df_predicted["Pst"] > posterior_threshold] true_variants = convert_groundtruth(fname_groundtruth) predicted_variants = convert_vcf(df_predicted) @@ -143,7 +161,6 @@ def performance_plots(vcf_list, 
groundtruth_list, posterior_threshold): return df_perf - def main(vcf_list, groundtruth_list, fname_out): posterior_thresholds = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95] @@ -152,15 +169,14 @@ def main(vcf_list, groundtruth_list, fname_out): for posterior_threshold in posterior_thresholds: df_pos_thres = performance_plots( - vcf_list, - groundtruth_list, - posterior_threshold - ) + vcf_list, groundtruth_list, posterior_threshold + ) dfs_tmp.append(df_pos_thres) pd.concat(dfs_tmp).to_csv(fname_out) + if __name__ == "__main__": main( [Path(e) for e in snakemake.input.vcf_list], diff --git a/resources/auxiliary_workflows/benchmark/resources/method_definitions/cliquesnv_local_haplo.py b/resources/auxiliary_workflows/benchmark/resources/method_definitions/cliquesnv_local_haplo.py index 356c6af9..0fac5b6a 100644 --- a/resources/auxiliary_workflows/benchmark/resources/method_definitions/cliquesnv_local_haplo.py +++ b/resources/auxiliary_workflows/benchmark/resources/method_definitions/cliquesnv_local_haplo.py @@ -27,7 +27,7 @@ def main( check=True, ) - runtime_limit = 15*24*60*60 - 60*60 # 15 days - 1h + runtime_limit = 15 * 24 * 60 * 60 - 60 * 60 # 15 days - 1h cliquesnv_mode = None if seq_type == "illumina": cliquesnv_mode = "snv-illumina" diff --git a/resources/auxiliary_workflows/benchmark/resources/method_definitions/cliquesnv_local_haplo_tf0.001.py b/resources/auxiliary_workflows/benchmark/resources/method_definitions/cliquesnv_local_haplo_tf0.001.py index 356c6af9..0fac5b6a 100644 --- a/resources/auxiliary_workflows/benchmark/resources/method_definitions/cliquesnv_local_haplo_tf0.001.py +++ b/resources/auxiliary_workflows/benchmark/resources/method_definitions/cliquesnv_local_haplo_tf0.001.py @@ -27,7 +27,7 @@ def main( check=True, ) - runtime_limit = 15*24*60*60 - 60*60 # 15 days - 1h + runtime_limit = 15 * 24 * 60 * 60 - 60 * 60 # 15 days - 1h cliquesnv_mode = None if seq_type == "illumina": cliquesnv_mode = "snv-illumina" diff --git a/resources/auxiliary_workflows/benchmark/resources/method_definitions/cliquesnv_local_haplo_tf0.01.py b/resources/auxiliary_workflows/benchmark/resources/method_definitions/cliquesnv_local_haplo_tf0.01.py index 52d91474..93e8e202 100644 --- a/resources/auxiliary_workflows/benchmark/resources/method_definitions/cliquesnv_local_haplo_tf0.01.py +++ b/resources/auxiliary_workflows/benchmark/resources/method_definitions/cliquesnv_local_haplo_tf0.01.py @@ -30,24 +30,24 @@ def main( else: # prepare environment subprocess.run( - ["samtools", "view", "-h", "-o", dname_work / "reads.sam", fname_bam], - check=True, + ["samtools", "view", "-h", "-o", dname_work / "reads.sam", fname_bam], + check=True, ) - runtime_limit = 15*24*60*60 - 60*60 # 15 days - 1h + runtime_limit = 15 * 24 * 60 * 60 - 60 * 60 # 15 days - 1h cliquesnv_mode = None if seq_type == "illumina": - cliquesnv_mode = "snv-illumina" + cliquesnv_mode = "snv-illumina" elif seq_type == "pacbio": - cliquesnv_mode = "snv-pacbio" + cliquesnv_mode = "snv-pacbio" elif seq_type == "nanopore": - cliquesnv_mode = "snv-pacbio" + cliquesnv_mode = "snv-pacbio" else: - raise RuntimeError(f"Invalid sequence technology: {seq_type}") + raise RuntimeError(f"Invalid sequence technology: {seq_type}") # execute tool subprocess.run( - [ + [ "cliquesnv", "-m", cliquesnv_mode, @@ -61,27 +61,27 @@ def main( str(runtime_limit), # 119h*60*60 "-threads", str(threads), - "-Xmx50G", # -Xmx50G - ], - check=True, + "-Xmx50G", # -Xmx50G + ], + check=True, ) fname_cliquesnv = dname_work / "output" / "reads.fasta" # fix 
frequency information if fname_cliquesnv.exists(): - record_list = [] - for record in SeqIO.parse(fname_cliquesnv, "fasta"): - _, _, freq = record.id.split("_") - record.description = f"freq:{freq}" + record_list = [] + for record in SeqIO.parse(fname_cliquesnv, "fasta"): + _, _, freq = record.id.split("_") + record.description = f"freq:{freq}" - seq_nodel = str(record.seq).replace("-", "") - record.seq = Seq(seq_nodel) + seq_nodel = str(record.seq).replace("-", "") + record.seq = Seq(seq_nodel) - record_list.append(record) - #else: + record_list.append(record) + # else: - # record_list = [] - SeqIO.write(record_list, fname_result_haplos, "fasta") + # record_list = [] + SeqIO.write(record_list, fname_result_haplos, "fasta") # create empty vcf files f = open(fname_results_snv, "a") @@ -98,4 +98,4 @@ def main( Path(snakemake.output.dname_work), snakemake.wildcards.seq_tech, snakemake.threads, - ) + ) diff --git a/resources/auxiliary_workflows/benchmark/resources/method_definitions/cliquesnv_local_haplo_tf0.1.py b/resources/auxiliary_workflows/benchmark/resources/method_definitions/cliquesnv_local_haplo_tf0.1.py index daea809d..c00c16e1 100644 --- a/resources/auxiliary_workflows/benchmark/resources/method_definitions/cliquesnv_local_haplo_tf0.1.py +++ b/resources/auxiliary_workflows/benchmark/resources/method_definitions/cliquesnv_local_haplo_tf0.1.py @@ -31,24 +31,24 @@ def main( else: # prepare environment subprocess.run( - ["samtools", "view", "-h", "-o", dname_work / "reads.sam", fname_bam], - check=True, + ["samtools", "view", "-h", "-o", dname_work / "reads.sam", fname_bam], + check=True, ) - runtime_limit = 15*24*60*60 - 60*60 # 15 days - 1h + runtime_limit = 15 * 24 * 60 * 60 - 60 * 60 # 15 days - 1h cliquesnv_mode = None if seq_type == "illumina": - cliquesnv_mode = "snv-illumina" + cliquesnv_mode = "snv-illumina" elif seq_type == "pacbio": - cliquesnv_mode = "snv-pacbio" + cliquesnv_mode = "snv-pacbio" elif seq_type == "nanopore": - cliquesnv_mode = "snv-pacbio" + cliquesnv_mode = "snv-pacbio" else: - raise RuntimeError(f"Invalid sequence technology: {seq_type}") + raise RuntimeError(f"Invalid sequence technology: {seq_type}") # execute tool subprocess.run( - [ + [ "cliquesnv", "-m", cliquesnv_mode, @@ -62,27 +62,27 @@ def main( str(runtime_limit), # 119h*60*60 "-threads", str(threads), - "-Xmx50G", # -Xmx50G - ], - check=True, + "-Xmx50G", # -Xmx50G + ], + check=True, ) fname_cliquesnv = dname_work / "output" / "reads.fasta" # fix frequency information if fname_cliquesnv.exists(): - record_list = [] - for record in SeqIO.parse(fname_cliquesnv, "fasta"): - _, _, freq = record.id.split("_") - record.description = f"freq:{freq}" + record_list = [] + for record in SeqIO.parse(fname_cliquesnv, "fasta"): + _, _, freq = record.id.split("_") + record.description = f"freq:{freq}" - seq_nodel = str(record.seq).replace("-", "") - record.seq = Seq(seq_nodel) + seq_nodel = str(record.seq).replace("-", "") + record.seq = Seq(seq_nodel) - record_list.append(record) - #else: + record_list.append(record) + # else: - # record_list = [] - SeqIO.write(record_list, fname_result_haplos, "fasta") + # record_list = [] + SeqIO.write(record_list, fname_result_haplos, "fasta") # create empty vcf files f = open(fname_results_snv, "a") diff --git a/resources/auxiliary_workflows/benchmark/resources/method_definitions/predicthaplo_local_haplo.py b/resources/auxiliary_workflows/benchmark/resources/method_definitions/predicthaplo_local_haplo.py index 15c5f885..0fae97ee 100644 --- 
a/resources/auxiliary_workflows/benchmark/resources/method_definitions/predicthaplo_local_haplo.py +++ b/resources/auxiliary_workflows/benchmark/resources/method_definitions/predicthaplo_local_haplo.py @@ -30,49 +30,54 @@ from Bio.SeqRecord import SeqRecord -def write_config_file(fname_ref, fname_sam,fname_config, ph_prefix, dname_work, fname_result_haplos): +def write_config_file( + fname_ref, fname_sam, fname_config, ph_prefix, dname_work, fname_result_haplos +): read_length = str(fname_sam).split("read_length~")[1].split("__")[0] if read_length == "Ten_strain_IAV": read_length = 2300 fname_output_haplos = "predicthaplo_output_global_1_2263.fas" - with open((dname_work / "config_5V").resolve(), 'r') as file: + with open((dname_work / "config_5V").resolve(), "r") as file: content = file.readlines() - modified_content = content - modified_content[2] = str(ph_prefix)+'\n' - modified_content[4] = str(fname_ref)+'\n' - modified_content[6] = "0"+'\n' - modified_content[8] = str(fname_sam)+'\n' - modified_content[10] = "0"+'\n' - modified_content[12] = str(fname_result_haplos)+'\n' - modified_content[16] = "10000"+'\n' - modified_content[20] = "1"+'\n' - modified_content[22] = str(int(read_length)+1)+'\n' + modified_content = content + modified_content[2] = str(ph_prefix) + "\n" + modified_content[4] = str(fname_ref) + "\n" + modified_content[6] = "0" + "\n" + modified_content[8] = str(fname_sam) + "\n" + modified_content[10] = "0" + "\n" + modified_content[12] = str(fname_result_haplos) + "\n" + modified_content[16] = "10000" + "\n" + modified_content[20] = "1" + "\n" + modified_content[22] = str(int(read_length) + 1) + "\n" # the following parameters are set such that we don't get the following error: # died with . - modified_content[30] = "-1"+'\n' - modified_content[34] = "0.1"+'\n' - modified_content[18] = "0.0001"+'\n' - modified_content[26] = "1"+'\n' + modified_content[30] = "-1" + "\n" + modified_content[34] = "0.1" + "\n" + modified_content[18] = "0.0001" + "\n" + modified_content[26] = "1" + "\n" - with open(fname_config, 'w') as file: + with open(fname_config, "w") as file: file.writelines(modified_content) -def main(fname_bam, - fname_reference, - fname_results_snv, - fname_result_haplos, - dname_work, - seq_type, - threads, +def main( + fname_bam, + fname_reference, + fname_results_snv, + fname_result_haplos, + dname_work, + seq_type, + threads, ): dname_work.mkdir(parents=True, exist_ok=True) - predicthaplo_exe = "/cluster/work/bewi/members/lfuhrmann/PredictHaplo-1.1/PredictHaplo" + predicthaplo_exe = ( + "/cluster/work/bewi/members/lfuhrmann/PredictHaplo-1.1/PredictHaplo" + ) # prepare environment subprocess.run( @@ -84,7 +89,10 @@ def main(fname_bam, # copy example config file subprocess.run( - ["wget", "https://raw.githubusercontent.com/bmda-unibas/PredictHaplo/master/config_5V"], + [ + "wget", + "https://raw.githubusercontent.com/bmda-unibas/PredictHaplo/master/config_5V", + ], cwd=dname_work, check=True, ) @@ -92,7 +100,14 @@ def main(fname_bam, # modify config file for predicthaplo fname_config = (dname_work / "predicthaplo_config").resolve() ph_prefix = (dname_work / "predicthaplo_output_").resolve() - write_config_file(Path(fname_reference).resolve(), fname_sam, fname_config, ph_prefix, dname_work, fname_result_haplos) + write_config_file( + Path(fname_reference).resolve(), + fname_sam, + fname_config, + ph_prefix, + dname_work, + fname_result_haplos, + ) # execute tool subprocess.run( @@ -112,8 +127,8 @@ def main(fname_bam, # clean output file record_list = [] - for 
record in SeqIO.parse( (dname_work / fname_output_haplos).resolve(), "fasta"): - freq = record.description.split('Freq:')[1].split(". Overla")[0] + for record in SeqIO.parse((dname_work / fname_output_haplos).resolve(), "fasta"): + freq = record.description.split("Freq:")[1].split(". Overla")[0] record.description = f"freq:{freq}" seq_nodel = str(record.seq).replace("-", "") record.seq = Seq(seq_nodel) diff --git a/resources/auxiliary_workflows/benchmark/resources/method_definitions/shorah_default.py b/resources/auxiliary_workflows/benchmark/resources/method_definitions/shorah_default.py index b74b3282..af280068 100644 --- a/resources/auxiliary_workflows/benchmark/resources/method_definitions/shorah_default.py +++ b/resources/auxiliary_workflows/benchmark/resources/method_definitions/shorah_default.py @@ -5,10 +5,12 @@ from pathlib import Path -def main(fname_bam, fname_reference, fname_result, fname_result_haplos, dname_work, seq_tech): +def main( + fname_bam, fname_reference, fname_result, fname_result_haplos, dname_work, seq_tech +): dname_work.mkdir(parents=True, exist_ok=True) - if seq_tech == 'illumina': + if seq_tech == "illumina": window_length = 162 subprocess.run( diff --git a/resources/auxiliary_workflows/benchmark/resources/method_definitions/strainline.py b/resources/auxiliary_workflows/benchmark/resources/method_definitions/strainline.py index b135613c..7b2829be 100644 --- a/resources/auxiliary_workflows/benchmark/resources/method_definitions/strainline.py +++ b/resources/auxiliary_workflows/benchmark/resources/method_definitions/strainline.py @@ -26,15 +26,15 @@ def fastq_to_fasta(input_file, output_file): SeqIO.write(record, f_out, "fasta") - -def main(fname_bam, - fname_reference, - fname_results_snv, - fname_result_haplos, - dname_work, - seq_type, - threads,): - +def main( + fname_bam, + fname_reference, + fname_results_snv, + fname_result_haplos, + dname_work, + seq_type, + threads, +): # create empty vcf files f = open(fname_results_snv, "a") @@ -58,8 +58,8 @@ def main(fname_bam, seq_platform = "ont" # fastq to fasta - fname_reads_fastq = str(fname_bam.resolve()).split("bam")[0]+"fastq" - fname_reads_fasta = str(fname_bam.resolve()).split("bam")[0]+"fasta" + fname_reads_fastq = str(fname_bam.resolve()).split("bam")[0] + "fastq" + fname_reads_fasta = str(fname_bam.resolve()).split("bam")[0] + "fasta" fastq_to_fasta(fname_reads_fastq, fname_reads_fasta) dname_work.mkdir(parents=True, exist_ok=True) @@ -83,9 +83,6 @@ def main(fname_bam, # adapt haplotype output - - - if __name__ == "__main__": main( Path(snakemake.input.fname_bam), diff --git a/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_alpha_0.00001.py b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_alpha_0.00001.py index 6830f7ca..6878caae 100644 --- a/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_alpha_0.00001.py +++ b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_alpha_0.00001.py @@ -27,11 +27,19 @@ def gunzip(source_filepath, dest_filepath, block_size=65536): else: d_file.write(block) + def filter_snvs(fname_vcf_viloca, fname_results_snv, posterior_threshold): vf = pyvcf.VcfFrame.from_file(fname_vcf_viloca) - info_strings = '{"' + vf.df.INFO.str.split(';').str.join('","').str.replace('=','":"').str.replace("\"\",", "") + '"}' + info_strings = ( + '{"' + + vf.df.INFO.str.split(";") + .str.join('","') + .str.replace("=", '":"') + .str.replace('"",', "") + + '"}' + ) info_df = 
pd.json_normalize(info_strings.apply(eval)) - filter_out = info_df["Post1"].astype(float)>posterior_threshold + filter_out = info_df["Post1"].astype(float) > posterior_threshold vf.df = vf.df[filter_out] vf.to_file(fname_results_snv) @@ -43,7 +51,7 @@ def main( fname_results_snv, fname_result_haplos, dname_work, - threads =1 + threads=1, ): genome_size = str(fname_bam).split("genome_size~")[1].split("__coverage")[0] alpha = 0.00001 @@ -115,8 +123,14 @@ def main( # here are all snvs regardless their posterior: (dname_work / "snv" / "SNVs_0.010000_final.vcf") # filter out snvs with low posterior, threshold = 0.9 as default in shorah - fname_vcf_viloca = str(Path(dname_work / "snv" / "SNVs_0.010000_final.vcf").resolve()) - filter_snvs(fname_vcf_viloca, str(Path(fname_results_snv).resolve()), posterior_threshold = 0.9) + fname_vcf_viloca = str( + Path(dname_work / "snv" / "SNVs_0.010000_final.vcf").resolve() + ) + filter_snvs( + fname_vcf_viloca, + str(Path(fname_results_snv).resolve()), + posterior_threshold=0.9, + ) mypath = (dname_work / "support").resolve() onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))] @@ -134,7 +148,9 @@ def main( elif file.endswith(".fas"): fname_haplos = (dname_work / "support" / onlyfiles[0]).resolve() - shutil.copy((dname_work / "support" / file).resolve(), fname_result_haplos) + shutil.copy( + (dname_work / "support" / file).resolve(), fname_result_haplos + ) # fix frequency information @@ -142,7 +158,9 @@ def main( post_list = [] for record in SeqIO.parse(fname_result_haplos, "fasta"): freq_list.append(float(record.description.split("ave_reads=")[-1])) - post_list.append(float(record.description.split("posterior=")[-1].split(" ave_")[0])) + post_list.append( + float(record.description.split("posterior=")[-1].split(" ave_")[0]) + ) norm_freq_list = [float(i) / sum(freq_list) for i in freq_list] record_list = [] @@ -161,4 +179,3 @@ def main( Path(snakemake.output.fname_result_haplos), Path(snakemake.output.dname_work), ) - diff --git a/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_envp.py b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_envp.py index 35288af3..6f8ac848 100644 --- a/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_envp.py +++ b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_envp.py @@ -37,11 +37,11 @@ def main( n_mfa_starts = 1 win_min_ext = 0.85 - #coverage = str(fname_bam).split("coverage~")[1].split("__")[0] - #if float(coverage) > 200: + # coverage = str(fname_bam).split("coverage~")[1].split("__")[0] + # if float(coverage) > 200: # exclude_non_var_pos_threshold = 2 / float(coverage) - #else: - # exclude_non_var_pos_threshold = 1 / float(coverage) + # else: + # exclude_non_var_pos_threshold = 1 / float(coverage) exclude_non_var_pos_threshold = 0.01 read_length = str(fname_bam).split("read_length~")[1].split("__")[0] diff --git a/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_envp_0.001.py b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_envp_0.001.py index 77058b9a..45ef81e9 100644 --- a/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_envp_0.001.py +++ b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_envp_0.001.py @@ -37,11 +37,11 @@ def main( n_mfa_starts = 1 win_min_ext = 0.85 - #coverage = str(fname_bam).split("coverage~")[1].split("__")[0] - #if float(coverage) > 200: + # coverage = 
str(fname_bam).split("coverage~")[1].split("__")[0] + # if float(coverage) > 200: # exclude_non_var_pos_threshold = 2 / float(coverage) - #else: - # exclude_non_var_pos_threshold = 1 / float(coverage) + # else: + # exclude_non_var_pos_threshold = 1 / float(coverage) exclude_non_var_pos_threshold = 0.001 read_length = str(fname_bam).split("read_length~")[1].split("__")[0] @@ -106,7 +106,6 @@ def main( (dname_work / "snv" / "SNVs_0.010000_final.vcf").rename(fname_results_snv) - mypath = (dname_work / "support").resolve() onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))] print("onlyfiles", onlyfiles) @@ -148,4 +147,3 @@ def main( Path(snakemake.output.fname_result_haplos), Path(snakemake.output.dname_work), ) - diff --git a/resources/auxiliary_workflows/benchmark/workflow/scripts/bed_file_single_amplicon.py b/resources/auxiliary_workflows/benchmark/workflow/scripts/bed_file_single_amplicon.py index 187c9192..9ee48f85 100644 --- a/resources/auxiliary_workflows/benchmark/workflow/scripts/bed_file_single_amplicon.py +++ b/resources/auxiliary_workflows/benchmark/workflow/scripts/bed_file_single_amplicon.py @@ -10,7 +10,7 @@ def main(fname_reference, fname_insert_bed, genome_size): f.write( str(seq_id) + " 1 " - + str(int(genome_size)+1) + + str(int(genome_size) + 1) + " scheme_INSERT_1 1 +" ) diff --git a/resources/auxiliary_workflows/benchmark/workflow/scripts/generate_haplotypes.py b/resources/auxiliary_workflows/benchmark/workflow/scripts/generate_haplotypes.py index f5169512..27d361b2 100644 --- a/resources/auxiliary_workflows/benchmark/workflow/scripts/generate_haplotypes.py +++ b/resources/auxiliary_workflows/benchmark/workflow/scripts/generate_haplotypes.py @@ -48,27 +48,43 @@ def generate_haplotype(seq_master, mutation_rate=0, insertion_rate=0, deletion_r deletion_count = int(len(seq_haplotype) * deletion_rate) insertion_count = int(len(seq_haplotype) * insertion_rate) position_list = np.random.choice( - np.asarray([number for number in np.arange(len(seq_haplotype)) if not number in position_list_muts ]), - size=deletion_count+insertion_count, - replace=False + np.asarray( + [ + number + for number in np.arange(len(seq_haplotype)) + if not number in position_list_muts + ] + ), + size=deletion_count + insertion_count, + replace=False, ) position_list = sorted(position_list, reverse=True) # insertions seq_haplotype = list(seq_haplotype) insertion_positions = position_list[:insertion_count] - insertion_list = np.random.choice(BASE_LIST, size=(insertion_count,3)) + insertion_list = np.random.choice(BASE_LIST, size=(insertion_count, 3)) insertion_list = np.asarray(["".join(aa) for aa in insertion_list]) - for count, (insert_pos, insert) in enumerate(zip(insertion_positions, insertion_list)): + for count, (insert_pos, insert) in enumerate( + zip(insertion_positions, insertion_list) + ): # count is 1- based # Insert values along the given axis before the given indices. 
- seq_haplotype.insert(insert_pos+count,insert) + seq_haplotype.insert(insert_pos + count, insert) ground_truth["type"].extend(["insertion"] * insertion_count) - ground_truth["position"].extend([pos -1 for pos in insertion_positions]) # 0-based - ground_truth["variant"].extend([ref+codon for (ref, codon) in zip(seq_master[[pos -1 for pos in insertion_positions]], insertion_list)]) - ground_truth["reference"].extend(seq_master[[pos -1 for pos in insertion_positions]]) - + ground_truth["position"].extend([pos - 1 for pos in insertion_positions]) # 0-based + ground_truth["variant"].extend( + [ + ref + codon + for (ref, codon) in zip( + seq_master[[pos - 1 for pos in insertion_positions]], insertion_list + ) + ] + ) + ground_truth["reference"].extend( + seq_master[[pos - 1 for pos in insertion_positions]] + ) # deletions seq_haplotype = np.asarray(seq_haplotype) @@ -76,11 +92,17 @@ def generate_haplotype(seq_master, mutation_rate=0, insertion_rate=0, deletion_r for delection_pos in deletion_positions: # Return a new array with sub-arrays along an axis deleted. For a one dimensional array, this returns those entries not returned by arr[obj]. - seq_haplotype = np.delete(seq_haplotype, [delection_pos, delection_pos+1, delection_pos+2]) + seq_haplotype = np.delete( + seq_haplotype, [delection_pos, delection_pos + 1, delection_pos + 2] + ) ground_truth["type"].extend(["deletion"]) - ground_truth["position"].extend([delection_pos-2]) - ground_truth["variant"].extend([seq_master[delection_pos-2]]) #delection_pos ? - ground_truth["reference"].extend(["".join(seq_master[delection_pos-2:delection_pos+2])]) + ground_truth["position"].extend([delection_pos - 2]) + ground_truth["variant"].extend( + [seq_master[delection_pos - 2]] + ) # delection_pos ? + ground_truth["reference"].extend( + ["".join(seq_master[delection_pos - 2 : delection_pos + 2])] + ) return "".join(seq_haplotype), pd.DataFrame(ground_truth) diff --git a/resources/auxiliary_workflows/benchmark/workflow/scripts/performance_measures_global.py b/resources/auxiliary_workflows/benchmark/workflow/scripts/performance_measures_global.py index 4f1382f9..e0765739 100644 --- a/resources/auxiliary_workflows/benchmark/workflow/scripts/performance_measures_global.py +++ b/resources/auxiliary_workflows/benchmark/workflow/scripts/performance_measures_global.py @@ -44,7 +44,6 @@ def read_fasta_files(fasta_files, with_method=True): else: posterior = np.nan - if freq is None: freq = props.get("Freq") diff --git a/resources/auxiliary_workflows/benchmark/workflow/scripts/performance_measures_local.py b/resources/auxiliary_workflows/benchmark/workflow/scripts/performance_measures_local.py index 0e30234d..0efbe50c 100644 --- a/resources/auxiliary_workflows/benchmark/workflow/scripts/performance_measures_local.py +++ b/resources/auxiliary_workflows/benchmark/workflow/scripts/performance_measures_local.py @@ -20,7 +20,7 @@ def convert_vcf(fname): vcf = VCF(fname) except OSError: return set() - + for variant in VCF(fname): for base in variant.ALT: zero_based_pos = variant.POS - 1 # VCF is 1-based diff --git a/resources/auxiliary_workflows/benchmark/workflow/scripts/shotgun_simulation.py b/resources/auxiliary_workflows/benchmark/workflow/scripts/shotgun_simulation.py index f681c7f8..82c20c6f 100644 --- a/resources/auxiliary_workflows/benchmark/workflow/scripts/shotgun_simulation.py +++ b/resources/auxiliary_workflows/benchmark/workflow/scripts/shotgun_simulation.py @@ -18,7 +18,7 @@ def simulate_illumina(fname_haplotype, coverage_haplotype, read_length, art_pref 
subprocess.run( [ "art_illumina", "-sam", # this alignment fails when we simulate indels in the haplotypes "--paired", "-m", str( @@ -285,25 +285,36 @@ def main( insertion_rate = float(params["haplos"].split("@")[-3]) # bwa is not able to align the simulated indels correctly - if (insertion_rate>0) or (deletion_rate>0): - subprocess.run(["rm", fname_bam]) - # align reads - subprocess.run( - [ - "minimap2", - "-ax", - "sr", - "--secondary=no", - str(dname_work.resolve())[:-4] +"reference.fasta", - str(fname_fastq.resolve()), - "-o", - str(art_prefix) + ".sam", - ] - ) - - subprocess.run(["samtools", "sort", str(art_prefix) + ".sam", "-o", str(art_prefix) + ".sam"]) - - subprocess.run(["samtools", "view", "-bS", str(art_prefix) + ".sam", "-o", fname_bam]) + if (insertion_rate > 0) or (deletion_rate > 0): + subprocess.run(["rm", fname_bam]) + # align reads + subprocess.run( + [ + "minimap2", + "-ax", + "sr", + "--secondary=no", + str(dname_work.resolve())[:-4] + "reference.fasta", + str(fname_fastq.resolve()), + "-o", + str(art_prefix) + ".sam", + ] + ) + + subprocess.run( + [ + "samtools", + "sort", + str(art_prefix) + ".sam", + "-o", + str(art_prefix) + ".sam", + ] + ) + + subprocess.run( + ["samtools", "view", "-bS", str(art_prefix) + ".sam", "-o", fname_bam] + ) + if __name__ == "__main__": main( diff --git a/workflow/scripts/file_parser.py b/workflow/scripts/file_parser.py index 2acbb904..27360ab0 100755 --- a/workflow/scripts/file_parser.py +++ b/workflow/scripts/file_parser.py @@ -202,7 +202,7 @@ def main(): datefmt = None if args.config_file: with open(args.config_file, "r") as stream: - yaml = ruamel.yaml.YAML(typ='rt') + yaml = ruamel.yaml.YAML(typ="rt") reg_data = yaml.load(stream) if "sample" in reg_data: rxsam = regex.compile(reg_data.get("sample"))
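For context on the final `file_parser.py` hunk: `ruamel.yaml.YAML(typ="rt")` selects ruamel's round-trip loader, which preserves comments and key order should the config ever be dumped back out. A minimal standalone sketch of the load-and-compile pattern; the inline config text and the `sample` pattern value are invented for illustration:

```python
# Sketch of the round-trip YAML loading pattern from file_parser.py.
# The config content below is made-up example data.
import io

import regex  # third-party 'regex' package, a superset of the stdlib 're'
import ruamel.yaml

config_text = "sample: '(?P<sample_id>sample_\\d+)'\n"

yaml = ruamel.yaml.YAML(typ="rt")  # round-trip mode: keeps comments/order on dump
reg_data = yaml.load(io.StringIO(config_text))  # stands in for the file stream

if "sample" in reg_data:
    rxsam = regex.compile(reg_data.get("sample"))  # pre-compile the sample pattern
    match = rxsam.match("sample_001")
    print(match.group("sample_id") if match else "no match")  # -> sample_001
```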