From bbfe6047dcbd72ae7477c718bfe227d0ea254ced Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Thu, 18 Apr 2024 15:44:28 +0200 Subject: [PATCH] update method defintions as run on cluster --- .../resources/method_definitions/viloca.py | 2 +- .../viloca_alpha_0.00001.py | 17 ++--- .../method_definitions/viloca_envp.py | 25 ++++---- .../{viloca_ewm.py => viloca_envp_0.001.py} | 63 ++++++++++++++----- .../method_definitions/viloca_ewm_uni.py | 8 +-- .../viloca_learn_error_params.py | 10 +-- .../method_definitions/viloca_uniform.py | 10 +-- 7 files changed, 86 insertions(+), 49 deletions(-) rename resources/auxiliary_workflows/benchmark/resources/method_definitions/{viloca_ewm.py => viloca_envp_0.001.py} (54%) diff --git a/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca.py b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca.py index 007bc68e..748fdf06 100644 --- a/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca.py +++ b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca.py @@ -1,7 +1,7 @@ # GROUP: global # CONDA: libshorah # CONDA: biopython = 1.79 -# PIP: git+https://github.com/LaraFuhrmann/shorah@master +# PIP: git+https://github.com/cbg-ethz/VILOCA@master import subprocess from pathlib import Path diff --git a/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_alpha_0.00001.py b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_alpha_0.00001.py index e65d84a5..6830f7ca 100644 --- a/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_alpha_0.00001.py +++ b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_alpha_0.00001.py @@ -1,9 +1,9 @@ # GROUP: global # CONDA: libshorah -# CONDA: pyvcf +# CONDA: fuc # CONDA: biopython = 1.79 # PIP: pandas -# PIP: git+https://github.com/LaraFuhrmann/shorah@master +# PIP: git+https://github.com/cbg-ethz/VILOCA@master import subprocess from pathlib import Path @@ -63,8 +63,8 @@ def main( if fname_insert_bed == []: subprocess.run( [ - "shorah", - "shotgun", + "viloca", + "run", "-b", fname_bam.resolve(), "-f", @@ -88,8 +88,8 @@ def main( # insert bed file is there subprocess.run( [ - "shorah", - "shotgun", + "viloca", + "run", "-b", fname_bam.resolve(), "-f", @@ -115,8 +115,8 @@ def main( # here are all snvs regardless their posterior: (dname_work / "snv" / "SNVs_0.010000_final.vcf") # filter out snvs with low posterior, threshold = 0.9 as default in shorah - fname_vcf_viloca = (dname_work / "snv" / "SNVs_0.010000_final.vcf").resolve() - filter_snvs(fname_vcf_viloca, fname_results_snv, posterior_threshold = 0.9) + fname_vcf_viloca = str(Path(dname_work / "snv" / "SNVs_0.010000_final.vcf").resolve()) + filter_snvs(fname_vcf_viloca, str(Path(fname_results_snv).resolve()), posterior_threshold = 0.9) mypath = (dname_work / "support").resolve() onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))] @@ -161,3 +161,4 @@ def main( Path(snakemake.output.fname_result_haplos), Path(snakemake.output.dname_work), ) + 165,1 Bot diff --git a/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_envp.py b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_envp.py index 2f4aa76d..35288af3 100644 --- a/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_envp.py +++ b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_envp.py @@ -1,7 +1,7 @@ # GROUP: global # CONDA: libshorah # CONDA: biopython = 1.79 -# PIP: git+https://github.com/LaraFuhrmann/shorah@master +# PIP: git+https://github.com/cbg-ethz/VILOCA@master import subprocess from pathlib import Path @@ -32,17 +32,18 @@ def main( dname_work, ): genome_size = str(fname_bam).split("genome_size~")[1].split("__coverage")[0] - alpha = 0.000001 - n_max_haplotypes = 500 + alpha = 0.00001 + n_max_haplotypes = 100 n_mfa_starts = 1 win_min_ext = 0.85 - coverage = str(fname_bam).split("coverage~")[1].split("__")[0] - if float(coverage) > 200: - exclude_non_var_pos_threshold = 2 / float(coverage) - else: - exclude_non_var_pos_threshold = 1 / float(coverage) + #coverage = str(fname_bam).split("coverage~")[1].split("__")[0] + #if float(coverage) > 200: + # exclude_non_var_pos_threshold = 2 / float(coverage) + #else: + # exclude_non_var_pos_threshold = 1 / float(coverage) + exclude_non_var_pos_threshold = 0.01 read_length = str(fname_bam).split("read_length~")[1].split("__")[0] if read_length == "Ten_strain_IAV": sampler = "learn_error_params" @@ -54,8 +55,8 @@ def main( if fname_insert_bed == []: subprocess.run( [ - "shorah", - "shotgun", + "viloca", + "run", "-b", fname_bam.resolve(), "-f", @@ -79,8 +80,8 @@ def main( # insert bed file is there subprocess.run( [ - "shorah", - "shotgun", + "viloca", + "run", "-b", fname_bam.resolve(), "-f", diff --git a/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_ewm.py b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_envp_0.001.py similarity index 54% rename from resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_ewm.py rename to resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_envp_0.001.py index b7d2d0e4..77058b9a 100644 --- a/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_ewm.py +++ b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_envp_0.001.py @@ -1,7 +1,7 @@ # GROUP: global # CONDA: libshorah # CONDA: biopython = 1.79 -# PIP: git+https://github.com/LaraFuhrmann/shorah@master +# PIP: git+https://github.com/cbg-ethz/VILOCA@master import subprocess from pathlib import Path @@ -30,14 +30,20 @@ def main( fname_results_snv, fname_result_haplos, dname_work, - threads, ): genome_size = str(fname_bam).split("genome_size~")[1].split("__coverage")[0] - alpha = 0.000001 - n_max_haplotypes = 500 + alpha = 0.00001 + n_max_haplotypes = 100 n_mfa_starts = 1 win_min_ext = 0.85 + #coverage = str(fname_bam).split("coverage~")[1].split("__")[0] + #if float(coverage) > 200: + # exclude_non_var_pos_threshold = 2 / float(coverage) + #else: + # exclude_non_var_pos_threshold = 1 / float(coverage) + + exclude_non_var_pos_threshold = 0.001 read_length = str(fname_bam).split("read_length~")[1].split("__")[0] if read_length == "Ten_strain_IAV": sampler = "learn_error_params" @@ -49,8 +55,8 @@ def main( if fname_insert_bed == []: subprocess.run( [ - "shorah", - "shotgun", + "viloca", + "run", "-b", fname_bam.resolve(), "-f", @@ -65,9 +71,8 @@ def main( str(n_mfa_starts), "--win_min_ext", str(win_min_ext), - "--threads", - str(threads), - "--extended_window_mode", + "--exclude_non_var_pos_threshold", + str(exclude_non_var_pos_threshold), ], cwd=dname_work, ) @@ -93,15 +98,45 @@ def main( str(n_mfa_starts), "--win_min_ext", str(win_min_ext), - "--threads", - str(threads), - "--extended_window_mode", + "--exclude_non_var_pos_threshold", + str(exclude_non_var_pos_threshold), ], cwd=dname_work, ) (dname_work / "snv" / "SNVs_0.010000_final.vcf").rename(fname_results_snv) - open(fname_result_haplos, "a").close() + + + mypath = (dname_work / "support").resolve() + onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))] + print("onlyfiles", onlyfiles) + for file in onlyfiles: + if "reads-support.fas" in file: + file_name = onlyfiles[0] + fname_haplos = (dname_work / "support" / onlyfiles[0]).resolve() + if file.endswith(".gz"): + fname_zipped = (dname_work / "support" / onlyfiles[0]).resolve() + fname_haplos = onlyfiles[0].split(".gz")[0] + fname_unzipped = (dname_work / "support" / fname_haplos).resolve() + # unzip + gunzip(fname_zipped, fname_result_haplos) + + elif file.endswith(".fas"): + fname_haplos = (dname_work / "support" / onlyfiles[0]).resolve() + (dname_work / "support" / file).rename(fname_result_haplos) + + # fix frequency information + + freq_list = [] + for record in SeqIO.parse(fname_result_haplos, "fasta"): + freq_list.append(float(record.description.split("ave_reads=")[-1])) + norm_freq_list = [float(i) / sum(freq_list) for i in freq_list] + + record_list = [] + for idx, record in enumerate(SeqIO.parse(fname_result_haplos, "fasta")): + record.description = f"freq:{norm_freq_list[idx]}" + record_list.append(record) + SeqIO.write(record_list, fname_result_haplos, "fasta") if __name__ == "__main__": @@ -112,5 +147,5 @@ def main( Path(snakemake.output.fname_result), Path(snakemake.output.fname_result_haplos), Path(snakemake.output.dname_work), - snakemake.threads, ) + 149,1 Bot diff --git a/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_ewm_uni.py b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_ewm_uni.py index c9996485..b7c55b7d 100644 --- a/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_ewm_uni.py +++ b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_ewm_uni.py @@ -1,7 +1,7 @@ # GROUP: global # CONDA: libshorah # CONDA: biopython = 1.79 -# PIP: git+https://github.com/LaraFuhrmann/shorah@master +# PIP: git+https://github.com/cbg-ethz/VILOCA@master import subprocess from pathlib import Path @@ -34,7 +34,7 @@ def main( ): genome_size = str(fname_bam).split("genome_size~")[1].split("__coverage")[0] alpha = 0.000001 - n_max_haplotypes = 500 + n_max_haplotypes = 100 n_mfa_starts = 1 win_min_ext = 0.85 @@ -48,8 +48,8 @@ def main( dname_work.mkdir(parents=True, exist_ok=True) subprocess.run( [ - "shorah", - "shotgun", + "viloca", + "run", "-b", fname_bam.resolve(), "-f", diff --git a/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_learn_error_params.py b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_learn_error_params.py index 4f8902f0..384ebc44 100644 --- a/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_learn_error_params.py +++ b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_learn_error_params.py @@ -1,7 +1,7 @@ # GROUP: global # CONDA: libshorah # CONDA: biopython = 1.79 -# PIP: git+https://github.com/LaraFuhrmann/shorah@master +# PIP: git+https://github.com/cbg-ethz/VILOCA@master import subprocess from pathlib import Path @@ -44,8 +44,8 @@ def main( if fname_insert_bed == []: subprocess.run( [ - "shorah", - "shotgun", + "viloca", + "run", "-b", fname_bam.resolve(), "-f", @@ -67,8 +67,8 @@ def main( # insert bed file is there subprocess.run( [ - "shorah", - "shotgun", + "viloca", + "run", "-b", fname_bam.resolve(), "-f", diff --git a/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_uniform.py b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_uniform.py index 94e805e9..c9a8329f 100644 --- a/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_uniform.py +++ b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_uniform.py @@ -1,7 +1,7 @@ # GROUP: global # CONDA: libshorah # CONDA: biopython = 1.79 -# PIP: git+https://github.com/LaraFuhrmann/shorah@master +# PIP: git+https://github.com/cbg-ethz/VILOCA@master import subprocess from pathlib import Path @@ -32,8 +32,8 @@ def main( dname_work, ): genome_size = str(fname_bam).split("genome_size~")[1].split("__coverage")[0] - alpha = 0.000001 - n_max_haplotypes = 500 + alpha = 0.00001 + n_max_haplotypes = 100 n_mfa_starts = 1 win_min_ext = 0.85 @@ -47,8 +47,8 @@ def main( dname_work.mkdir(parents=True, exist_ok=True) subprocess.run( [ - "shorah", - "shotgun", + "viloca", + "run", "-b", fname_bam.resolve(), "-f",