Commit

Keep the linter happy
DrYak committed Jun 7, 2024
1 parent 04ca094 commit 9ab7264
Showing 23 changed files with 401 additions and 225 deletions.
@@ -1,27 +1,29 @@
# Local haplotype reconstruction benchmark

This repository stores the scripts and notebooks used to conduct the benchmark study presented in the manuscript: XXX

To reproduce the benchmark study and create the figures presented in the manuscript, use the following instructions:
1. Clone the repository of V-pipe 3.0 into your working directory:
```shell
git clone https://github.com/cbg-ethz/V-pipe.git
```
2. Go into the directory of the benchmarking study for the local haplotype reconstruction:
```shell
cd V-pipe/resources/auxiliary_workflows/benchmark/resources/local_haplo_setup
```
3. The parameters needed to reproduce the synthetic datasets are defined in the parameter files `config_xxx/params.csv`, together with the configuration files `config_xxx/config.yaml`, where the simulation mode, the number of replicates and the methods to be executed are specified (see the first sketch after this list).
4. The methods to be executed must be defined as Python scripts in this directory (see the second sketch after this list):
```shell
V-pipe/resources/auxiliary_workflows/benchmark/resources/method_definitions
```
5. Now that the workflow is ready, go back to the directory:
```shell
cd V-pipe/resources/auxiliary_workflows/benchmark/resources/local_haplo_setup
```
6. To install the required Conda environments, execute:
```shell
snakemake --conda-create-envs-only --use-conda -c1
```
7. To submit the workflow to an LSF cluster, execute `./run_workflow.sh`; otherwise run the workflow with `snakemake --use-conda -c1`.
8. The workflow writes its results to the directory `results`.
9. Once the workflow has finished and all result files have been generated, the figures from the manuscript can be reproduced by executing the notebooks in `workflow/notebooks/`.
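For step 3, a minimal sketch of inspecting one configuration before launching the workflow; the YAML keys and CSV columns used below are assumptions for illustration, not names taken from the repository:

```python
# Hypothetical sketch: peek at the simulation settings of one configuration.
# The keys "simulation_mode", "replicates" and "methods" are assumed names;
# check the actual config_xxx/config.yaml and config_xxx/params.csv files.
from pathlib import Path

import pandas as pd
import yaml

config_dir = Path("config_xxx")  # placeholder for one of the config_* directories

with open(config_dir / "config.yaml") as fh:
    config = yaml.safe_load(fh)

print(config.get("simulation_mode"))  # assumed key
print(config.get("replicates"))       # assumed key
print(config.get("methods"))          # assumed key: list of method scripts to run

params = pd.read_csv(config_dir / "params.csv")
print(params.head())  # one row per simulated dataset parameter combination
```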

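For step 4, a more speculative sketch of what a method-definition script might look like, loosely modelled on the wrapper scripts touched by this commit (input and output paths come from the Snakemake `snakemake` object and the external tool is launched with `subprocess.run(..., check=True)`); the function signature, attribute names and command line below are purely illustrative:

```python
# Hypothetical method-definition skeleton; the real scripts in
# method_definitions/ define how each tool is run by the benchmark workflow.
import subprocess
from pathlib import Path


def main(fname_bam, fname_reference, fname_result, dname_work):
    dname_work = Path(dname_work)
    dname_work.mkdir(parents=True, exist_ok=True)

    # Run the haplotype caller of interest (placeholder command line).
    subprocess.run(
        [
            "my_haplotype_caller",  # placeholder executable
            "--bam", str(fname_bam),
            "--ref", str(fname_reference),
            "--out", str(dname_work / "output.vcf"),
        ],
        check=True,
    )

    # Move the output to the location expected by the workflow.
    (dname_work / "output.vcf").rename(fname_result)


if __name__ == "__main__":
    main(
        snakemake.input.fname_bam,        # assumed input name
        snakemake.input.fname_reference,  # assumed input name
        snakemake.output.fname_result,    # assumed output name
        snakemake.params.dname_work,      # assumed working-directory parameter
    )
```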
@@ -27,10 +27,12 @@ def convert_vcf(fname):
variant_list.add(f"{zero_based_pos}{base}")
return variant_list


def convert_groundtruth(fname):
df = pd.read_csv(fname, index_col=0)
return set((df["position"].astype(str) + df["variant"]).tolist())


def mutation_calls_details(vcf_list, groundtruth_list):
# compute performance
tmp = []
@@ -45,11 +47,16 @@ def mutation_calls_details(vcf_list, groundtruth_list):
predicted_variants = convert_vcf(fname_vcf)

# iter through ground truth mutations
set_true_variants = [
str(gt_row["position"]) + gt_row["variant"]
for iter_row, gt_row in pd.read_csv(
fname_groundtruth, index_col=0
).iterrows()
]
for iter_row, gt_row in pd.read_csv(fname_groundtruth, index_col=0).iterrows():
true_variant = str(gt_row["position"]) + gt_row["variant"]
mutation_type = str(gt_row["type"])
if method.startswith(("lofreq", "shorah", "viloca")):
is_false_negative = True
for variant in VCF(fname_vcf):
for idx_base, base in enumerate(variant.ALT):
@@ -59,14 +66,20 @@ def mutation_calls_details(vcf_list, groundtruth_list):
if true_variant == predicted_variant:
is_false_negative = False
# get pred_freq
if method.startswith("lofreq"):
posterior = 1.0
freq = variant.INFO.get("AF") # vi.split(',')[idx_base]
elif method.startswith(("shorah", "viloca")):
if mutation_type != "insertion":
freq = (
int(variant.INFO.get("Fvar"))
+ int(variant.INFO.get("Rvar"))
) / (
int(variant.INFO.get("Ftot"))
+ int(variant.INFO.get("Rtot"))
)
elif mutation_type == "insertion":
freq = float(variant.INFO.get("Freq1"))

tmp.append(
{
@@ -125,6 +138,7 @@ def main(vcf_list, groundtruth_list, fname_out):
df = mutation_calls_details(vcf_list, groundtruth_list)
df.to_csv(fname_out)


if __name__ == "__main__":
main(
[Path(e) for e in snakemake.input.vcf_list],
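Throughout these comparison scripts, calls are matched by a simple key: the zero-based position concatenated with the alternative base (VCF positions are 1-based, hence the `- 1`). A small illustration of that convention with made-up values:

```python
# Illustration of the variant-key convention used above: zero-based position
# plus the alternative base, e.g. VCF position 42 with ALT "T" -> "41T".
def variant_key(vcf_pos, alt_base):
    zero_based_pos = int(vcf_pos) - 1  # VCF is 1-based
    return f"{zero_based_pos}{alt_base}"


true_variants = {"41T", "99A"}     # made-up ground-truth keys (position + variant)
predicted = variant_key(42, "T")   # -> "41T"
print(predicted in true_variants)  # True: counted as a recovered variant
```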
@@ -1,6 +1,7 @@
from pathlib import Path
import pandas as pd


def runtime(benchmark_list, fname_out):
# gather benchmark information
tmp = []
@@ -27,10 +27,12 @@ def convert_vcf(fname):
variant_list.add(f"{zero_based_pos}{base}")
return variant_list


def convert_groundtruth(fname):
df = pd.read_csv(fname, index_col=0)
return set((df["position"].astype(str) + df["variant"]).tolist())


def mutation_calls_details(vcf_list, groundtruth_list):
# compute performance
tmp = []
@@ -48,7 +50,7 @@ def mutation_calls_details(vcf_list, groundtruth_list):
for iter_row, gt_row in pd.read_csv(fname_groundtruth, index_col=0).iterrows():
true_variant = str(gt_row["position"]) + gt_row["variant"]
mutation_type = str(gt_row["type"])
if method.startswith(("lofreq", "shorah")):
is_false_negative = True
for variant in VCF(fname_vcf):
for idx_base, base in enumerate(variant.ALT):
@@ -58,12 +60,18 @@ def mutation_calls_details(vcf_list, groundtruth_list):
if true_variant == predicted_variant:
is_false_negative = False
# get pred_freq
if method.startswith("lofreq"):
posterior = 1.0
freq = variant.INFO.get("AF") # vi.split(',')[idx_base]
elif method.startswith("shorah"):
posterior = 1.0
freq = (
int(variant.INFO.get("Fvar"))
+ int(variant.INFO.get("Rvar"))
) / (
int(variant.INFO.get("Ftot"))
+ int(variant.INFO.get("Rtot"))
)

tmp.append(
{
@@ -99,23 +107,41 @@ def mutation_calls_details(vcf_list, groundtruth_list):
}
)

elif method.startswith("viloca"):

colnames = [
"Chromosome",
"Pos",
"Ref",
"Alt",
"Frq",
"Pst",
"Rvar",
"Fvar",
"Rtot",
"Ftot",
"Qval",
]
# we want the file SNVs_0.010000.tsv as the posterior filter was not applied here
fname_SNVs_correct = (
str(fname_vcf).split("snvs_.vcf")[0] + "work/snv/SNVs_0.010000.tsv"
)
df_predicted = pd.read_csv(
fname_SNVs_correct, names=colnames, header=None, sep="\t"
)
df_predicted["Pst"] = pd.to_numeric(
df_predicted["Pst"], errors="coerce"
)

for iter_row_pred, pred_row in df_predicted.iterrows():
zero_based_pos = int(pred_row["Pos"]) - 1 # VCF is 1-based
is_false_negative = True
predicted_variant = f"{zero_based_pos}{pred_row['Alt']}"

if true_variant == predicted_variant:
is_false_negative = False
posterior = pred_row["Pst"]
freq = pred_row["Frq"]

tmp.append(
{
@@ -158,6 +184,7 @@ def main(vcf_list, groundtruth_list, fname_out):
df = mutation_calls_details(vcf_list, groundtruth_list)
df.to_csv(fname_out)


if __name__ == "__main__":
main(
[Path(e) for e in snakemake.input.vcf_list],
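For ShoRAH and VILOCA calls, the frequency above is reconstructed from the strand-specific read counts in the VCF INFO field: variant-supporting reads (forward plus reverse) divided by the total reads (forward plus reverse). A toy example with made-up counts:

```python
# Toy example of the (Fvar + Rvar) / (Ftot + Rtot) frequency used above,
# with made-up read counts instead of values from variant.INFO.
info = {"Fvar": 12, "Rvar": 8, "Ftot": 100, "Rtot": 100}

freq = (int(info["Fvar"]) + int(info["Rvar"])) / (int(info["Ftot"]) + int(info["Rtot"]))
print(freq)  # (12 + 8) / (100 + 100) = 0.1
```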
@@ -18,11 +18,12 @@ def convert_vcf(df):
variant_list = set()

for idx_row, variant in df.iterrows():
for base in variant["Alt"]:
zero_based_pos = int(variant["Pos"]) - 1 # VCF is 1-based
variant_list.add(f"{zero_based_pos}{base}")
return variant_list


def convert_groundtruth(fname):
df = pd.read_csv(fname, index_col=0)
return set((df["position"].astype(str) + df["variant"]).tolist())
@@ -76,20 +77,38 @@ def compute_performance(true_variants, predicted_variants):

def performance_plots(vcf_list, groundtruth_list, posterior_threshold):
# compute performance
colnames = [
"Chromosome",
"Pos",
"Ref",
"Alt",
"Frq",
"Pst",
"Rvar",
"Fvar",
"Rtot",
"Ftot",
"Qval",
]
tmp = []
fps_tmp = []
for fname_vcf, fname_groundtruth in zip(vcf_list, groundtruth_list):

# we want the file SNVs_0.010000.tsv as the posterior filter was not applied here
fname_SNVs_correct = (
str(fname_vcf).split("snvs_.vcf")[0] + "work/snv/SNVs_0.010000.tsv"
)
if os.path.isfile(fname_SNVs_correct):
df_predicted = pd.read_csv(
fname_SNVs_correct, names=colnames, header=None, sep="\t"
)
else:
fname_SNVs_correct = (
str(fname_vcf).split("snvs_.vcf")[0]
+ "work/snv/SNVs_0.010000_final.csv"
)
df_predicted = pd.read_csv(fname_SNVs_correct)
df_predicted = df_predicted.rename(columns={"Var": "Alt", "Pst2": "Pst"})

parts = str(fname_vcf).split("/")

@@ -98,11 +117,10 @@ def performance_plots(vcf_list, groundtruth_list, posterior_threshold):
elif len(parts) == 8: # for multi workflow
_, _, _, params, method, _, replicate, _ = parts


# filter dataframe according to posterior_threshold
df_predicted["Pst"] = pd.to_numeric(df_predicted["Pst"], errors="coerce")
df_predicted = df_predicted.dropna(subset=["Pst"])
df_predicted = df_predicted[df_predicted["Pst"] > posterior_threshold]

true_variants = convert_groundtruth(fname_groundtruth)
predicted_variants = convert_vcf(df_predicted)
@@ -143,7 +161,6 @@ def performance_plots(vcf_list, groundtruth_list, posterior_threshold):
return df_perf



def main(vcf_list, groundtruth_list, fname_out):

posterior_thresholds = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
@@ -152,15 +169,14 @@ def main(vcf_list, groundtruth_list, fname_out):
for posterior_threshold in posterior_thresholds:

df_pos_thres = performance_plots(
vcf_list, groundtruth_list, posterior_threshold
)

dfs_tmp.append(df_pos_thres)

pd.concat(dfs_tmp).to_csv(fname_out)


if __name__ == "__main__":
main(
[Path(e) for e in snakemake.input.vcf_list],
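The body of `compute_performance` is not shown in this diff; as a rough sketch under that caveat, a standard set-based precision/recall computation over true and predicted variant keys could look like the following (function name and return values are assumptions):

```python
# Hedged sketch only: the actual compute_performance implementation is not
# part of this diff. This shows one standard way to score predicted variant
# keys against the ground truth after filtering by posterior threshold.
def compute_performance_sketch(true_variants, predicted_variants):
    true_variants = set(true_variants)
    predicted_variants = set(predicted_variants)

    tp = len(true_variants & predicted_variants)  # correctly recovered variants
    fp = len(predicted_variants - true_variants)  # spurious calls
    fn = len(true_variants - predicted_variants)  # missed variants

    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1


print(compute_performance_sketch({"41T", "99A"}, {"41T", "120C"}))  # (0.5, 0.5, 0.5)
```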
@@ -27,7 +27,7 @@ def main(
check=True,
)

runtime_limit = 15 * 24 * 60 * 60 - 60 * 60 # 15 days - 1h
cliquesnv_mode = None
if seq_type == "illumina":
cliquesnv_mode = "snv-illumina"
@@ -27,7 +27,7 @@ def main(
check=True,
)

runtime_limit = 15 * 24 * 60 * 60 - 60 * 60 # 15 days - 1h
cliquesnv_mode = None
if seq_type == "illumina":
cliquesnv_mode = "snv-illumina"