From 5a108114f4045d314f272d8eaf468bfa4a60cf9f Mon Sep 17 00:00:00 2001
From: Eric Kofman <ekofman@tscc-13-2.tscc.sdsc.edu>
Date: Sat, 30 Nov 2024 18:13:53 -0800
Subject: [PATCH] adjustments for error catching

---
 marine.py                      | 25 +++++++++++-----
 src/utils.py                   | 53 +++++++++++++++++++---------------
 tests/integration_tests_run.sh |  2 +-
 3 files changed, 49 insertions(+), 31 deletions(-)

diff --git a/marine.py b/marine.py
index 3e85b43..a17b1a8 100755
--- a/marine.py
+++ b/marine.py
@@ -149,6 +149,15 @@ def split_bed_file(input_bed_file, output_folder, bam_filepaths, output_suffix='
     """
     Split a BED file into multiple files based on suffixes in the first column.
     Each line is assigned to the appropriate file based on the suffix.
+
+    e.g.:
+    
+    10_AAACGAAAGTCACACT-1   6143263         6143264
+    10_AAACGAAAGTCACACT-1   11912575        11912576
+    10_AAACGAAAGTCACACT-1   12209751        12209752
+    10_AAACGAAAGTCACACT-1   13320235        13320236
+    10_AAACGAAAGTCACACT-1   27036085        27036086
+
     """
     if not os.path.exists(output_folder):
         os.makedirs(output_folder)
@@ -171,9 +180,10 @@ def split_bed_file(input_bed_file, output_folder, bam_filepaths, output_suffix='
             for line in infile:
                 # Parse the first column to determine the suffix
                 columns = line.split()
+                
                 chrom = columns[0]  # Assuming the first column is the chromosome
                 for prefix, suffix in suffix_pairs:
-                    if chrom.startswith(prefix) and chrom.endswith(suffix):
+                    if chrom.startswith(f"{prefix}_") and chrom.endswith(suffix):
                         file_handles[prefix + suffix].write(line)
                         break
 
@@ -195,8 +205,9 @@ def generate_depths(output_folder, bam_filepaths, paired_end=False, barcode_tag=
         "awk 'NR > 1 {{print $2, $4-1, $4}}' OFS='\t' \"$file\"; "
         "done | sort -k1,1 -k2,2n -u > {}/combined_source_cells.bed;"
     ).format(output_folder, output_folder)
-    run_command(combine_edit_sites_command)
 
+    if not os.path.exists(f'{output_folder}/combined_source_cells.bed'):
+        run_command(combine_edit_sites_command)
     
     all_depth_commands.append(combine_edit_sites_command)
 
@@ -447,11 +458,11 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand
             print("Deleting overall_label_to_list_of_contents...")
             del overall_label_to_list_of_contents
 
-    
-    with open('{}/manifest.txt'.format(logging_folder), 'a+') as f:
-        f.write(f'total_reads_processed\t{overall_total_reads_processed}\n') 
-        for k, v in overall_counts_summary_df.items():
-            f.write(f'{k}\t{v}\n') 
+    if not coverage_only:
+        with open('{}/manifest.txt'.format(logging_folder), 'a+') as f:
+            f.write(f'total_reads_processed\t{overall_total_reads_processed}\n') 
+            for k, v in overall_counts_summary_df.items():
+                f.write(f'{k}\t{v}\n') 
 
         f.write(f'edits per read (EPR)\t{overall_counts_summary_df.get("total_edits")/overall_total_reads_processed}\n')
 
diff --git a/src/utils.py b/src/utils.py
index 7347c14..d005bde 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -665,10 +665,10 @@ def generate_and_run_bash_merge(output_folder, file1_path, file2_path, output_fi
     bash_command = f"""#!/bin/bash
     # Step 1: Adjust the depths file by adding a new column that incorporates both contig and position
     # for join purposes, and sort by this column. Output in tab-separated format.
-    awk -v OFS='\\t' '{{print $1, $2+{position_adjustment}, $3, $1":"($2+{position_adjustment})}}' "{file2_path}" | sort -k4,4n | uniq > {output_folder}/depth_modified.tsv
+    awk -v OFS='\\t' '{{print $1, $2+{position_adjustment}, $3, $1":"($2+{position_adjustment})}}' "{file2_path}" | sort -k4,4V | uniq > {output_folder}/depth_modified.tsv
     
     # Step 2: Sort the first file numerically by the join column (the column incuding both contig and position information)
-    sort -k3,3n "{file1_path}" | uniq > {output_folder}/final_edit_info_no_coverage_sorted.tsv
+    sort -k3,3V "{file1_path}" | uniq > {output_folder}/final_edit_info_no_coverage_sorted.tsv
 
     # Step 3: Join the files on the specified columns, output all columns, and select necessary columns with tab separation
 join -1 3 -2 4 -t $'\\t' {output_folder}/final_edit_info_no_coverage_sorted.tsv {output_folder}/depth_modified.tsv | awk -v OFS='\\t' '{{print $2, $3, $4, $5, $6, $7, $8, $11}}' > "{output_file_path}"
@@ -822,27 +822,34 @@ def calculate_coverage(bam_filepath, bed_filepath, output_filepath, output_matri
             chrom, start, end = fields[0], int(fields[1]), int(fields[2])
             regions.append((chrom, start, end))
 
-    print(f"\t{len(regions)} regions")
-    
-    with pysam.AlignmentFile(bam_filepath, "rb") as bam, open(depths_file, "w") as out:
-        for chrom, start, end in regions:
-            # Get coverage, applying base quality filter
-            coverage = bam.count_coverage(
-                chrom, start, end, quality_threshold=0  # Mimics -Q 13 in samtools depth
-            )
-
-            # Calculate total coverage for each position and include zero-coverage positions
-            for pos in range(start, end):
-                total_coverage = sum(base[pos - start] for base in coverage)
-                out.write(f"{chrom}\t{pos}\t{total_coverage}\n")
-
-
-    if output_matrix_folder:
-        matrix_output_file = os.path.join(output_matrix_folder, f"{os.path.basename(depths_file).split('.')[0]}_matrix.tsv")
-    
-        if not os.path.exists(matrix_output_file) or os.path.getsize(matrix_output_file) == 0:
-            # Pivot the depths output file into a matrix
-            pivot_depths_output(depths_file, matrix_output_file)
+    if len(regions) > 0:
+        print(f"\t{len(regions)} regions")
+        
+        with pysam.AlignmentFile(bam_filepath, "rb") as bam, open(depths_file, "w") as out:
+            try:
+                for chrom, start, end in regions:
+                    # Get coverage, applying base quality filter
+                    coverage = bam.count_coverage(
+                        chrom, start, end, quality_threshold=0  # Mimics -Q 13 in samtools depth
+                    )
+        
+                    # Calculate total coverage for each position and include zero-coverage positions
+                    for pos in range(start, end):
+                        total_coverage = sum(base[pos - start] for base in coverage)
+                        out.write(f"{chrom}\t{pos}\t{total_coverage}\n")
+            except Exception as e:
+                # Return the exception and the arguments for debugging
+                raise RuntimeError(
+                        f"Error processing bam {bam_filepath} using bed {bed_filepath} at {chrom}:{start}-{end}, "
+                        f"outputting to {depths_file}: {e}"
+                    )
+                    
+        if output_matrix_folder:
+            matrix_output_file = os.path.join(output_matrix_folder, f"{os.path.basename(depths_file).split('.')[0]}_matrix.tsv")
+        
+            if not os.path.exists(matrix_output_file) or os.path.getsize(matrix_output_file) == 0:
+                # Pivot the depths output file into a matrix
+                pivot_depths_output(depths_file, matrix_output_file)
 
 
 def run_pysam_count_coverage(args_list, processes):
diff --git a/tests/integration_tests_run.sh b/tests/integration_tests_run.sh
index f7ab6be..3d10906 100755
--- a/tests/integration_tests_run.sh
+++ b/tests/integration_tests_run.sh
@@ -34,7 +34,7 @@ echo "SC tests scripts"
 ls -lh $MARINE/tests/$tests_folder/scripts/
 
 
-for t in "only_5_cells_test" "only_5_cells_bulk_mode_test" "long_read_sc_test" "edge_case_test" "edge_case_dist_filter_test"
+for t in "only_5_cells_test" "only_5_cells_bulk_mode_test" "only_5_cells_all_cells_coverage_test" "long_read_sc_test" "edge_case_test" "edge_case_dist_filter_test"
 
 do
     echo $t