From 3565c6d5a907c5061ca66958992641ee6f23283b Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Fri, 22 Nov 2024 08:48:59 -0800 Subject: [PATCH 01/26] merge --- src/utils.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/src/utils.py b/src/utils.py index 285b558..3ae0570 100644 --- a/src/utils.py +++ b/src/utils.py @@ -8,10 +8,30 @@ import sys from collections import OrderedDict, defaultdict +cb_n = 2 + +def generate_permutations_list_for_CB(n): + """ + Generate all permutations of A, C, G, T for strings of length n + and format the output as a list with "-1" appended to each permutation. + + Output for 2, for example: ['AA-1', 'AC-1', 'AG-1', 'AT-1', 'CA-1', 'CC-1', 'CG-1', 'CT-1', 'GA-1', 'GC-1'...] + Args: + n (int): Length of the strings to generate. + + Returns: + list: A list of strings where each string is a permutation with "-1" appended. + """ + # Generate all combinations of A, C, G, T of length n + combinations = [''.join(p) for p in product('ACGT', repeat=n)] + + # Append "-1" to each permutation + result = [f"{combo}-1" for combo in combinations] + + return result + suffixes = { - 'CB': [ - "A-1", "C-1", "G-1", "T-1" - ], + 'CB': generate_permutations_list_for_CB(cb_n), 'IS': [ "00","01","02","03","04","05","06","07","08","09", "10","11","12","13","14","15","16","17","18","19", @@ -77,7 +97,7 @@ def get_contigs_that_need_bams_written(expected_contigs, split_bams_folder, barc subsets_per_contig[contig_label] += 1 if barcode_tag == 'CB': - number_of_expected_bams = 4 + number_of_expected_bams = cb_n else: number_of_expected_bams = number_of_expected_bams From 60672d4f1849c69d86b0e849b79cfe07b506551a Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Mon, 25 Nov 2024 16:56:00 -0800 Subject: [PATCH 02/26] enable flag for paired end coverage --- marine.py | 40 ++++++++++++++++++---------------------- src/utils.py | 2 +- 2 files changed, 19 insertions(+), 23 deletions(-) diff --git a/marine.py b/marine.py index 059e21f..be016f1 100755 --- a/marine.py +++ b/marine.py @@ -365,7 +365,12 @@ def generate_and_run_bash_merge(output_folder, file1_path, file2_path, output_fi print(f"Merged file saved to {output_file_path}") -def generate_depths_with_samtools(output_folder, bam_filepaths): +def generate_depths_with_samtools(output_folder, bam_filepaths, paired_end=False): + if paired_end: + paired_end_option = '-s ' + else: + paired_end_option = '' + coverage_start_time = time.perf_counter() all_depth_commands = [] @@ -382,8 +387,8 @@ def generate_depths_with_samtools(output_folder, bam_filepaths): for i, bam_filepath in enumerate(bam_filepaths): depth_command = ( "echo 'running samtools depth on {}...';" - "samtools depth -a -b {}/combined.bed {} >> {}/coverage/depths.txt" - ).format(bam_filepath, output_folder, bam_filepath, output_folder) + "samtools depth {}-a -b {}/combined.bed {} >> {}/coverage/depths.txt" + ).format(paired_end_option, bam_filepath, output_folder, bam_filepath, output_folder) all_depth_commands.append(depth_command) depth_bash = '{}/depth_command.sh'.format(output_folder) @@ -556,25 +561,16 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand coverage_subfolder = '{}/coverage'.format(output_folder) make_folder(coverage_subfolder) - if paired_end: - results, total_time, total_seconds_for_contig_df = coverage_processing(output_folder, - barcode_tag=barcode_tag, - paired_end=paired_end, - verbose=verbose, - cores=cores, - number_of_expected_bams=number_of_expected_bams, - 
min_read_quality=min_read_quality, - bam_filepath=bam_filepath - ) - else: - # for bulk single-end or single-cell, we will just concatenate all the edits files into one big bed file, and then run samtools depth - if not barcode_tag: - total_time, total_seconds_for_contig_df = generate_depths_with_samtools(output_folder, [bam_filepath]) - elif barcode_tag: - # for single-end, we want to run the samtools depth command for everyone of the reconfigured bam files - reconfigured_bam_filepaths = glob('{}/split_bams/*/*.bam'.format(output_folder)) - print("Running samtools depth on {} reconfigured bam paths...".format(len(reconfigured_bam_filepaths))) - total_time, total_seconds_for_contig_df = generate_depths_with_samtools(output_folder, reconfigured_bam_filepaths) + # for bulk or paired end, we will just concatenate all the edits files into one big bed file, + # and then run samtools depth + if not barcode_tag: + total_time, total_seconds_for_contig_df = generate_depths_with_samtools(output_folder, [bam_filepath], paired_end=paired_end) + + elif barcode_tag: + # for single-cell, we want to run the samtools depth command for everyone of the reconfigured bam files + reconfigured_bam_filepaths = glob('{}/split_bams/*/*.bam'.format(output_folder)) + print("Running samtools depth on {} reconfigured bam paths...".format(len(reconfigured_bam_filepaths))) + total_time, total_seconds_for_contig_df = generate_depths_with_samtools(output_folder, reconfigured_bam_filepaths) total_seconds_for_contig_df.to_csv("{}/coverage_calculation_timing.tsv".format(logging_folder), sep='\t') diff --git a/src/utils.py b/src/utils.py index 52958b9..371257b 100644 --- a/src/utils.py +++ b/src/utils.py @@ -9,7 +9,7 @@ from collections import OrderedDict, defaultdict from itertools import product -cb_n = 2 +cb_n = 1 def generate_permutations_list_for_CB(n): """ From 7ae9611b64168a43059d14b04a19ba53a2c9a59b Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Mon, 25 Nov 2024 17:19:02 -0800 Subject: [PATCH 03/26] fixes --- marine.py | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/marine.py b/marine.py index be016f1..d8e822c 100755 --- a/marine.py +++ b/marine.py @@ -365,12 +365,32 @@ def generate_and_run_bash_merge(output_folder, file1_path, file2_path, output_fi print(f"Merged file saved to {output_file_path}") -def generate_depths_with_samtools(output_folder, bam_filepaths, paired_end=False): +def get_samtools_depth_commands(paired_end, bam_filepaths, output_folder, output_suffix=''): + all_depth_commands = [] + + if len(output_suffix) > 0: + output_suffix = '_{}'.format(output_suffix) + if paired_end: paired_end_option = '-s ' else: paired_end_option = '' - + + for i, bam_filepath in enumerate(bam_filepaths): + depth_command = ( + "echo 'running samtools depth on {}...';" + "samtools depth {}-a -b {}/combined.bed {} >> {}/coverage/depths{}.txt".format( + bam_filepath, + paired_end_option, + output_folder, + bam_filepath, + output_folder, + output_suffix)) + all_depth_commands.append(depth_command) + return all_depth_commands + +def generate_depths_with_samtools(output_folder, bam_filepaths, paired_end=False): + coverage_start_time = time.perf_counter() all_depth_commands = [] @@ -383,12 +403,9 @@ def generate_depths_with_samtools(output_folder, bam_filepaths, paired_end=False ).format(output_folder, output_folder) all_depth_commands.append(combine_edit_sites_command) - - for i, bam_filepath in enumerate(bam_filepaths): - depth_command = ( - "echo 'running samtools depth on 
{}...';" - "samtools depth {}-a -b {}/combined.bed {} >> {}/coverage/depths.txt" - ).format(paired_end_option, bam_filepath, output_folder, bam_filepath, output_folder) + + samtools_depth_commands = get_samtools_depth_commands(paired_end, bam_filepaths, output_folder, output_suffix='') + for depth_command in samtools_depth_commands: all_depth_commands.append(depth_command) depth_bash = '{}/depth_command.sh'.format(output_folder) From 046e69b83164f3606191f5ec08259a66a734e1f7 Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Mon, 25 Nov 2024 17:41:37 -0800 Subject: [PATCH 04/26] Fix reading of paired end --- marine.py | 74 ++++++++++++++++++++++--------------------------------- 1 file changed, 29 insertions(+), 45 deletions(-) diff --git a/marine.py b/marine.py index d8e822c..12c9702 100755 --- a/marine.py +++ b/marine.py @@ -365,6 +365,22 @@ def generate_and_run_bash_merge(output_folder, file1_path, file2_path, output_fi print(f"Merged file saved to {output_file_path}") +def make_depth_command_script(paired_end, bam_filepaths, output_folder, all_depth_commands=[], output_suffix='', run=False): + samtools_depth_commands = get_samtools_depth_commands(paired_end, bam_filepaths, output_folder, output_suffix='') + for depth_command in samtools_depth_commands: + all_depth_commands.append(depth_command) + + depth_bash = '{}/depth_command.sh'.format(output_folder) + with open(depth_bash, 'w') as f: + for depth_command in all_depth_commands: + f.write('{}\n\n'.format(depth_command)) + + if run: + print("Calculating depths...") + subprocess.run(['bash', depth_bash]) + print("Done calculating depths.") + + def get_samtools_depth_commands(paired_end, bam_filepaths, output_folder, output_suffix=''): all_depth_commands = [] @@ -388,8 +404,9 @@ def get_samtools_depth_commands(paired_end, bam_filepaths, output_folder, output output_suffix)) all_depth_commands.append(depth_command) return all_depth_commands - -def generate_depths_with_samtools(output_folder, bam_filepaths, paired_end=False): + + +def generate_depths(output_folder, bam_filepaths, paired_end=False): coverage_start_time = time.perf_counter() @@ -401,21 +418,10 @@ def generate_depths_with_samtools(output_folder, bam_filepaths, paired_end=False "awk 'NR > 1 {{print $2, $4-1, $4}}' OFS='\t' \"$file\"; " "done | sort -k1,1 -k2,2n -u > {}/combined.bed;" ).format(output_folder, output_folder) - all_depth_commands.append(combine_edit_sites_command) - samtools_depth_commands = get_samtools_depth_commands(paired_end, bam_filepaths, output_folder, output_suffix='') - for depth_command in samtools_depth_commands: - all_depth_commands.append(depth_command) - - depth_bash = '{}/depth_command.sh'.format(output_folder) - with open(depth_bash, 'w') as f: - for depth_command in all_depth_commands: - f.write('{}\n\n'.format(depth_command)) - - print("Calculating depths...") - subprocess.run(['bash', depth_bash]) - print("Done calculating depths.") + make_depth_command_script(paired_end, bam_filepaths, output_folder, + all_depth_commands=all_depth_commands, output_suffix='', run=True) print("Concatenating edit info files...") concatenate_files(output_folder, "edit_info/*edit_info.tsv", "{}/final_edit_info_no_coverage.tsv".format(output_folder)) @@ -437,25 +443,11 @@ def generate_depths_with_samtools(output_folder, bam_filepaths, paired_end=False def get_edits_with_coverage_df(output_folder, - barcode_tag=None, - paired_end=False): + barcode_tag=None): - # For paired end, which was calculated using the pysam coverage functions. 
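Note: coverage is now computed uniformly from samtools depth output, so the
pysam-specific paired-end branch below is dropped; the paired-end mate-overlap
correction instead happens upstream via "samtools depth -s", which counts the
overlapping section of a read pair only once. A minimal sketch of that call,
assuming samtools is on PATH (file names here are placeholders, not paths used
by MARINE):

    import subprocess

    bed, bam = "sites.bed", "sample.bam"  # hypothetical inputs
    with open("depths.txt", "w") as fh:
        # -a reports every position in the BED regions; -s avoids
        # double-counting bases covered by both mates of a read pair.
        subprocess.run(["samtools", "depth", "-s", "-a", "-b", bed, bam],
                       stdout=fh, check=True)
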
- if paired_end: - all_edit_info_unique_position_with_coverage_df = pd.read_csv('{}/final_edit_info.tsv'.format(output_folder), sep='\t', - index_col=0, - names=[ - 'barcode_position_index', 'barcode', 'contig', 'contig_position', 'position', 'ref', 'alt', 'read_id', - 'strand', 'barcode_position', - 'coverage', 'source', 'position_barcode'], dtype={'coverage': int, 'position': int, - 'contig': str}) - - else: - # For bulk single-end or for single-cell, for which coverages were calculated using samtools depth. - all_edit_info_unique_position_with_coverage_df = pd.read_csv('{}/final_edit_info.tsv'.format(output_folder), sep='\t', + all_edit_info_unique_position_with_coverage_df = pd.read_csv('{}/final_edit_info.tsv'.format(output_folder), sep='\t', dtype={'coverage': int, 'position': int, 'contig': str}) - # If there was a barcode specified, then the contigs will have been set to a combination of contig and barcode ID. # For example, we will find a contig to be 9_GATCCCTCAGTAACGG-1, and we will want to simplify it back to simply 9, # as the barcode information is separately already present as it's own column in this dataframe. To ensure code continuity, @@ -578,18 +570,11 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand coverage_subfolder = '{}/coverage'.format(output_folder) make_folder(coverage_subfolder) - # for bulk or paired end, we will just concatenate all the edits files into one big bed file, - # and then run samtools depth - if not barcode_tag: - total_time, total_seconds_for_contig_df = generate_depths_with_samtools(output_folder, [bam_filepath], paired_end=paired_end) - - elif barcode_tag: - # for single-cell, we want to run the samtools depth command for everyone of the reconfigured bam files - reconfigured_bam_filepaths = glob('{}/split_bams/*/*.bam'.format(output_folder)) - print("Running samtools depth on {} reconfigured bam paths...".format(len(reconfigured_bam_filepaths))) - total_time, total_seconds_for_contig_df = generate_depths_with_samtools(output_folder, reconfigured_bam_filepaths) - - + # We want to run the samtools depth command for each of the reconfigured bam files + reconfigured_bam_filepaths = glob('{}/split_bams/*/*.bam'.format(output_folder)) + print("Running samtools depth on {} subset bam paths...".format(len(reconfigured_bam_filepaths))) + total_time, total_seconds_for_contig_df = generate_depths(output_folder, reconfigured_bam_filepaths) + total_seconds_for_contig_df.to_csv("{}/coverage_calculation_timing.tsv".format(logging_folder), sep='\t') pretty_print("Total time to calculate coverage: {} minutes".format(round(total_time/60, 3))) @@ -616,8 +601,7 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand print("Filtering..") all_edit_info_unique_position_with_coverage_df = get_edits_with_coverage_df(output_folder, - barcode_tag=barcode_tag, - paired_end=paired_end) + barcode_tag=barcode_tag) pretty_print("\tNumber of edits after filtering:\n\t{}".format(len(all_edit_info_unique_position_with_coverage_df))) From 65b7b3ad413954471d778df543833b911845f1b6 Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Mon, 25 Nov 2024 17:51:32 -0800 Subject: [PATCH 05/26] reconfiguring modularity --- marine.py | 148 ++------------------------------------------------- src/utils.py | 140 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 143 insertions(+), 145 deletions(-) diff --git a/marine.py b/marine.py index 12c9702..fe3d6c0 100755 --- a/marine.py +++ b/marine.py @@ -9,8 +9,8 @@ import polars 
as pl import psutil import pysam -from scipy.special import betainc import shutil +import subprocess import sys from sys import getsizeof import time @@ -32,7 +32,8 @@ from utils import get_intervals, index_bam, write_rows_to_info_file, write_header_to_edit_info, \ write_read_to_bam_file, remove_file_if_exists, make_folder, concat_and_write_bams_wrapper, \ -pretty_print, read_barcode_whitelist_file, get_contigs_that_need_bams_written +pretty_print, read_barcode_whitelist_file, get_contigs_that_need_bams_written, \ +make_depth_command_script, generate_and_run_bash_merge, get_sailor_sites from core import run_edit_identifier, run_bam_reconfiguration, \ gather_edit_information_across_subcontigs, run_coverage_calculator, generate_site_level_information @@ -232,72 +233,6 @@ def print_marine_logo(): pretty_print("Multi-core Algorithm for Rapid Identification of Nucleotide Edits", style="=") -def calculate_sailor_score(sailor_row): - edits = sailor_row['count'] - num_reads = sailor_row['coverage'] - original_base_counts = num_reads - edits - - cov_margin = 0.01 - alpha, beta = 0, 0 - - num_failures = 0 - failure_messages = [] - try: - edit_frac = float(edits) / float(num_reads) - except Exception as e: - num_failures += 1 - print("Bad Row: {}".format(sailor_row)) - return None - - # calc smoothed counts and confidence - destination_base_smoothed = edits + alpha - origin_base_smoothed = original_base_counts + beta - - ######## MOST IMPORTANT LINE ######## - # calculates the confidence of theta as - # P( theta < cov_margin | A, G) ~ Beta_theta(G, A) - confidence = 1 - betainc(destination_base_smoothed, origin_base_smoothed, cov_margin) - return confidence - - -def get_sailor_sites(final_site_level_information_df, conversion="C>T", skip_coverage=False): - final_site_level_information_df = final_site_level_information_df[final_site_level_information_df['strand_conversion'] == conversion] - - if skip_coverage: - # Case where we want to skip coverage counting, we will just set it to -1 in the sailor-style output - coverage_col = "-1" - weird_sites = pd.DataFrame() - else: - # Normally, assume that there is a coverage column - coverage_col = final_site_level_information_df['coverage'].astype(str) - - weird_sites = final_site_level_information_df[ - (final_site_level_information_df.coverage == 0) |\ - (final_site_level_information_df.coverage < final_site_level_information_df['count'])] - - print("{} rows had coverage of 0 or more edits than coverage... 
filtering these out, but look into them...".format( - len(weird_sites))) - - final_site_level_information_df = final_site_level_information_df[ - (final_site_level_information_df.coverage > 0) & \ - (final_site_level_information_df.coverage >= final_site_level_information_df['count']) - ] - - - final_site_level_information_df['combo'] = final_site_level_information_df['count'].astype(str) + ',' + coverage_col - - if skip_coverage: - final_site_level_information_df['score'] = -1 - else: - final_site_level_information_df['score'] = final_site_level_information_df.apply(calculate_sailor_score, axis=1) - - final_site_level_information_df['start'] = final_site_level_information_df['position'] - final_site_level_information_df['end'] = final_site_level_information_df['position'] + 1 - - final_site_level_information_df = final_site_level_information_df[['contig', 'start', 'end', 'score', 'combo', 'strand']] - return final_site_level_information_df, weird_sites - - def collate_edit_info_shards(output_folder): edit_info_folder = "{}/edit_info".format(output_folder) edit_info_files = glob("{}/*".format(edit_info_folder)) @@ -329,83 +264,6 @@ def get_broken_up_contigs(contigs, num_per_sublist): broken_up_contigs.append(contig_sublist) return broken_up_contigs -import subprocess - - -import subprocess - -def generate_and_run_bash_merge(output_folder, file1_path, file2_path, output_file_path, header_columns): - # Convert header list into a tab-separated string - header = "\t".join(header_columns) - - # Generate the Bash command for processing - bash_command = f"""#!/bin/bash - # Step 1: Adjust the depths file by adding a new column that incorporates both contig and position - # for join purposes, and sort by this column. Output in tab-separated format. - awk -v OFS='\\t' '{{print $1, $2-1, $3, $1":"($2)}}' "{file2_path}" | sort -k4,4n | uniq > {output_folder}/depth_modified.tsv - - # Step 2: Sort the first file numerically by the join column (the column incuding both contig and position information) - sort -k3,3n "{file1_path}" | uniq > {output_folder}/final_edit_info_no_coverage_sorted.tsv - - # Step 3: Join the files on the specified columns, output all columns, and select necessary columns with tab separation -join -1 3 -2 4 -t $'\\t' {output_folder}/final_edit_info_no_coverage_sorted.tsv {output_folder}/depth_modified.tsv | awk -v OFS='\\t' '{{print $2, $3, $4, $5, $6, $7, $8, $11}}' > "{output_file_path}" - - # Step 4: Add header to the output file - echo -e "{header}" | cat - "{output_file_path}" > temp && mv temp "{output_file_path}" - """ - - # Write the command to a shell script - bash_script_path = "{}/merge_command.sh".format(output_folder) - with open(bash_script_path, "w") as file: - file.write(bash_command) - - # Run the generated bash script - print("Running Bash merge script...") - subprocess.run(["bash", bash_script_path], check=True) - print(f"Merged file saved to {output_file_path}") - - -def make_depth_command_script(paired_end, bam_filepaths, output_folder, all_depth_commands=[], output_suffix='', run=False): - samtools_depth_commands = get_samtools_depth_commands(paired_end, bam_filepaths, output_folder, output_suffix='') - for depth_command in samtools_depth_commands: - all_depth_commands.append(depth_command) - - depth_bash = '{}/depth_command.sh'.format(output_folder) - with open(depth_bash, 'w') as f: - for depth_command in all_depth_commands: - f.write('{}\n\n'.format(depth_command)) - - if run: - print("Calculating depths...") - subprocess.run(['bash', depth_bash]) - 
print("Done calculating depths.") - - -def get_samtools_depth_commands(paired_end, bam_filepaths, output_folder, output_suffix=''): - all_depth_commands = [] - - if len(output_suffix) > 0: - output_suffix = '_{}'.format(output_suffix) - - if paired_end: - paired_end_option = '-s ' - else: - paired_end_option = '' - - for i, bam_filepath in enumerate(bam_filepaths): - depth_command = ( - "echo 'running samtools depth on {}...';" - "samtools depth {}-a -b {}/combined.bed {} >> {}/coverage/depths{}.txt".format( - bam_filepath, - paired_end_option, - output_folder, - bam_filepath, - output_folder, - output_suffix)) - all_depth_commands.append(depth_command) - return all_depth_commands - - def generate_depths(output_folder, bam_filepaths, paired_end=False): coverage_start_time = time.perf_counter() diff --git a/src/utils.py b/src/utils.py index 371257b..47338bb 100644 --- a/src/utils.py +++ b/src/utils.py @@ -8,6 +8,7 @@ import sys from collections import OrderedDict, defaultdict from itertools import product +from scipy.special import betainc cb_n = 1 @@ -640,3 +641,142 @@ def concat_and_write_bams_wrapper(params): # print("df_dict keys: {}".format(df_dict.keys())) concat_and_write_bams(contig, df_dict, header_string, split_bams_folder, barcode_tag=barcode_tag, number_of_expected_bams=number_of_expected_bams, verbose=verbose) + + +def generate_and_run_bash_merge(output_folder, file1_path, file2_path, output_file_path, header_columns): + # Convert header list into a tab-separated string + header = "\t".join(header_columns) + + # Generate the Bash command for processing + bash_command = f"""#!/bin/bash + # Step 1: Adjust the depths file by adding a new column that incorporates both contig and position + # for join purposes, and sort by this column. Output in tab-separated format. 
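+    # (samtools depth emits contig, 1-based position, and depth; the awk below
+    # keeps a 0-based start and builds a "contig:position" key in column 4, which
+    # the later join -1 3 -2 4 matches against column 3 of the edit-info file.)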
+ awk -v OFS='\\t' '{{print $1, $2-1, $3, $1":"($2)}}' "{file2_path}" | sort -k4,4n | uniq > {output_folder}/depth_modified.tsv + + # Step 2: Sort the first file numerically by the join column (the column incuding both contig and position information) + sort -k3,3n "{file1_path}" | uniq > {output_folder}/final_edit_info_no_coverage_sorted.tsv + + # Step 3: Join the files on the specified columns, output all columns, and select necessary columns with tab separation +join -1 3 -2 4 -t $'\\t' {output_folder}/final_edit_info_no_coverage_sorted.tsv {output_folder}/depth_modified.tsv | awk -v OFS='\\t' '{{print $2, $3, $4, $5, $6, $7, $8, $11}}' > "{output_file_path}" + + # Step 4: Add header to the output file + echo -e "{header}" | cat - "{output_file_path}" > temp && mv temp "{output_file_path}" + """ + + # Write the command to a shell script + bash_script_path = "{}/merge_command.sh".format(output_folder) + with open(bash_script_path, "w") as file: + file.write(bash_command) + + # Run the generated bash script + print("Running Bash merge script...") + subprocess.run(["bash", bash_script_path], check=True) + print(f"Merged file saved to {output_file_path}") + + +def get_samtools_depth_commands(paired_end, bam_filepaths, output_folder, output_suffix=''): + all_depth_commands = [] + + if len(output_suffix) > 0: + output_suffix = '_{}'.format(output_suffix) + + if paired_end: + paired_end_option = '-s ' + else: + paired_end_option = '' + + for i, bam_filepath in enumerate(bam_filepaths): + depth_command = ( + "echo 'running samtools depth on {}...';" + "samtools depth {}-a -b {}/combined.bed {} >> {}/coverage/depths{}.txt".format( + bam_filepath, + paired_end_option, + output_folder, + bam_filepath, + output_folder, + output_suffix)) + all_depth_commands.append(depth_command) + return all_depth_commands + + +def make_depth_command_script(paired_end, bam_filepaths, output_folder, all_depth_commands=[], output_suffix='', run=False): + samtools_depth_commands = get_samtools_depth_commands(paired_end, bam_filepaths, output_folder, output_suffix='') + for depth_command in samtools_depth_commands: + all_depth_commands.append(depth_command) + + depth_bash = '{}/depth_command.sh'.format(output_folder) + with open(depth_bash, 'w') as f: + for depth_command in all_depth_commands: + f.write('{}\n\n'.format(depth_command)) + + if run: + print("Calculating depths...") + subprocess.run(['bash', depth_bash]) + print("Done calculating depths.") + + +def calculate_sailor_score(sailor_row): + edits = sailor_row['count'] + num_reads = sailor_row['coverage'] + original_base_counts = num_reads - edits + + cov_margin = 0.01 + alpha, beta = 0, 0 + + num_failures = 0 + failure_messages = [] + try: + edit_frac = float(edits) / float(num_reads) + except Exception as e: + num_failures += 1 + print("Bad Row: {}".format(sailor_row)) + return None + + # calc smoothed counts and confidence + destination_base_smoothed = edits + alpha + origin_base_smoothed = original_base_counts + beta + + ######## MOST IMPORTANT LINE ######## + # calculates the confidence of theta as + # P( theta < cov_margin | A, G) ~ Beta_theta(G, A) + confidence = 1 - betainc(destination_base_smoothed, origin_base_smoothed, cov_margin) + return confidence + + +def get_sailor_sites(final_site_level_information_df, conversion="C>T", skip_coverage=False): + final_site_level_information_df = final_site_level_information_df[final_site_level_information_df['strand_conversion'] == conversion] + + if skip_coverage: + # Case where we want to skip coverage counting, 
we will just set it to -1 in the sailor-style output + coverage_col = "-1" + weird_sites = pd.DataFrame() + else: + # Normally, assume that there is a coverage column + coverage_col = final_site_level_information_df['coverage'].astype(str) + + weird_sites = final_site_level_information_df[ + (final_site_level_information_df.coverage == 0) |\ + (final_site_level_information_df.coverage < final_site_level_information_df['count'])] + + print("{} rows had coverage of 0 or more edits than coverage... filtering these out, but look into them...".format( + len(weird_sites))) + + final_site_level_information_df = final_site_level_information_df[ + (final_site_level_information_df.coverage > 0) & \ + (final_site_level_information_df.coverage >= final_site_level_information_df['count']) + ] + + + final_site_level_information_df['combo'] = final_site_level_information_df['count'].astype(str) + ',' + coverage_col + + if skip_coverage: + final_site_level_information_df['score'] = -1 + else: + final_site_level_information_df['score'] = final_site_level_information_df.apply(calculate_sailor_score, axis=1) + + final_site_level_information_df['start'] = final_site_level_information_df['position'] + final_site_level_information_df['end'] = final_site_level_information_df['position'] + 1 + + final_site_level_information_df = final_site_level_information_df[['contig', 'start', 'end', 'score', 'combo', 'strand']] + return final_site_level_information_df, weird_sites + From ad1b0b72094945d05a369989c341cf5cd4039d98 Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Mon, 25 Nov 2024 17:57:14 -0800 Subject: [PATCH 06/26] concatenate files --- marine.py | 56 ++++------------------------------------------------ src/utils.py | 20 +++++++++++++++++++ 2 files changed, 24 insertions(+), 52 deletions(-) diff --git a/marine.py b/marine.py index fe3d6c0..63412c2 100755 --- a/marine.py +++ b/marine.py @@ -33,7 +33,7 @@ from utils import get_intervals, index_bam, write_rows_to_info_file, write_header_to_edit_info, \ write_read_to_bam_file, remove_file_if_exists, make_folder, concat_and_write_bams_wrapper, \ pretty_print, read_barcode_whitelist_file, get_contigs_that_need_bams_written, \ -make_depth_command_script, generate_and_run_bash_merge, get_sailor_sites +make_depth_command_script, generate_and_run_bash_merge, get_sailor_sites, concatenate_files from core import run_edit_identifier, run_bam_reconfiguration, \ gather_edit_information_across_subcontigs, run_coverage_calculator, generate_site_level_information @@ -168,56 +168,6 @@ def bam_processing(overall_label_to_list_of_contents, output_folder, barcode_tag return total_bam_generation_time, total_seconds_for_bams_df -import subprocess - - -def concatenate_files(source_folder, file_pattern, output_filepath): - # Create the concatenation command with numeric sorting and header skipping - concat_command = ( - f"for f in $(ls -v {source_folder}/{file_pattern}); do " - "tail -n +2 \"$f\"; " # Skip the header row for each file - "done > {}".format(output_filepath) - ) - - # Write the command to a shell script - concat_bash = f"{source_folder}/concat_command.sh" - with open(concat_bash, 'w') as f: - f.write(concat_command) - - print("Concatenating files in numerical order without headers...") - subprocess.run(['bash', concat_bash]) - print("Done concatenating.") - - -def coverage_processing(output_folder, barcode_tag='CB', paired_end=False, verbose=False, cores=1, number_of_expected_bams=4, - min_read_quality=0, bam_filepath=''): - - # Single-cell or long read version: - 
edit_info_grouped_per_contig_combined = gather_edit_information_across_subcontigs(output_folder, - barcode_tag=barcode_tag, - number_of_expected_bams=number_of_expected_bams - ) - - #if verbose: - # print('edit_info_grouped_per_contig_combined', edit_info_grouped_per_contig_combined.keys()) - - - results, total_time, total_seconds_for_contig = run_coverage_calculator(edit_info_grouped_per_contig_combined, output_folder, - barcode_tag=barcode_tag, - paired_end=paired_end, - verbose=verbose, - processes=cores - ) - concatenate_files(output_folder, "coverage/*.tsv", "{}/final_edit_info.tsv".format(output_folder)) - - total_seconds_for_contig_df = pd.DataFrame.from_dict(total_seconds_for_contig, orient='index') - total_seconds_for_contig_df.columns = ['seconds'] - total_seconds_for_contig_df['contig sections'] = total_seconds_for_contig_df.index - total_seconds_for_contig_df.index = range(len(total_seconds_for_contig_df)) - - return results, total_time, total_seconds_for_contig_df - - def print_marine_logo(): logo_lines = [ ":::: :::: ::: ::::::::: ::::::::::: :::: ::: :::::::::: ", @@ -282,7 +232,9 @@ def generate_depths(output_folder, bam_filepaths, paired_end=False): all_depth_commands=all_depth_commands, output_suffix='', run=True) print("Concatenating edit info files...") - concatenate_files(output_folder, "edit_info/*edit_info.tsv", "{}/final_edit_info_no_coverage.tsv".format(output_folder)) + concatenate_files(output_folder, "edit_info/*edit_info.tsv", + "{}/final_edit_info_no_coverage.tsv".format(output_folder), + run=True) print("Append the depth columns to the concatenated final_edit_info file...") diff --git a/src/utils.py b/src/utils.py index 47338bb..d302ec9 100644 --- a/src/utils.py +++ b/src/utils.py @@ -6,6 +6,7 @@ import pandas as pd import numpy as np import sys +import subprocess from collections import OrderedDict, defaultdict from itertools import product from scipy.special import betainc @@ -779,4 +780,23 @@ def get_sailor_sites(final_site_level_information_df, conversion="C>T", skip_cov final_site_level_information_df = final_site_level_information_df[['contig', 'start', 'end', 'score', 'combo', 'strand']] return final_site_level_information_df, weird_sites + + +def concatenate_files(source_folder, file_pattern, output_filepath, run=False): + # Create the concatenation command with numeric sorting and header skipping + concat_command = ( + f"for f in $(ls -v {source_folder}/{file_pattern}); do " + "tail -n +2 \"$f\"; " # Skip the header row for each file + "done > {}".format(output_filepath) + ) + + # Write the command to a shell script + concat_bash = f"{source_folder}/concat_command.sh" + with open(concat_bash, 'w') as f: + f.write(concat_command) + + if run: + print("Concatenating files in numerical order without headers...") + subprocess.run(['bash', concat_bash]) + print("Done concatenating.") From 7f7ac5481fa09b4916531ba890e47b125eb2b45f Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Mon, 25 Nov 2024 18:13:35 -0800 Subject: [PATCH 07/26] paired end flag propagation --- marine.py | 2 +- .../scripts/.ipynb_checkpoints/F1R2_pair_test-checkpoint.sh | 2 +- tests/strandedness_tests/scripts/F1R2_pair_test.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/marine.py b/marine.py index 63412c2..4b797f9 100755 --- a/marine.py +++ b/marine.py @@ -383,7 +383,7 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand # We want to run the samtools depth command for each of the reconfigured bam files reconfigured_bam_filepaths = 
glob('{}/split_bams/*/*.bam'.format(output_folder)) print("Running samtools depth on {} subset bam paths...".format(len(reconfigured_bam_filepaths))) - total_time, total_seconds_for_contig_df = generate_depths(output_folder, reconfigured_bam_filepaths) + total_time, total_seconds_for_contig_df = generate_depths(output_folder, reconfigured_bam_filepaths, paired_end=paired_end) total_seconds_for_contig_df.to_csv("{}/coverage_calculation_timing.tsv".format(logging_folder), sep='\t') diff --git a/tests/strandedness_tests/scripts/.ipynb_checkpoints/F1R2_pair_test-checkpoint.sh b/tests/strandedness_tests/scripts/.ipynb_checkpoints/F1R2_pair_test-checkpoint.sh index eeffa87..887a8f9 100644 --- a/tests/strandedness_tests/scripts/.ipynb_checkpoints/F1R2_pair_test-checkpoint.sh +++ b/tests/strandedness_tests/scripts/.ipynb_checkpoints/F1R2_pair_test-checkpoint.sh @@ -17,4 +17,4 @@ $MARINE/tests/strandedness_tests/F1R2_pair_test \ --strandedness 2 \ --contigs "chr17" \ --sailor \ ---num_intervals_per_contig 16 \ No newline at end of file +--keep_intermediate_files \ No newline at end of file diff --git a/tests/strandedness_tests/scripts/F1R2_pair_test.sh b/tests/strandedness_tests/scripts/F1R2_pair_test.sh index eeffa87..887a8f9 100644 --- a/tests/strandedness_tests/scripts/F1R2_pair_test.sh +++ b/tests/strandedness_tests/scripts/F1R2_pair_test.sh @@ -17,4 +17,4 @@ $MARINE/tests/strandedness_tests/F1R2_pair_test \ --strandedness 2 \ --contigs "chr17" \ --sailor \ ---num_intervals_per_contig 16 \ No newline at end of file +--keep_intermediate_files \ No newline at end of file From 87bd8560e7a1559b7069b661f8f5f477f145dd1f Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Mon, 25 Nov 2024 21:15:08 -0800 Subject: [PATCH 08/26] more cleanup --- marine.py | 248 ++++++++++++++++++++++++--------------------------- src/utils.py | 95 +++++++++++++++++++- 2 files changed, 207 insertions(+), 136 deletions(-) diff --git a/marine.py b/marine.py index 4b797f9..1b2528a 100755 --- a/marine.py +++ b/marine.py @@ -33,71 +33,14 @@ from utils import get_intervals, index_bam, write_rows_to_info_file, write_header_to_edit_info, \ write_read_to_bam_file, remove_file_if_exists, make_folder, concat_and_write_bams_wrapper, \ pretty_print, read_barcode_whitelist_file, get_contigs_that_need_bams_written, \ -make_depth_command_script, generate_and_run_bash_merge, get_sailor_sites, concatenate_files +make_depth_command_script, generate_and_run_bash_merge, get_sailor_sites, \ +concatenate_files, get_edits_with_coverage_df, zero_edit_found, delete_intermediate_files from core import run_edit_identifier, run_bam_reconfiguration, \ gather_edit_information_across_subcontigs, run_coverage_calculator, generate_site_level_information from annotate import annotate_sites, get_strand_specific_conversion -def zero_edit_found(final_site_level_information_df, output_folder, sailor_list, bedgraphs_list, keep_intermediate_files, start_time, logging_folder): - print("No edits found.") - sites_columns = ['site_id','barcode','contig','position','ref','alt','strand','count','coverage','conversion','strand_conversion'] - sites_empty_df = pd.DataFrame(columns=sites_columns) - sites_empty_df.to_csv('{}/final_filtered_site_info.tsv'.format(output_folder), sep='\t', index=False) - - Annotated_sites_columns = ['site_id','barcode','contig','position','ref','alt','strand','count','coverage','conversion','strand_conversion','feature_name','feature_type','feature_strand'] - annotated_sites_empty_df = pd.DataFrame(columns=Annotated_sites_columns) - 
annotated_sites_empty_df.to_csv('{}/final_filtered_site_info_annotated.tsv'.format(output_folder), sep='\t', index=False) - - empty_df = pd.DataFrame() - if len(sailor_list) > 0: - for conversion in sailor_list: - conversion_search = conversion[0] + '>' + conversion[1] - empty_df.to_csv('{}/sailor_style_sites_{}.bed'.format( - output_folder, - conversion_search.replace(">", "-")), - header=False, index=False, sep='\t') - - if len(bedgraphs_list) > 0: - bedgraph_folder = '{}/bedgraphs'.format(output_folder) - make_folder(bedgraph_folder) - for conversion in bedgraphs_list: - conversion_search = conversion[0] + '>' + conversion[1] - empty_df.to_csv('{}/{}_{}.bedgraph'.format(bedgraph_folder, output_folder.split('/')[-1], conversion), sep='\t', index=False, header=False) - - current, peak = tracemalloc.get_traced_memory() - - with open('{}/manifest.txt'.format(logging_folder), 'a+') as f: - f.write(f'sites\t{len(final_site_level_information_df)}\n') - f.write(f'peak_memory_mb\t{peak/1e6}\n') - f.write(f'time_elapsed_seconds\t{time.time()-start_time:.2f}s\n') - - print(f"Current memory usage {current/1e6}MB; Peak: {peak/1e6}MB") - print(f'Time elapsed: {time.time()-start_time:.2f}s') - - if not keep_intermediate_files: - pretty_print("Deleting intermediate files...", style="-") - delete_intermediate_files(output_folder) - - pretty_print("Done!", style="+") - return 'Done' - - -def delete_intermediate_files(output_folder): - to_delete = ['coverage', 'edit_info', 'split_bams', 'all_edit_info.tsv', - 'concat_command.sh', 'depth_command.sh', 'combined.bed', 'merge_command.sh', - 'final_edit_info_no_coverage.tsv', 'final_edit_info_no_coverage_sorted.tsv', - 'depth_modified.tsv', 'final_edit_info.tsv', 'final_filtered_edit_info.tsv'] - for object in to_delete: - object_path = '{}/{}'.format(output_folder, object) - - if os.path.exists(object_path): - if os.path.isfile(object_path): - os.remove(object_path) - else: - shutil.rmtree(object_path) - def edit_finder(bam_filepath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[], verbose=False, cores=64, min_read_quality=0, min_base_quality=0, dist_from_end=0, interval_length=2000000): @@ -141,6 +84,7 @@ def edit_finder(bam_filepath, output_folder, strandedness, barcode_tag="CB", bar return overall_label_to_list_of_contents, results, total_seconds_for_reads_df, overall_total_reads, counts_summary_df + def bam_processing(overall_label_to_list_of_contents, output_folder, barcode_tag='CB', cores=1, number_of_expected_bams=4, verbose=False): # Only used for single-cell and/or long read reconfiguration of bams to optimize coverage calculation @@ -183,19 +127,6 @@ def print_marine_logo(): pretty_print("Multi-core Algorithm for Rapid Identification of Nucleotide Edits", style="=") -def collate_edit_info_shards(output_folder): - edit_info_folder = "{}/edit_info".format(output_folder) - edit_info_files = glob("{}/*".format(edit_info_folder)) - print("\tFound {} files in {}...".format(len(edit_info_files), edit_info_folder)) - all_dfs = [] - for f in edit_info_files: - edit_info_df = pd.read_csv(f, sep='\t') - all_dfs.append(edit_info_df) - - collated_df = pd.concat(all_dfs) - print("\tShape of collated edit info df: {}".format(collated_df.shape)) - print("\tColumns of collated edit info df: {}".format(collated_df.columns)) - return collated_df def get_broken_up_contigs(contigs, num_per_sublist): broken_up_contigs = [] @@ -214,6 +145,7 @@ def get_broken_up_contigs(contigs, num_per_sublist): broken_up_contigs.append(contig_sublist) 
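    # Note: any final, partially filled sublist has already been appended above,
    # so every contig ends up in exactly one sublist.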
return broken_up_contigs + def generate_depths(output_folder, bam_filepaths, paired_end=False): coverage_start_time = time.perf_counter() @@ -250,30 +182,95 @@ def generate_depths(output_folder, bam_filepaths, paired_end=False): total_seconds_for_contig_df = pd.DataFrame({'coverage_total_time': [coverage_total_time]}) return coverage_total_time, total_seconds_for_contig_df + - -def get_edits_with_coverage_df(output_folder, - barcode_tag=None): - - all_edit_info_unique_position_with_coverage_df = pd.read_csv('{}/final_edit_info.tsv'.format(output_folder), sep='\t', - dtype={'coverage': int, 'position': int, - 'contig': str}) - # If there was a barcode specified, then the contigs will have been set to a combination of contig and barcode ID. - # For example, we will find a contig to be 9_GATCCCTCAGTAACGG-1, and we will want to simplify it back to simply 9, - # as the barcode information is separately already present as it's own column in this dataframe. To ensure code continuity, - # this will still be true even with no barcode specified, in which case the contig will be _no_barcode +def convert_sites_to_sailor(final_site_level_information_df, sailor_list, output_folder, skip_coverage): + # Output SAILOR-formatted file for use in FLARE downstream + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # 1 629275 629276 0.966040688 2,30 + + # 1 629309 629310 2.8306e-05 1,1043 + - # Therefore: replace the contig with the part before the barcode - all_edit_info_unique_position_with_coverage_df['contig'] = all_edit_info_unique_position_with_coverage_df.apply(lambda row: row['contig'].replace('_{}'.format(row['barcode']), ''), axis=1) - - return all_edit_info_unique_position_with_coverage_df + for conversion in sailor_list: + conversion_search = conversion[0] + '>' + conversion[1] + print("Generating SAILOR-style bed outputs for conversion {}...".format(conversion)) + sailor_sites,weird_sites = get_sailor_sites(final_site_level_information_df, conversion_search, skip_coverage=skip_coverage) + sailor_sites = sailor_sites.drop_duplicates() + + print("{} final deduplicated {} SAILOR-formatted sites".format(len(sailor_sites), conversion_search)) + sailor_sites.to_csv('{}/sailor_style_sites_{}.bed'.format( + output_folder, + conversion_search.replace(">", "-")), + header=False, + index=False, + sep='\t') + + +def generate_bedgraphs(final_site_level_information_df, conversion_search, output_folder): + bedgraph_folder = '{}/bedgraphs'.format(output_folder) + make_folder(bedgraph_folder) + + pretty_print("Making bedgraphs for {} conversions...\n".format(bedgraphs_list)) + for conversion in bedgraphs_list: + conversion_search = conversion[0] + '>' + conversion[1] + sites_for_conversion = final_site_level_information_df[final_site_level_information_df.conversion == conversion_search] + sites_for_conversion['edit_fraction'] = sites_for_conversion['count']/sites_for_conversion['coverage'] + sites_for_conversion['start'] = sites_for_conversion['position'] - 1 + sites_for_conversion_bedgraph_cols = sites_for_conversion[['contig', 'start', 'position', 'edit_fraction']] + + sites_for_conversion_bedgraph_cols.to_csv('{}/{}_{}.bedgraph'.format(bedgraph_folder, output_folder.split('/')[-1], conversion), sep='\t', index=False, header=False) + + +def generate_combined_sites_bed_for_all_cells(output_folder): + # go from: + # + # 9_GATCCCTCAGTAACGG-1 3000447 3000448 + # 9_CATCCCTCAGTAACGA-1 3000468 3000469 + # 2_CATCCCTCAGTAACGA-1 3000451 3000452 + # + # to: + # + # 9_GATCCCTCAGTAACGG-1 3000447 3000448 + # 
9_GATCCCTCAGTAACGG-1 3000468 3000469 + # 9_CATCCCTCAGTAACGA-1 3000447 3000448 + # 9_CATCCCTCAGTAACGA-1 3000468 3000469 + # 2_CATCCCTCAGTAACGA-1 3000451 3000452 + + # Input and output files + input_file = f"{output_folder}/combined.bed" + output_file = f"{output_folder}/combined_all_cells.bed" + + # Load the input data + df = pd.read_csv(input_file, sep="\t", header=None, names=["Contig", "Start", "End"]) + + df['Chromosome'] = [str(i.split('_')[0]) for i in df['Contig']] + + unique_chromosomes = df['Chromosome'].unique() + print("Unique chromosomes found: {}".format(unique_chromosomes)) + + for chromosome in unique_chromosomes: + df_for_chromosome = df[df['Chromosome'] == str(chromosome)] + # Create unique lists: + unique_contigs = df_for_chromosome["Contig"].unique() # Unique contig pairs, within this chromosome + unique_positions = df_for_chromosome[["Start", "End"]].drop_duplicates().values.tolist() # Unique location pairs + # Open the output file and write combinations line by line + with open(output_file, "w") as f: + for contig in unique_contigs: + for start, end in unique_positions: + f.write(f"{contig}\t{start}\t{end}\n") + + print(f"Output written to {output_file}") + + + def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strandedness=True, barcode_tag="CB", paired_end=False, barcode_whitelist_file=None, verbose=False, coverage_only=False, filtering_only=False, annotation_only=False, bedgraphs_list=[], sailor_list=[], min_base_quality = 15, min_read_quality = 0, min_dist_from_end = 10, max_edits_per_read = None, cores = 64, number_of_expected_bams=4, keep_intermediate_files=False, num_per_sublist=6, - skip_coverage=False, interval_length=2000000): + skip_coverage=False, interval_length=2000000, + all_cells_coverage=False + ): # Check to make sure the folder is empty, otherwise prompt for overwriting if any(os.scandir(output_folder)): @@ -382,20 +379,13 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand # We want to run the samtools depth command for each of the reconfigured bam files reconfigured_bam_filepaths = glob('{}/split_bams/*/*.bam'.format(output_folder)) + print("Running samtools depth on {} subset bam paths...".format(len(reconfigured_bam_filepaths))) total_time, total_seconds_for_contig_df = generate_depths(output_folder, reconfigured_bam_filepaths, paired_end=paired_end) total_seconds_for_contig_df.to_csv("{}/coverage_calculation_timing.tsv".format(logging_folder), sep='\t') pretty_print("Total time to calculate coverage: {} minutes".format(round(total_time/60, 3))) - - - if skip_coverage: - # If we skip coverage steps, we have to just collate edit info results from the edit finding steps into one giant - # final_edit_info.tsv file (where coverage is None) - pretty_print("Skipped coverage, so combining edit info alone...") - all_edit_info_unique_position_with_coverage_df = collate_edit_info_shards(output_folder) - # Check if filtering step finished final_filtered_sites_path = '{}/final_filtered_site_info.tsv'.format(output_folder) @@ -441,43 +431,11 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand if len(sailor_list) > 0: print("{} sites being converted to SAILOR format...".format(len(final_site_level_information_df))) - - # Output SAILOR-formatted file for use in FLARE downstream - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # 1 629275 629276 0.966040688 2,30 + - # 1 629309 629310 2.8306e-05 1,1043 + - - for conversion in sailor_list: - conversion_search = 
conversion[0] + '>' + conversion[1] - - print("Generating SAILOR-style bed outputs for conversion {}...".format(conversion)) - - sailor_sites,weird_sites = get_sailor_sites(final_site_level_information_df, conversion_search, skip_coverage=skip_coverage) - sailor_sites = sailor_sites.drop_duplicates() - - print("{} final deduplicated {} SAILOR-formatted sites".format(len(sailor_sites), conversion_search)) - sailor_sites.to_csv('{}/sailor_style_sites_{}.bed'.format( - output_folder, - conversion_search.replace(">", "-")), - header=False, - index=False, - sep='\t') - + convert_sites_to_sailor(final_site_level_information_df, conversion_search, output_folder, skip_coverage) + if len(bedgraphs_list) > 0: # Make plot of edit distributions - bedgraph_folder = '{}/bedgraphs'.format(output_folder) - make_folder(bedgraph_folder) - - pretty_print("Making bedgraphs for {} conversions...\n".format(bedgraphs_list)) - for conversion in bedgraphs_list: - conversion_search = conversion[0] + '>' + conversion[1] - sites_for_conversion = final_site_level_information_df[final_site_level_information_df.conversion == conversion_search] - sites_for_conversion['edit_fraction'] = sites_for_conversion['count']/sites_for_conversion['coverage'] - sites_for_conversion['start'] = sites_for_conversion['position'] - 1 - sites_for_conversion_bedgraph_cols = sites_for_conversion[['contig', 'start', 'position', 'edit_fraction']] - - sites_for_conversion_bedgraph_cols.to_csv('{}/{}_{}.bedgraph'.format(bedgraph_folder, output_folder.split('/')[-1], conversion), sep='\t', index=False, - header=False) + generate_bedgraphs(final_site_level_information_df, conversion_search, output_folder) if not annotation_bedfile_path: print("annotation_bedfile_path argument not provided ...\ @@ -526,6 +484,26 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand print(f"Current memory usage {current/1e6}MB; Peak: {peak/1e6}MB") print(f'Time elapsed: {time.time()-start_time:.2f}s') + if all_cells_coverage: + print("Calculating coverage at all edit sites in all cells...") + + # Make a permuted version of the combined.bed file, such that every combination of contig + # and start-end is generated (this might be a very large file depending on number of edits and cells) + generate_combined_sites_bed_for_all_cells(output_folder) + + # We want to run the samtools depth command for each of the reconfigured bam files + reconfigured_bam_filepaths = glob('{}/split_bams/*/*.bam'.format(output_folder)) + make_depth_command_script(paired_end, + reconfigured_bam_filepaths, + output_folder, + output_suffix='all_cells', run=True) + # Read the universal coverage data + df = pd.read_csv("{}/coverage/depths_all_cells.txt".format(output_folder), sep="\t", header=None, names=["Contig", "Position", "Coverage"]) + # Pivot the data to make a matrix of contig x position with values being coverage + pivot = df.pivot(index="Contig", columns="Position", values="Coverage").fillna(0) + # Write to output + pivot.to_csv("{}/comprehensive_coverage_matrix.tsv".format(output_folder), sep="\t") + if not keep_intermediate_files: pretty_print("Deleting intermediate files...", style="-") delete_intermediate_files(output_folder) @@ -579,6 +557,7 @@ def check_samtools(): parser.add_argument('--num_per_sublist', dest='num_per_sublist', type=int, default=6) parser.add_argument('--paired_end', dest='paired_end', action='store_true', help='Assess coverage taking without double-counting paired end overlapping regions... slower but more accurate. 
Edits by default are only counted once for an entire pair, whether they show up on both ends or not.') parser.add_argument('--skip_coverage', dest='skip_coverage', action='store_true') + parser.add_argument('--all_cells_coverage', dest='all_cells_coverage', action='store_true') parser.add_argument('--max_edits_per_read', type=int, default=None) parser.add_argument('--num_intervals_per_contig', type=int, default=200, help='Deprecated') parser.add_argument('--interval_length', type=int, default=2000000, help='Length of intervals split analysis into...') @@ -602,6 +581,7 @@ def check_samtools(): verbose = args.verbose keep_intermediate_files = args.keep_intermediate_files paired_end = args.paired_end + all_cells_coverage = args.all_cells_coverage skip_coverage = args.skip_coverage barcode_tag = args.barcode_tag @@ -689,7 +669,8 @@ def check_samtools(): "\tVerbose:\t{}".format(verbose), "\tKeep intermediate files:\t{}".format(keep_intermediate_files), "\tSkip coverage?:\t{}".format(skip_coverage), - "\tFor single-cell: \t{} contigs at at time\n".format(num_per_sublist) + "\tFor single-cell: \t{} contigs at at time\n".format(num_per_sublist), + "\tCalculate coverage in all barcodes?: \t{}\n".format(all_cells_coverage) ]) if not paired_end: @@ -727,7 +708,8 @@ def check_samtools(): skip_coverage=skip_coverage, keep_intermediate_files=keep_intermediate_files, num_per_sublist=num_per_sublist, - interval_length=interval_length + interval_length=interval_length, + all_cells_coverage=all_cells_coverage ) \ No newline at end of file diff --git a/src/utils.py b/src/utils.py index d302ec9..09e4699 100644 --- a/src/utils.py +++ b/src/utils.py @@ -10,6 +10,8 @@ from collections import OrderedDict, defaultdict from itertools import product from scipy.special import betainc +import shutil +import time cb_n = 1 @@ -689,10 +691,11 @@ def get_samtools_depth_commands(paired_end, bam_filepaths, output_folder, output for i, bam_filepath in enumerate(bam_filepaths): depth_command = ( "echo 'running samtools depth on {}...';" - "samtools depth {}-a -b {}/combined.bed {} >> {}/coverage/depths{}.txt".format( + "samtools depth {}-a -b {}/combined{}.bed {} >> {}/coverage/depths{}.txt".format( bam_filepath, paired_end_option, output_folder, + output_suffix, bam_filepath, output_folder, output_suffix)) @@ -701,11 +704,18 @@ def get_samtools_depth_commands(paired_end, bam_filepaths, output_folder, output def make_depth_command_script(paired_end, bam_filepaths, output_folder, all_depth_commands=[], output_suffix='', run=False): - samtools_depth_commands = get_samtools_depth_commands(paired_end, bam_filepaths, output_folder, output_suffix='') + + samtools_depth_start_time = time.perf_counter() + + samtools_depth_commands = get_samtools_depth_commands(paired_end, bam_filepaths, output_folder, output_suffix=output_suffix) + + if len(output_suffix) > 0: + output_suffix = '_{}'.format(output_suffix) + for depth_command in samtools_depth_commands: all_depth_commands.append(depth_command) - depth_bash = '{}/depth_command.sh'.format(output_folder) + depth_bash = '{}/depth_command{}.sh'.format(output_folder, output_suffix) with open(depth_bash, 'w') as f: for depth_command in all_depth_commands: f.write('{}\n\n'.format(depth_command)) @@ -715,6 +725,9 @@ def make_depth_command_script(paired_end, bam_filepaths, output_folder, all_dept subprocess.run(['bash', depth_bash]) print("Done calculating depths.") + samtools_depth_total_time = time.perf_counter() - samtools_depth_start_time + print('Total seconds for samtools depth commands 
to run: {}'.format(samtools_depth_total_time)) + def calculate_sailor_score(sailor_row): edits = sailor_row['count'] @@ -799,4 +812,80 @@ def concatenate_files(source_folder, file_pattern, output_filepath, run=False): print("Concatenating files in numerical order without headers...") subprocess.run(['bash', concat_bash]) print("Done concatenating.") + + +def get_edits_with_coverage_df(output_folder, + barcode_tag=None): + + all_edit_info_unique_position_with_coverage_df = pd.read_csv('{}/final_edit_info.tsv'.format(output_folder), sep='\t', + dtype={'coverage': int, 'position': int, + 'contig': str}) + # If there was a barcode specified, then the contigs will have been set to a combination of contig and barcode ID. + # For example, we will find a contig to be 9_GATCCCTCAGTAACGG-1, and we will want to simplify it back to simply 9, + # as the barcode information is separately already present as it's own column in this dataframe. To ensure code continuity, + # this will still be true even with no barcode specified, in which case the contig will be _no_barcode + # Therefore: replace the contig with the part before the barcode + all_edit_info_unique_position_with_coverage_df['contig'] = all_edit_info_unique_position_with_coverage_df.apply(lambda row: row['contig'].replace('_{}'.format(row['barcode']), ''), axis=1) + + return all_edit_info_unique_position_with_coverage_df + + + +def zero_edit_found(final_site_level_information_df, output_folder, sailor_list, bedgraphs_list, keep_intermediate_files, start_time, logging_folder): + print("No edits found.") + sites_columns = ['site_id','barcode','contig','position','ref','alt','strand','count','coverage','conversion','strand_conversion'] + sites_empty_df = pd.DataFrame(columns=sites_columns) + sites_empty_df.to_csv('{}/final_filtered_site_info.tsv'.format(output_folder), sep='\t', index=False) + + Annotated_sites_columns = ['site_id','barcode','contig','position','ref','alt','strand','count','coverage','conversion','strand_conversion','feature_name','feature_type','feature_strand'] + annotated_sites_empty_df = pd.DataFrame(columns=Annotated_sites_columns) + annotated_sites_empty_df.to_csv('{}/final_filtered_site_info_annotated.tsv'.format(output_folder), sep='\t', index=False) + + empty_df = pd.DataFrame() + if len(sailor_list) > 0: + for conversion in sailor_list: + conversion_search = conversion[0] + '>' + conversion[1] + empty_df.to_csv('{}/sailor_style_sites_{}.bed'.format( + output_folder, + conversion_search.replace(">", "-")), + header=False, index=False, sep='\t') + + if len(bedgraphs_list) > 0: + bedgraph_folder = '{}/bedgraphs'.format(output_folder) + make_folder(bedgraph_folder) + for conversion in bedgraphs_list: + conversion_search = conversion[0] + '>' + conversion[1] + empty_df.to_csv('{}/{}_{}.bedgraph'.format(bedgraph_folder, output_folder.split('/')[-1], conversion), sep='\t', index=False, header=False) + + current, peak = tracemalloc.get_traced_memory() + + with open('{}/manifest.txt'.format(logging_folder), 'a+') as f: + f.write(f'sites\t{len(final_site_level_information_df)}\n') + f.write(f'peak_memory_mb\t{peak/1e6}\n') + f.write(f'time_elapsed_seconds\t{time.time()-start_time:.2f}s\n') + + print(f"Current memory usage {current/1e6}MB; Peak: {peak/1e6}MB") + print(f'Time elapsed: {time.time()-start_time:.2f}s') + + if not keep_intermediate_files: + pretty_print("Deleting intermediate files...", style="-") + delete_intermediate_files(output_folder) + + pretty_print("Done!", style="+") + return 'Done' + + +def 
delete_intermediate_files(output_folder): + to_delete = ['coverage', 'edit_info', 'split_bams', 'all_edit_info.tsv', + 'concat_command.sh', 'depth_command.sh', 'combined.bed', 'merge_command.sh', + 'final_edit_info_no_coverage.tsv', 'final_edit_info_no_coverage_sorted.tsv', + 'depth_modified.tsv', 'final_edit_info.tsv', 'final_filtered_edit_info.tsv'] + for object in to_delete: + object_path = '{}/{}'.format(output_folder, object) + + if os.path.exists(object_path): + if os.path.isfile(object_path): + os.remove(object_path) + else: + shutil.rmtree(object_path) From c799b93b2bb4c776c59bc3cb7dfeb345c9781ca8 Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Mon, 25 Nov 2024 23:01:26 -0800 Subject: [PATCH 09/26] optional generation of full cell matrix --- marine.py | 15 +++++++++------ src/utils.py | 42 ++++++++++++++++++++++++++++++++---------- 2 files changed, 41 insertions(+), 16 deletions(-) diff --git a/marine.py b/marine.py index 1b2528a..db627db 100755 --- a/marine.py +++ b/marine.py @@ -34,7 +34,7 @@ write_read_to_bam_file, remove_file_if_exists, make_folder, concat_and_write_bams_wrapper, \ pretty_print, read_barcode_whitelist_file, get_contigs_that_need_bams_written, \ make_depth_command_script, generate_and_run_bash_merge, get_sailor_sites, \ -concatenate_files, get_edits_with_coverage_df, zero_edit_found, delete_intermediate_files +concatenate_files, run_command, get_edits_with_coverage_df, zero_edit_found, delete_intermediate_files from core import run_edit_identifier, run_bam_reconfiguration, \ gather_edit_information_across_subcontigs, run_coverage_calculator, generate_site_level_information @@ -156,12 +156,15 @@ def generate_depths(output_folder, bam_filepaths, paired_end=False): "echo 'concatenating bed file...';" "for file in {}/edit_info/*edit_info.tsv; do " "awk 'NR > 1 {{print $2, $4-1, $4}}' OFS='\t' \"$file\"; " - "done | sort -k1,1 -k2,2n -u > {}/combined.bed;" + "done | sort -k1,1 -k2,2n -u > {}/combined_source_cells.bed;" ).format(output_folder, output_folder) + + run_command(combine_edit_sites_command) + all_depth_commands.append(combine_edit_sites_command) make_depth_command_script(paired_end, bam_filepaths, output_folder, - all_depth_commands=all_depth_commands, output_suffix='', run=True) + all_depth_commands=all_depth_commands, output_suffix='source_cells', run=True) print("Concatenating edit info files...") concatenate_files(output_folder, "edit_info/*edit_info.tsv", @@ -175,7 +178,7 @@ def generate_depths(output_folder, bam_filepaths, paired_end=False): generate_and_run_bash_merge(output_folder, '{}/final_edit_info_no_coverage.tsv'.format(output_folder), - '{}/coverage/depths.txt'.format(output_folder), + '{}/coverage/depths_source_cells.txt'.format(output_folder), '{}/final_edit_info.tsv'.format(output_folder), header_columns) coverage_total_time = time.perf_counter() - coverage_start_time @@ -238,7 +241,7 @@ def generate_combined_sites_bed_for_all_cells(output_folder): # 2_CATCCCTCAGTAACGA-1 3000451 3000452 # Input and output files - input_file = f"{output_folder}/combined.bed" + input_file = f"{output_folder}/combined_source_cells.bed" output_file = f"{output_folder}/combined_all_cells.bed" # Load the input data @@ -712,4 +715,4 @@ def check_samtools(): all_cells_coverage=all_cells_coverage ) - \ No newline at end of file + diff --git a/src/utils.py b/src/utils.py index 09e4699..750f32e 100644 --- a/src/utils.py +++ b/src/utils.py @@ -11,6 +11,8 @@ from itertools import product from scipy.special import betainc import shutil +from multiprocessing import Pool 
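+# Pool is used below to fan the per-BAM samtools depth commands out across
+# worker processes; each command appends to its own indexed depths file, so
+# the invocations are independent and safe to run concurrently.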
+import multiprocessing import time cb_n = 1 @@ -691,39 +693,59 @@ def get_samtools_depth_commands(paired_end, bam_filepaths, output_folder, output for i, bam_filepath in enumerate(bam_filepaths): depth_command = ( "echo 'running samtools depth on {}...';" - "samtools depth {}-a -b {}/combined{}.bed {} >> {}/coverage/depths{}.txt".format( + "samtools depth {}-a -b {}/combined{}.bed {} >> {}/coverage/depths{}{}.txt".format( bam_filepath, paired_end_option, output_folder, output_suffix, bam_filepath, output_folder, + i, output_suffix)) all_depth_commands.append(depth_command) return all_depth_commands -def make_depth_command_script(paired_end, bam_filepaths, output_folder, all_depth_commands=[], output_suffix='', run=False): +def run_command(command): + """Helper function to execute a single shell command.""" + try: + subprocess.run(command, shell=True, check=True) + except subprocess.CalledProcessError as e: + print(f"Command failed: {command}\nError: {e}") + raise e +def make_depth_command_script(paired_end, bam_filepaths, output_folder, all_depth_commands=[], output_suffix='', run=False, processes=4): samtools_depth_start_time = time.perf_counter() - samtools_depth_commands = get_samtools_depth_commands(paired_end, bam_filepaths, output_folder, output_suffix=output_suffix) + samtools_depth_commands = get_samtools_depth_commands(paired_end, bam_filepaths, + output_folder, + output_suffix=output_suffix) + + all_depth_commands += samtools_depth_commands if len(output_suffix) > 0: output_suffix = '_{}'.format(output_suffix) - - for depth_command in samtools_depth_commands: - all_depth_commands.append(depth_command) - + depth_bash = '{}/depth_command{}.sh'.format(output_folder, output_suffix) with open(depth_bash, 'w') as f: for depth_command in all_depth_commands: f.write('{}\n\n'.format(depth_command)) if run: - print("Calculating depths...") - subprocess.run(['bash', depth_bash]) - print("Done calculating depths.") + print("Calculating depths using multiprocessing...") + try: + with Pool(processes=processes) as pool: + pool.map(run_command, samtools_depth_commands) + except Exception as e: + # If any command fails, print error and exit + print(f"Aborting: One or more commands failed with an error: {e}", file=sys.stderr) + sys.exit(1) # Exit with an error code + + print("Done calculating depths. 
Merging depths...") + run_command("cat {}/coverage/depths*{}.txt > {}/coverage/depths{}.txt".format(output_folder, output_suffix, + output_folder, output_suffix)) + + print("Done merging depths.") samtools_depth_total_time = time.perf_counter() - samtools_depth_start_time print('Total seconds for samtools depth commands to run: {}'.format(samtools_depth_total_time)) From 18cd872d38d6ed259b3b2886596765b9cdb3bb59 Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Mon, 25 Nov 2024 23:15:18 -0800 Subject: [PATCH 10/26] fix sailor and bedgraph --- marine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/marine.py b/marine.py index db627db..9767cb6 100755 --- a/marine.py +++ b/marine.py @@ -434,11 +434,11 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand if len(sailor_list) > 0: print("{} sites being converted to SAILOR format...".format(len(final_site_level_information_df))) - convert_sites_to_sailor(final_site_level_information_df, conversion_search, output_folder, skip_coverage) + convert_sites_to_sailor(final_site_level_information_df, sailor_list, output_folder, skip_coverage) if len(bedgraphs_list) > 0: # Make plot of edit distributions - generate_bedgraphs(final_site_level_information_df, conversion_search, output_folder) + generate_bedgraphs(final_site_level_information_df, bedgraphs_list, output_folder) if not annotation_bedfile_path: print("annotation_bedfile_path argument not provided ...\ From 9f43932243c6793ba956b4b70118dae604f67b84 Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Tue, 26 Nov 2024 09:15:16 -0800 Subject: [PATCH 11/26] generate coverage barcode matrix --- marine.py | 12 ++--- src/utils.py | 133 +++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 126 insertions(+), 19 deletions(-) diff --git a/marine.py b/marine.py index 9767cb6..b1b1338 100755 --- a/marine.py +++ b/marine.py @@ -178,7 +178,7 @@ def generate_depths(output_folder, bam_filepaths, paired_end=False): generate_and_run_bash_merge(output_folder, '{}/final_edit_info_no_coverage.tsv'.format(output_folder), - '{}/coverage/depths_source_cells.txt'.format(output_folder), + '{}/depths_source_cells.txt'.format(output_folder), '{}/final_edit_info.tsv'.format(output_folder), header_columns) coverage_total_time = time.perf_counter() - coverage_start_time @@ -499,14 +499,8 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand make_depth_command_script(paired_end, reconfigured_bam_filepaths, output_folder, - output_suffix='all_cells', run=True) - # Read the universal coverage data - df = pd.read_csv("{}/coverage/depths_all_cells.txt".format(output_folder), sep="\t", header=None, names=["Contig", "Position", "Coverage"]) - # Pivot the data to make a matrix of contig x position with values being coverage - pivot = df.pivot(index="Contig", columns="Position", values="Coverage").fillna(0) - # Write to output - pivot.to_csv("{}/comprehensive_coverage_matrix.tsv".format(output_folder), sep="\t") - + output_suffix='all_cells', run=True, pivot=True) + if not keep_intermediate_files: pretty_print("Deleting intermediate files...", style="-") delete_intermediate_files(output_folder) diff --git a/src/utils.py b/src/utils.py index 750f32e..1453b6f 100644 --- a/src/utils.py +++ b/src/utils.py @@ -692,7 +692,7 @@ def get_samtools_depth_commands(paired_end, bam_filepaths, output_folder, output for i, bam_filepath in enumerate(bam_filepaths): depth_command = ( - "echo 'running samtools depth on {}...';" + "echo ' running samtools depth 
on {}...';"
+            "samtools depth {}-a -b {}/combined{}.bed {} >> {}/coverage/depths{}{}.txt".format(
                 bam_filepath,
                 paired_end_option,
                 output_folder,
                 output_suffix,
                 bam_filepath,
                 output_folder,
+                i,
                 output_suffix))
         all_depth_commands.append(depth_command)
     return all_depth_commands
 
+def generate_empty_matrix_file(matrix_output_filepath):
+    pass
+
+
+def pivot_depths_output(depths_input_filepath, matrix_output_filepath):
+    """
+    Transform depths data into a pivoted matrix with reformatted row and column labels.
+
+    The input rows look like this:
+
+    9_GATCCCTCAGTAACGG-1    3000448 1
+    9_GATCCCTCAGTAACGG-1    3000469 1
+    9_GATCCCTCAGTAACGG-1    3000508 3
+
+    We first reformat them to look like this:
+
+    GATCCCTCAGTAACGG-1  9:3000448   1
+    GATCCCTCAGTAACGG-1  9:3000469   1
+    GATCCCTCAGTAACGG-1  9:3000508   3
+
+    # And after the pivot (positions as rows, barcodes as columns):
+                GATCCCTCAGTAACGG-1
+    9:3000448   1
+    9:3000469   1
+    9:3000508   3
+
+    """
+    # Read the universal coverage data
+    df = pd.read_csv(depths_input_filepath, sep="\t", header=None, names=["Contig", "Position", "Coverage"])
+
+    # Extract the barcode from the Contig and create a combined contig:position column
+    df["Barcode"] = df["Contig"].str.split("_", n=1).str[1]  # Extract everything after the first underscore
+    df["CombinedPosition"] = df["Contig"].str.split("_", n=1).str[0] + ":" + df["Position"].astype(str)
+
+    # Drop the original Contig and Position columns (optional, for clarity)
+    df = df[["Barcode", "CombinedPosition", "Coverage"]]
+
+    # Pivot the data to make a matrix of CombinedPosition x Barcode with values being Coverage
+    pivot = df.pivot(index="CombinedPosition", columns="Barcode", values="Coverage").fillna(0)
+
+    # Write to output
+    pivot.to_csv(matrix_output_filepath, sep="\t")
+
+
+def run_command(command):
+    # Execute a shell command; check=True raises CalledProcessError on failure
+    subprocess.run(command, shell=True, check=True)
+
+
+def run_depth_command(command, pivot=False, output_matrix_folder=None):
+    """
+    Run a shell command and optionally process the output file with the pivot function. 
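+
+    Example (illustrative paths; the depths filepath is recovered by splitting
+    the command on its '>>' redirection):
+
+        cmd = ("samtools depth -a -b out/combined.bed "
+               "out/split_bams/9_A-1/9_A-1.bam >> out/coverage/depths0.txt")
+        run_depth_command(cmd, pivot=True, output_matrix_folder="out/matrix_outputs")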
+ """ try: - subprocess.run(command, shell=True, check=True) + # Extract output file from the command + depths_file = command.split(">>")[-1].strip() + + # Check if the depths file exists and is non-empty + if not os.path.exists(depths_file) or os.path.getsize(depths_file) == 0: + # File doesn't exist or is empty, run the command + print(f"Running coverage command for {depths_file}...") + run_command(command) + + else: + print(f"Skipping coverage generation for existing non-empty file: {depths_file}") + + if pivot: + matrix_output_file = os.path.join(output_matrix_folder, f"{os.path.basename(depths_file).split('.')[0]}_matrix.tsv") + + if not os.path.exists(matrix_output_file) or os.path.getsize(matrix_output_file) == 0: + # Pivot the depths output file into a matrix + pivot_depths_output(depths_file, matrix_output_file) + except subprocess.CalledProcessError as e: - print(f"Command failed: {command}\nError: {e}") + print(f"Command failed: {command}\nError: {e}", file=sys.stderr) raise e -def make_depth_command_script(paired_end, bam_filepaths, output_folder, all_depth_commands=[], output_suffix='', run=False, processes=4): + + +def make_depth_command_script(paired_end, bam_filepaths, output_folder, all_depth_commands=[], output_suffix='', + run=False, pivot=False, processes=4): samtools_depth_start_time = time.perf_counter() samtools_depth_commands = get_samtools_depth_commands(paired_end, bam_filepaths, @@ -732,23 +803,65 @@ def make_depth_command_script(paired_end, bam_filepaths, output_folder, all_dept f.write('{}\n\n'.format(depth_command)) if run: + print("Calculating depths using multiprocessing...") try: + # Create a folder for matrix outputs if pivoting is enabled + output_matrix_folder = f"{output_folder}/matrix_outputs" + if pivot: + os.makedirs(output_matrix_folder, exist_ok=True) + + # Prepare arguments for starmap + run_command_args = [ + (command, pivot, output_matrix_folder) for command in samtools_depth_commands + ] + + # Use starmap to pass multiple arguments to run_command with Pool(processes=processes) as pool: - pool.map(run_command, samtools_depth_commands) + pool.starmap(run_depth_command, run_command_args) + except Exception as e: # If any command fails, print error and exit print(f"Aborting: One or more commands failed with an error: {e}", file=sys.stderr) sys.exit(1) # Exit with an error code print("Done calculating depths. 
Merging depths...") - run_command("cat {}/coverage/depths*{}.txt > {}/coverage/depths{}.txt".format(output_folder, output_suffix, - output_folder, output_suffix)) - + run_command(f"cat {output_folder}/coverage/depths*{output_suffix}.txt > {output_folder}/depths{output_suffix}.txt") print("Done merging depths.") + + if pivot: + print("Merging matrices column-wise...") + + # Create a list of non-empty matrix files + matrix_files = [ + os.path.join(output_matrix_folder, file) + for file in os.listdir(output_matrix_folder) + if os.path.getsize(os.path.join(output_matrix_folder, file)) > 20 # Skip empty files + ] + + if len(matrix_files) > 0: + # Ensure only the first file keeps the row headers + first_file = matrix_files[0] + other_files = matrix_files[1:] + + merged_file = os.path.join(output_folder, "comprehensive_coverage_matrix.tsv") + + # Prepare the paste command + strip_headers_command = " ".join( + [f"<(cut -f2- {file})" for file in other_files] + ) + paste_command = f"paste {first_file} {strip_headers_command} > {merged_file}" + + # Use bash to execute the paste command + run_command(f"bash -c '{paste_command}'") + + print(f"Columnar merge complete. Output saved to {merged_file}.") + else: + print("No non-empty matrix files to merge.") + samtools_depth_total_time = time.perf_counter() - samtools_depth_start_time - print('Total seconds for samtools depth commands to run: {}'.format(samtools_depth_total_time)) + print('Total seconds for samtools depth and matrix generation commands to run: {}'.format(samtools_depth_total_time)) def calculate_sailor_score(sailor_row): From f9c6b4326a2f3f471023b86fc5348ea223661def Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Tue, 26 Nov 2024 13:13:12 -0800 Subject: [PATCH 12/26] pre refactor --- marine.py | 70 +++++++++++++++++++++++++++++++++++++++++++++++++--- src/utils.py | 51 ++++++++++++++++++++++++-------------- 2 files changed, 98 insertions(+), 23 deletions(-) diff --git a/marine.py b/marine.py index b1b1338..495dd73 100755 --- a/marine.py +++ b/marine.py @@ -145,6 +145,44 @@ def get_broken_up_contigs(contigs, num_per_sublist): broken_up_contigs.append(contig_sublist) return broken_up_contigs +def split_bed_file(input_bed_file, output_folder, bam_file_paths, output_suffix='', run=False): + """ + Split a combined BED file into multiple BED files based on BAM file suffixes. + Match rows containing both the chromosome prefix and the cell ID suffix. + """ + all_awk_commands = [] + + if not os.path.exists(output_folder): + os.makedirs(output_folder) + + # Extract chromosome and suffix pairs from BAM file paths + suffix_pairs = [ + (os.path.basename(bam).split("_")[0], os.path.basename(bam).split("_")[1].split(".")[0]) + for bam in bam_file_paths + ] + + # Run a grep/awk command for each suffix pair + for chrom, suffix in suffix_pairs: + output_bed_file = os.path.join(output_folder, f"combined_{output_suffix}_{chrom}_{suffix}.bed") + awk_command = f"awk '$1 ~ /^{chrom}_/ && $1 ~ /{suffix}/' {input_bed_file} > {output_bed_file}" + + all_awk_commands.append(awk_command) + + if run: + try: + #print(f"Running: {awk_command}") + run_command(awk_command) + + # Check if the output file is empty + if os.path.getsize(output_bed_file) == 0: + #print(f"No matches found for {chrom}_{suffix}. Removing empty file.") + os.remove(output_bed_file) + + except subprocess.CalledProcessError as e: + #print(f"No matches found for {chrom}_{suffix}. 
Skipping...") + pass + + return all_awk_commands def generate_depths(output_folder, bam_filepaths, paired_end=False): @@ -158,14 +196,26 @@ def generate_depths(output_folder, bam_filepaths, paired_end=False): "awk 'NR > 1 {{print $2, $4-1, $4}}' OFS='\t' \"$file\"; " "done | sort -k1,1 -k2,2n -u > {}/combined_source_cells.bed;" ).format(output_folder, output_folder) - run_command(combine_edit_sites_command) all_depth_commands.append(combine_edit_sites_command) + output_suffix = 'source_cells' + if len(bam_filepaths) > 1: + separate_edit_sites_commands = split_bed_file( + f"{output_folder}/combined_{output_suffix}.bed", + f"{output_folder}/combined_{output_suffix}_split_by_suffix", + bam_filepaths, + output_suffix=output_suffix, + run=True + ) + + all_depth_commands += separate_edit_sites_commands + make_depth_command_script(paired_end, bam_filepaths, output_folder, all_depth_commands=all_depth_commands, output_suffix='source_cells', run=True) + print("Concatenating edit info files...") concatenate_files(output_folder, "edit_info/*edit_info.tsv", "{}/final_edit_info_no_coverage.tsv".format(output_folder), @@ -259,7 +309,7 @@ def generate_combined_sites_bed_for_all_cells(output_folder): unique_positions = df_for_chromosome[["Start", "End"]].drop_duplicates().values.tolist() # Unique location pairs # Open the output file and write combinations line by line - with open(output_file, "w") as f: + with open(output_file, "a") as f: for contig in unique_contigs: for start, end in unique_positions: f.write(f"{contig}\t{start}\t{end}\n") @@ -495,11 +545,23 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand generate_combined_sites_bed_for_all_cells(output_folder) # We want to run the samtools depth command for each of the reconfigured bam files - reconfigured_bam_filepaths = glob('{}/split_bams/*/*.bam'.format(output_folder)) + bam_filepaths = glob('{}/split_bams/*/*.bam'.format(output_folder)) + output_suffix = "all_cells" + if len(bam_filepaths) > 1: + separate_edit_sites_commands = split_bed_file( + f"{output_folder}/combined_{output_suffix}.bed", + f"{output_folder}/combined_{output_suffix}_split_by_suffix", + bam_filepaths, + output_suffix=output_suffix, + run=True + ) + make_depth_command_script(paired_end, reconfigured_bam_filepaths, output_folder, - output_suffix='all_cells', run=True, pivot=True) + output_suffix=output_suffix, + run=True, + pivot=True) if not keep_intermediate_files: pretty_print("Deleting intermediate files...", style="-") diff --git a/src/utils.py b/src/utils.py index 1453b6f..f88985d 100644 --- a/src/utils.py +++ b/src/utils.py @@ -680,31 +680,41 @@ def generate_and_run_bash_merge(output_folder, file1_path, file2_path, output_fi def get_samtools_depth_commands(paired_end, bam_filepaths, output_folder, output_suffix=''): + """ + Generate samtools depth commands for each BAM file using its corresponding split BED file. 
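+
+    For example (illustrative, with output_suffix 'all_cells'), a BAM named
+    9_A-1.bam is paired with the split BED file
+    combined_all_cells_split_by_suffix/combined_all_cells_9_A-1.bed, and its
+    depths are appended to coverage/depths_all_cells_9_A-1.txt; BAMs whose
+    split BED file does not exist (no edits on that subset) are skipped.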
+ """ all_depth_commands = [] + # Format output suffix if len(output_suffix) > 0: output_suffix = '_{}'.format(output_suffix) - if paired_end: - paired_end_option = '-s ' - else: - paired_end_option = '' + # Paired-end option for samtools depth + paired_end_option = '-s ' if paired_end else '' + print("\n\tGetting suffixes from {} bam files...".format(len(bam_filepaths))) for i, bam_filepath in enumerate(bam_filepaths): - depth_command = ( - "echo ' running samtools depth on {}...';" - "samtools depth {}-a -b {}/combined{}.bed {} >> {}/coverage/depths{}{}.txt".format( - bam_filepath, - paired_end_option, - output_folder, - output_suffix, - bam_filepath, - output_folder, - i, - output_suffix)) - all_depth_commands.append(depth_command) + # Extract suffix from BAM filename + bam_filename = os.path.basename(bam_filepath) + bam_prefix, bam_suffix = bam_filename.split("_")[0], bam_filename.split("_")[1].split(".")[0] + + # Path to the corresponding split BED file + split_bed_file = f"{output_folder}/combined{output_suffix}_split_by_suffix/combined{output_suffix}_{bam_prefix}_{bam_suffix}.bed" + + if os.path.exists(split_bed_file): + # Construct the samtools depth command + depth_command = ( + f"echo ' running samtools depth on {bam_filepath}...';" + f"samtools depth {paired_end_option}-a -b {split_bed_file} {bam_filepath} >> {output_folder}/coverage/depths{output_suffix}_{bam_prefix}_{bam_suffix}.txt" + ) + + # Add to the list of commands + all_depth_commands.append(depth_command) + + print("\n\tOnly {} of the bam files had edits".format(len(all_depth_commands))) return all_depth_commands + def generate_empty_matrix_file(matrix_output_filepath): pass @@ -794,6 +804,7 @@ def make_depth_command_script(paired_end, bam_filepaths, output_folder, all_dept all_depth_commands += samtools_depth_commands + print('\n\t\tsamtools_depth_commands: {}'.format(len(samtools_depth_commands))) if len(output_suffix) > 0: output_suffix = '_{}'.format(output_suffix) @@ -826,7 +837,7 @@ def make_depth_command_script(paired_end, bam_filepaths, output_folder, all_dept sys.exit(1) # Exit with an error code print("Done calculating depths. 
Merging depths...") - run_command(f"cat {output_folder}/coverage/depths*{output_suffix}.txt > {output_folder}/depths{output_suffix}.txt") + run_command(f"cat {output_folder}/coverage/depths{output_suffix}*.txt > {output_folder}/depths{output_suffix}.txt") print("Done merging depths.") @@ -1013,9 +1024,11 @@ def zero_edit_found(final_site_level_information_df, output_folder, sailor_list, def delete_intermediate_files(output_folder): to_delete = ['coverage', 'edit_info', 'split_bams', 'all_edit_info.tsv', - 'concat_command.sh', 'depth_command.sh', 'combined.bed', 'merge_command.sh', + 'concat_command.sh', 'depth_command_source_cells.sh', 'combined.bed', 'merge_command.sh', 'final_edit_info_no_coverage.tsv', 'final_edit_info_no_coverage_sorted.tsv', - 'depth_modified.tsv', 'final_edit_info.tsv', 'final_filtered_edit_info.tsv'] + 'depths_source_cells.txt', 'depth_modified.tsv', 'final_edit_info.tsv', 'final_filtered_edit_info.tsv', + 'combined_all_cells.bed', 'depth_command_all_cells.sh', 'depths_all_cells.txt', 'combined_source_cells.bed' + ] for object in to_delete: object_path = '{}/{}'.format(output_folder, object) From a26b2d2993f9385674813c68209af5713670054a Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Tue, 26 Nov 2024 13:37:51 -0800 Subject: [PATCH 13/26] final matrices generated per chrom --- src/utils.py | 180 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 112 insertions(+), 68 deletions(-) diff --git a/src/utils.py b/src/utils.py index f88985d..aa376ac 100644 --- a/src/utils.py +++ b/src/utils.py @@ -793,86 +793,130 @@ def run_depth_command(command, pivot=False, output_matrix_folder=None): raise e +def merge_files_by_chromosome(args): + """ + Helper function to merge files for a single chromosome. + """ + chromosome, files, output_folder = args + first_file = files[0] + other_files = files[1:] + merged_file = os.path.join(output_folder, f"{chromosome}_comprehensive_coverage_matrix.tsv") + + # Prepare the paste command + strip_headers_command = " ".join( + [f"<(cut -f2- {file})" for file in other_files] + ) + paste_command = f"paste {first_file} {strip_headers_command} > {merged_file}" -def make_depth_command_script(paired_end, bam_filepaths, output_folder, all_depth_commands=[], output_suffix='', - run=False, pivot=False, processes=4): - samtools_depth_start_time = time.perf_counter() + # Use bash to execute the paste command + run_command(f"bash -c '{paste_command}'") + print(f"Columnar merge complete for {chromosome}. Output saved to {merged_file}.") + + +def prepare_matrix_files_multiprocess(output_matrix_folder, + output_folder, + processes=4): + """ + Merges matrix files column-wise, grouping by chromosome, using multiprocessing. 
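+
+    For example (illustrative), depths_all_cells_9_A-1_matrix.tsv and
+    depths_all_cells_9_C-1_matrix.tsv both map to chromosome 9 and are pasted
+    column-wise into 9_comprehensive_coverage_matrix.tsv.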
+ """ + print("Merging matrices column-wise by chromosome...") + + # Group files by chromosome + matrix_files = [ + os.path.join(output_matrix_folder, file) + for file in os.listdir(output_matrix_folder) + if file.endswith("matrix.tsv") and os.path.getsize(os.path.join(output_matrix_folder, file)) > 20 # Skip empty files + ] + if not matrix_files: + print("No non-empty matrix files to merge.") + return + + # Group files by chromosome + file_groups = {} + for file in matrix_files: + chromosome = os.path.basename(file).split("all_cells_")[-1].split("_")[0] # Adjust this split logic as needed + file_groups.setdefault(chromosome, []).append(file) + + # Prepare arguments for multiprocessing + task_args = [(chromosome, files, output_folder) for chromosome, files in file_groups.items()] + + # Use multiprocessing to run the merge for each chromosome + with Pool(processes=processes) as pool: + pool.map(merge_files_by_chromosome, task_args) + + print("All columnar merges complete.") + + + +def merge_depth_files(output_folder, output_suffix=''): + """ + Merges depth files into a single file. + """ + print("Merging depths...") + + depth_files = f"{output_folder}/coverage/depths_{output_suffix}*.txt" + merged_depth_file = f"{output_folder}/depths_{output_suffix}.txt" + run_command(f"cat {depth_files} > {merged_depth_file}") + print(f"Depths merged into {merged_depth_file}.") + + +def run_samtools_depth(commands, processes, pivot, output_matrix_folder): + """ + Runs samtools depth commands in parallel using multiprocessing. + """ + try: + # Prepare arguments for starmap + run_command_args = [ + (command, pivot, output_matrix_folder) for command in commands + ] + + with Pool(processes=processes) as pool: + pool.starmap(run_depth_command, run_command_args) + + except Exception as e: + print(f"Aborting: One or more commands failed with an error: {e}", file=sys.stderr) + sys.exit(1) # Exit with an error code + + +def make_depth_command_script(paired_end, bam_filepaths, output_folder, all_depth_commands=[], + output_suffix='', run=False, pivot=False, processes=4): + """ + Main function to generate and execute samtools depth commands, and optionally pivot and merge matrices. 
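+
+    With run=True this performs three steps: execute the depth commands in
+    parallel (run_samtools_depth), concatenate the per-BAM depth files
+    (merge_depth_files), and, when pivot=True, merge the per-BAM matrices by
+    chromosome (prepare_matrix_files_multiprocess).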
+ """ + samtools_depth_start_time = time.perf_counter() + + # Generate samtools depth commands samtools_depth_commands = get_samtools_depth_commands(paired_end, bam_filepaths, output_folder, output_suffix=output_suffix) - all_depth_commands += samtools_depth_commands - print('\n\t\tsamtools_depth_commands: {}'.format(len(samtools_depth_commands))) - if len(output_suffix) > 0: - output_suffix = '_{}'.format(output_suffix) - - depth_bash = '{}/depth_command{}.sh'.format(output_folder, output_suffix) - with open(depth_bash, 'w') as f: - for depth_command in all_depth_commands: - f.write('{}\n\n'.format(depth_command)) + print(f"\tsamtools_depth_commands: {len(samtools_depth_commands)}") - if run: - + # Create a folder for matrix outputs if pivoting is enabled + output_matrix_folder = f"{output_folder}/matrix_outputs" + final_combined_matrices_folder = f"{output_folder}/final_matrix_outputs" + if pivot: + os.makedirs(output_matrix_folder, exist_ok=True) + os.makedirs(final_combined_matrices_folder, exist_ok=True) + + if run: print("Calculating depths using multiprocessing...") - try: - # Create a folder for matrix outputs if pivoting is enabled - output_matrix_folder = f"{output_folder}/matrix_outputs" - if pivot: - os.makedirs(output_matrix_folder, exist_ok=True) - - # Prepare arguments for starmap - run_command_args = [ - (command, pivot, output_matrix_folder) for command in samtools_depth_commands - ] - - # Use starmap to pass multiple arguments to run_command - with Pool(processes=processes) as pool: - pool.starmap(run_depth_command, run_command_args) - - except Exception as e: - # If any command fails, print error and exit - print(f"Aborting: One or more commands failed with an error: {e}", file=sys.stderr) - sys.exit(1) # Exit with an error code - - print("Done calculating depths. Merging depths...") - run_command(f"cat {output_folder}/coverage/depths{output_suffix}*.txt > {output_folder}/depths{output_suffix}.txt") - print("Done merging depths.") + run_samtools_depth(samtools_depth_commands, processes, pivot, output_matrix_folder) + + merge_depth_files(output_folder, output_suffix) - if pivot: - print("Merging matrices column-wise...") - - # Create a list of non-empty matrix files - matrix_files = [ - os.path.join(output_matrix_folder, file) - for file in os.listdir(output_matrix_folder) - if os.path.getsize(os.path.join(output_matrix_folder, file)) > 20 # Skip empty files - ] - - if len(matrix_files) > 0: - # Ensure only the first file keeps the row headers - first_file = matrix_files[0] - other_files = matrix_files[1:] - - merged_file = os.path.join(output_folder, "comprehensive_coverage_matrix.tsv") - - # Prepare the paste command - strip_headers_command = " ".join( - [f"<(cut -f2- {file})" for file in other_files] - ) - paste_command = f"paste {first_file} {strip_headers_command} > {merged_file}" - - # Use bash to execute the paste command - run_command(f"bash -c '{paste_command}'") - - print(f"Columnar merge complete. 
Output saved to {merged_file}.") - else: - print("No non-empty matrix files to merge.") - + prepare_matrix_files_multiprocess( + output_matrix_folder, + final_combined_matrices_folder, + processes=processes + ) + samtools_depth_total_time = time.perf_counter() - samtools_depth_start_time - print('Total seconds for samtools depth and matrix generation commands to run: {}'.format(samtools_depth_total_time)) + print(f"Total seconds for samtools depth and matrix generation commands to run: {samtools_depth_total_time}") + def calculate_sailor_score(sailor_row): From c09d876d606122b32bbfa3f43865e79a5d61385f Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Thu, 28 Nov 2024 12:32:04 -0800 Subject: [PATCH 14/26] finish sc splitting approach for all cell --- marine.py | 19 +++++++++++++++---- src/core.py | 2 +- src/utils.py | 42 +++++++++++++++++++++++++++--------------- 3 files changed, 43 insertions(+), 20 deletions(-) diff --git a/marine.py b/marine.py index 495dd73..8a1c60b 100755 --- a/marine.py +++ b/marine.py @@ -93,7 +93,7 @@ def bam_processing(overall_label_to_list_of_contents, output_folder, barcode_tag contigs_to_generate_bams_for = get_contigs_that_need_bams_written(list(overall_label_to_list_of_contents.keys()), split_bams_folder, barcode_tag=barcode_tag, - number_of_expected_bams=number_of_expected_bams + number_of_expected_bams=number_of_expected_bams ) if verbose: pretty_print("Will split and reconfigure the following contigs: {}".format(",".join(contigs_to_generate_bams_for))) @@ -213,7 +213,7 @@ def generate_depths(output_folder, bam_filepaths, paired_end=False): all_depth_commands += separate_edit_sites_commands make_depth_command_script(paired_end, bam_filepaths, output_folder, - all_depth_commands=all_depth_commands, output_suffix='source_cells', run=True) + all_depth_commands=all_depth_commands, output_suffix='source_cells', run=True, processes=cores) print("Concatenating edit info files...") @@ -406,7 +406,8 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand pretty_print("Contigs processed\n\n\t{}".format(sorted(list(overall_label_to_list_of_contents.keys())))) pretty_print("Splitting and reconfiguring BAMs to optimize coverage calculations", style="~") # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - + + total_bam_generation_time, total_seconds_for_bams_df = bam_processing(overall_label_to_list_of_contents, output_folder, barcode_tag=barcode_tag, cores=cores, number_of_expected_bams=number_of_expected_bams, verbose=verbose) #total_seconds_for_bams_df.to_csv("{}/bam_reconfiguration_timing.tsv".format(logging_folder), sep='\t') pretty_print("Total time to concat and write bams: {} minutes".format(round(total_bam_generation_time/60, 3))) @@ -561,7 +562,9 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand output_folder, output_suffix=output_suffix, run=True, - pivot=True) + pivot=True, + processes=cores + ) if not keep_intermediate_files: pretty_print("Deleting intermediate files...", style="-") @@ -654,6 +657,14 @@ def check_samtools(): num_per_sublist = args.num_per_sublist + """ + # Convert all_cells_coverage into list of conversions for which to output all cell info + if not all_cells_coverage is None: + all_cells_coverage_list = all_cells_coverage.upper().replace('I', 'G').split(',') + for c in all_cells_coverage_list: + assert(c in ['AC', 'AG', 'AT', 'CA', 'CG', 'CT', 'GA', 'GC', 'GT', 'TA', 'TC', 'TG']) + """ + # Convert bedgraphs argument into 
list of conversions if not bedgraphs is None: if barcode_tag in ['CB', 'IB']: diff --git a/src/core.py b/src/core.py index b96123c..8b9a1ef 100644 --- a/src/core.py +++ b/src/core.py @@ -149,7 +149,7 @@ def find_edits(bampath, contig, split_index, start, end, output_folder, barcode_ output_file = '{}/{}_{}_{}_{}_edit_info.tsv'.format(edit_info_subfolder, contig, split_index, start, end) output_bedfile = '{}/{}_{}_{}_{}_edit_positions.bed'.format(edit_info_subfolder, contig, split_index, start, end) - + remove_file_if_exists(output_file) with open(output_file, 'w') as f: diff --git a/src/utils.py b/src/utils.py index aa376ac..e7b1de6 100644 --- a/src/utils.py +++ b/src/utils.py @@ -15,7 +15,7 @@ import multiprocessing import time -cb_n = 1 +CB_N = 2 def generate_permutations_list_for_CB(n): """ @@ -38,7 +38,7 @@ def generate_permutations_list_for_CB(n): return result suffixes = { - 'CB': generate_permutations_list_for_CB(cb_n), + 'CB': generate_permutations_list_for_CB(CB_N), 'IS': [ "00","01","02","03","04","05","06","07","08","09", "10","11","12","13","14","15","16","17","18","19", @@ -104,7 +104,7 @@ def get_contigs_that_need_bams_written(expected_contigs, split_bams_folder, barc subsets_per_contig[contig_label] += 1 if barcode_tag == 'CB': - number_of_expected_bams = cb_n + number_of_expected_bams = 4**CB_N else: number_of_expected_bams = number_of_expected_bams @@ -582,6 +582,18 @@ def concat_and_write_bams(contig, df_dict, header_string, split_bams_folder, bar print("\t{} suffixes".format(len(suffix_options))) for suffix in suffix_options: + # Make a sub-subfolder to put the bams for this specific contig + contig_folder = '{}/{}_{}/'.format(split_bams_folder, contig, suffix) + if not os.path.exists(contig_folder): + os.mkdir(contig_folder) + + + bam_file_name = '{}/{}_{}.bam'.format(contig_folder, contig, suffix) + + if os.path.exists(f"{bam_file_name}.bai"): + print(f"{bam_file_name}.bai, skipping...") + return + if barcode_tag: all_contents_for_suffix = all_contents_df.filter(pl.col('barcode').str.ends_with(suffix)) else: @@ -619,15 +631,6 @@ def concat_and_write_bams(contig, df_dict, header_string, split_bams_folder, bar reads_count_for_suffix )) sys.exit(1) - - - # Make a sub-subfolder to put the bams for this specific contig - contig_folder = '{}/{}_{}/'.format(split_bams_folder, contig, suffix) - if not os.path.exists(contig_folder): - os.mkdir(contig_folder) - - - bam_file_name = '{}/{}_{}.bam'.format(contig_folder, contig, suffix) # Write, sort and index bam immediately write_reads_to_file(reads_deduped, bam_file_name, header_string) @@ -639,6 +642,7 @@ def concat_and_write_bams(contig, df_dict, header_string, split_bams_folder, bar rm_bam(bam_file_name) except Exception as e: print("Failed at indexing {}".format(bam_file_name)) + raise e def concat_and_write_bams_wrapper(params): @@ -779,7 +783,7 @@ def run_depth_command(command, pivot=False, output_matrix_folder=None): run_command(command) else: - print(f"Skipping coverage generation for existing non-empty file: {depths_file}") + print(f"\t\tSkipping coverage generation for existing non-empty file: {depths_file}") if pivot: matrix_output_file = os.path.join(output_matrix_folder, f"{os.path.basename(depths_file).split('.')[0]}_matrix.tsv") @@ -915,7 +919,12 @@ def make_depth_command_script(paired_end, bam_filepaths, output_folder, all_dept ) samtools_depth_total_time = time.perf_counter() - samtools_depth_start_time - print(f"Total seconds for samtools depth and matrix generation commands to run: {samtools_depth_total_time}") 
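+    # Only mention matrix generation in the timing message when pivoting was
+    # actually performed.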
+ + + if pivot: + print(f"Total seconds for samtools depth and matrix generation commands to run: {samtools_depth_total_time}") + else: + print(f"Total seconds for samtools depth commands to run: {samtools_depth_total_time}") @@ -1067,7 +1076,10 @@ def zero_edit_found(final_site_level_information_df, output_folder, sailor_list, def delete_intermediate_files(output_folder): - to_delete = ['coverage', 'edit_info', 'split_bams', 'all_edit_info.tsv', + to_delete = ['coverage', 'edit_info', 'split_bams', 'combined_all_cells_split_by_suffix', + 'combined_source_cells_split_by_suffix', + 'matrix_outputs', + 'all_edit_info.tsv', 'concat_command.sh', 'depth_command_source_cells.sh', 'combined.bed', 'merge_command.sh', 'final_edit_info_no_coverage.tsv', 'final_edit_info_no_coverage_sorted.tsv', 'depths_source_cells.txt', 'depth_modified.tsv', 'final_edit_info.tsv', 'final_filtered_edit_info.tsv', From f5f7e534ae3df0d1a54776e9efb467883adcb823 Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Thu, 28 Nov 2024 13:37:09 -0800 Subject: [PATCH 15/26] more efficient bed split --- marine.py | 69 +++++++++++++++++++++++----------------------------- src/utils.py | 7 ++++-- 2 files changed, 36 insertions(+), 40 deletions(-) diff --git a/marine.py b/marine.py index 8a1c60b..f9225e2 100755 --- a/marine.py +++ b/marine.py @@ -145,44 +145,41 @@ def get_broken_up_contigs(contigs, num_per_sublist): broken_up_contigs.append(contig_sublist) return broken_up_contigs -def split_bed_file(input_bed_file, output_folder, bam_file_paths, output_suffix='', run=False): +def split_bed_file(input_bed_file, output_folder, bam_filepaths, output_suffix=''): """ - Split a combined BED file into multiple BED files based on BAM file suffixes. - Match rows containing both the chromosome prefix and the cell ID suffix. + Split a BED file into multiple files based on suffixes in the first column. + Each line is assigned to the appropriate file based on the suffix. """ - all_awk_commands = [] - if not os.path.exists(output_folder): os.makedirs(output_folder) - # Extract chromosome and suffix pairs from BAM file paths suffix_pairs = [ - (os.path.basename(bam).split("_")[0], os.path.basename(bam).split("_")[1].split(".")[0]) - for bam in bam_file_paths + (os.path.basename(bam).split("_")[0], + os.path.basename(bam).split("_")[1].split(".")[0]) for bam in bam_filepaths ] - - # Run a grep/awk command for each suffix pair - for chrom, suffix in suffix_pairs: - output_bed_file = os.path.join(output_folder, f"combined_{output_suffix}_{chrom}_{suffix}.bed") - awk_command = f"awk '$1 ~ /^{chrom}_/ && $1 ~ /{suffix}/' {input_bed_file} > {output_bed_file}" - all_awk_commands.append(awk_command) + # Open file handles for each suffix + file_handles = {} + for prefix, suffix in suffix_pairs: + output_file = os.path.join(output_folder, f"combined_{output_suffix}_{prefix}_{suffix}.bed") + file_handles[prefix + suffix] = open(output_file, 'w') - if run: - try: - #print(f"Running: {awk_command}") - run_command(awk_command) - - # Check if the output file is empty - if os.path.getsize(output_bed_file) == 0: - #print(f"No matches found for {chrom}_{suffix}. Removing empty file.") - os.remove(output_bed_file) - - except subprocess.CalledProcessError as e: - #print(f"No matches found for {chrom}_{suffix}. 
Skipping...") - pass - - return all_awk_commands + try: + with open(input_bed_file, 'r') as infile: + for line in infile: + # Parse the first column to determine the suffix + columns = line.split() + chrom = columns[0] # Assuming the first column is the chromosome + for prefix, suffix in suffix_pairs: + if chrom.startswith(prefix) and chrom.endswith(suffix): + file_handles[prefix + suffix].write(line) + break + + finally: + # Close all file handles + for handle in file_handles.values(): + handle.close() + def generate_depths(output_folder, bam_filepaths, paired_end=False): @@ -202,16 +199,13 @@ def generate_depths(output_folder, bam_filepaths, paired_end=False): output_suffix = 'source_cells' if len(bam_filepaths) > 1: - separate_edit_sites_commands = split_bed_file( + split_bed_file( f"{output_folder}/combined_{output_suffix}.bed", f"{output_folder}/combined_{output_suffix}_split_by_suffix", - bam_filepaths, - output_suffix=output_suffix, - run=True + bam_filepaths, + output_suffix=output_suffix ) - all_depth_commands += separate_edit_sites_commands - make_depth_command_script(paired_end, bam_filepaths, output_folder, all_depth_commands=all_depth_commands, output_suffix='source_cells', run=True, processes=cores) @@ -549,12 +543,11 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand bam_filepaths = glob('{}/split_bams/*/*.bam'.format(output_folder)) output_suffix = "all_cells" if len(bam_filepaths) > 1: - separate_edit_sites_commands = split_bed_file( + split_bed_file( f"{output_folder}/combined_{output_suffix}.bed", f"{output_folder}/combined_{output_suffix}_split_by_suffix", bam_filepaths, - output_suffix=output_suffix, - run=True + output_suffix=output_suffix ) make_depth_command_script(paired_end, diff --git a/src/utils.py b/src/utils.py index e7b1de6..b532cf8 100644 --- a/src/utils.py +++ b/src/utils.py @@ -15,7 +15,7 @@ import multiprocessing import time -CB_N = 2 +CB_N = 1 def generate_permutations_list_for_CB(n): """ @@ -779,7 +779,6 @@ def run_depth_command(command, pivot=False, output_matrix_folder=None): # Check if the depths file exists and is non-empty if not os.path.exists(depths_file) or os.path.getsize(depths_file) == 0: # File doesn't exist or is empty, run the command - print(f"Running coverage command for {depths_file}...") run_command(command) else: @@ -898,6 +897,10 @@ def make_depth_command_script(paired_end, bam_filepaths, output_folder, all_dept print(f"\tsamtools_depth_commands: {len(samtools_depth_commands)}") + with open('{}/depth_commands_{}.txt'.format(output_folder, output_suffix), 'w') as f: + for d in all_depth_commands: + f.write('{}\n\n'.format(d)) + # Create a folder for matrix outputs if pivoting is enabled output_matrix_folder = f"{output_folder}/matrix_outputs" final_combined_matrices_folder = f"{output_folder}/final_matrix_outputs" From 463690a3ee58ef13295b5e29d0e20d83d62d28eb Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Thu, 28 Nov 2024 14:45:26 -0800 Subject: [PATCH 16/26] samtools count instead of view --- src/utils.py | 180 ++++++++++++++++++++++++++------------------------- 1 file changed, 91 insertions(+), 89 deletions(-) diff --git a/src/utils.py b/src/utils.py index b532cf8..5e91f70 100644 --- a/src/utils.py +++ b/src/utils.py @@ -660,7 +660,7 @@ def generate_and_run_bash_merge(output_folder, file1_path, file2_path, output_fi bash_command = f"""#!/bin/bash # Step 1: Adjust the depths file by adding a new column that incorporates both contig and position # for join purposes, and sort by this column. 
Output in tab-separated format. - awk -v OFS='\\t' '{{print $1, $2-1, $3, $1":"($2)}}' "{file2_path}" | sort -k4,4n | uniq > {output_folder}/depth_modified.tsv + awk -v OFS='\\t' '{{print $1, $2+1, $3, $1":"($2+1)}}' "{file2_path}" | sort -k4,4n | uniq > {output_folder}/depth_modified.tsv # Step 2: Sort the first file numerically by the join column (the column incuding both contig and position information) sort -k3,3n "{file1_path}" | uniq > {output_folder}/final_edit_info_no_coverage_sorted.tsv @@ -683,42 +683,6 @@ def generate_and_run_bash_merge(output_folder, file1_path, file2_path, output_fi print(f"Merged file saved to {output_file_path}") -def get_samtools_depth_commands(paired_end, bam_filepaths, output_folder, output_suffix=''): - """ - Generate samtools depth commands for each BAM file using its corresponding split BED file. - """ - all_depth_commands = [] - - # Format output suffix - if len(output_suffix) > 0: - output_suffix = '_{}'.format(output_suffix) - - # Paired-end option for samtools depth - paired_end_option = '-s ' if paired_end else '' - - print("\n\tGetting suffixes from {} bam files...".format(len(bam_filepaths))) - for i, bam_filepath in enumerate(bam_filepaths): - # Extract suffix from BAM filename - bam_filename = os.path.basename(bam_filepath) - bam_prefix, bam_suffix = bam_filename.split("_")[0], bam_filename.split("_")[1].split(".")[0] - - # Path to the corresponding split BED file - split_bed_file = f"{output_folder}/combined{output_suffix}_split_by_suffix/combined{output_suffix}_{bam_prefix}_{bam_suffix}.bed" - - if os.path.exists(split_bed_file): - # Construct the samtools depth command - depth_command = ( - f"echo ' running samtools depth on {bam_filepath}...';" - f"samtools depth {paired_end_option}-a -b {split_bed_file} {bam_filepath} >> {output_folder}/coverage/depths{output_suffix}_{bam_prefix}_{bam_suffix}.txt" - ) - - # Add to the list of commands - all_depth_commands.append(depth_command) - - print("\n\tOnly {} of the bam files had edits".format(len(all_depth_commands))) - return all_depth_commands - - def generate_empty_matrix_file(matrix_output_filepath): pass @@ -768,34 +732,6 @@ def run_command(command): subprocess.run(command, shell=True, check=True) -def run_depth_command(command, pivot=False, output_matrix_folder=None): - """ - Run a shell command and optionally process the output file with the pivot function. - """ - try: - # Extract output file from the command - depths_file = command.split(">>")[-1].strip() - - # Check if the depths file exists and is non-empty - if not os.path.exists(depths_file) or os.path.getsize(depths_file) == 0: - # File doesn't exist or is empty, run the command - run_command(command) - - else: - print(f"\t\tSkipping coverage generation for existing non-empty file: {depths_file}") - - if pivot: - matrix_output_file = os.path.join(output_matrix_folder, f"{os.path.basename(depths_file).split('.')[0]}_matrix.tsv") - - if not os.path.exists(matrix_output_file) or os.path.getsize(matrix_output_file) == 0: - # Pivot the depths output file into a matrix - pivot_depths_output(depths_file, matrix_output_file) - - except subprocess.CalledProcessError as e: - print(f"Command failed: {command}\nError: {e}", file=sys.stderr) - raise e - - def merge_files_by_chromosome(args): """ Helper function to merge files for a single chromosome. 
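 
     The expected args tuple is (chromosome, [matrix_file, ...], output_folder);
     the first matrix keeps its row labels, while each remaining matrix
     contributes only its coverage columns via cut -f2-.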
@@ -864,24 +800,92 @@ def merge_depth_files(output_folder, output_suffix=''): print(f"Depths merged into {merged_depth_file}.") -def run_samtools_depth(commands, processes, pivot, output_matrix_folder): + +def calculate_coverage(bam_filepath, bed_filepath, output_filepath, output_matrix_folder): """ - Runs samtools depth commands in parallel using multiprocessing. + Mimic samtools depth -a by including positions with zero coverage and applying equivalent filters. """ - try: - # Prepare arguments for starmap - run_command_args = [ - (command, pivot, output_matrix_folder) for command in commands - ] + depths_file = output_filepath + + print(f"Running samtools view on {bed_filepath} for {bam_filepath}, outputting to {output_filepath}") + + regions = [] + with open(bed_filepath, "r") as bed: + for line in bed: + fields = line.strip().split() + chrom, start, end = fields[0], int(fields[1]), int(fields[2]) + regions.append((chrom, start, end)) + + print(f"\t{len(regions)} regions") + + with pysam.AlignmentFile(bam_filepath, "rb") as bam, open(depths_file, "w") as out: + for chrom, start, end in regions: + # Get coverage, applying base quality filter + coverage = bam.count_coverage( + chrom, start, end, quality_threshold=0 # Mimics -Q 13 in samtools depth + ) + + # Calculate total coverage for each position and include zero-coverage positions + for pos in range(start, end): + total_coverage = sum(base[pos - start] for base in coverage) + out.write(f"{chrom}\t{pos}\t{total_coverage}\n") + + + if output_matrix_folder: + matrix_output_file = os.path.join(output_matrix_folder, f"{os.path.basename(depths_file).split('.')[0]}_matrix.tsv") + + if not os.path.exists(matrix_output_file) or os.path.getsize(matrix_output_file) == 0: + # Pivot the depths output file into a matrix + pivot_depths_output(depths_file, matrix_output_file) + + +def run_pysam_count_coverage(args_list, processes): + """ + Runs pysam.count_coverage in parallel using multiprocessing. + """ + try: with Pool(processes=processes) as pool: - pool.starmap(run_depth_command, run_command_args) + pool.starmap(calculate_coverage, args_list) except Exception as e: - print(f"Aborting: One or more commands failed with an error: {e}", file=sys.stderr) + print(f"Aborting: One or more coverage calculations failed with an error: {e}", file=sys.stderr) sys.exit(1) # Exit with an error code +def prepare_pysam_coverage_args(bam_filepaths, output_folder, output_suffix='', pivot=False): + """ + Prepare arguments for pysam coverage calculation for each BAM file and its BED file. 
+ """ + # Create a folder for matrix outputs if pivoting is enabled + if pivot: + output_matrix_folder = f"{output_folder}/matrix_outputs" + os.makedirs(output_matrix_folder, exist_ok=True) + else: + output_matrix_folder = None + + args_list = [] + + # Format output suffix + if len(output_suffix) > 0: + output_suffix = f"_{output_suffix}" + + for bam_filepath in bam_filepaths: + # Extract suffix from BAM filename + bam_filename = os.path.basename(bam_filepath) + bam_prefix, bam_suffix = bam_filename.split("_")[0], bam_filename.split("_")[1].split(".")[0] + + # Path to the corresponding split BED file + bed_filepath = f"{output_folder}/combined{output_suffix}_split_by_suffix/combined{output_suffix}_{bam_prefix}_{bam_suffix}.bed" + output_filepath = f"{output_folder}/coverage/depths{output_suffix}_{bam_prefix}_{bam_suffix}.txt" + + if os.path.exists(bed_filepath): + args_list.append((bam_filepath, bed_filepath, output_filepath, output_matrix_folder)) + + return args_list + + + def make_depth_command_script(paired_end, bam_filepaths, output_folder, all_depth_commands=[], output_suffix='', run=False, pivot=False, processes=4): """ @@ -889,32 +893,30 @@ def make_depth_command_script(paired_end, bam_filepaths, output_folder, all_dept """ samtools_depth_start_time = time.perf_counter() - # Generate samtools depth commands - samtools_depth_commands = get_samtools_depth_commands(paired_end, bam_filepaths, - output_folder, - output_suffix=output_suffix) - all_depth_commands += samtools_depth_commands + # Prepare pysam coverage arguments + pysam_coverage_args = prepare_pysam_coverage_args(bam_filepaths, output_folder, output_suffix=output_suffix, pivot=pivot) + + print(f"\tPrepared coverage arguments for {len(pysam_coverage_args)} BAM files.") - print(f"\tsamtools_depth_commands: {len(samtools_depth_commands)}") + #all_depth_commands += samtools_depth_commands + #print(f"\tsamtools_depth_commands: {len(samtools_depth_commands)}") with open('{}/depth_commands_{}.txt'.format(output_folder, output_suffix), 'w') as f: for d in all_depth_commands: f.write('{}\n\n'.format(d)) - # Create a folder for matrix outputs if pivoting is enabled - output_matrix_folder = f"{output_folder}/matrix_outputs" - final_combined_matrices_folder = f"{output_folder}/final_matrix_outputs" - if pivot: - os.makedirs(output_matrix_folder, exist_ok=True) - os.makedirs(final_combined_matrices_folder, exist_ok=True) - if run: - print("Calculating depths using multiprocessing...") - run_samtools_depth(samtools_depth_commands, processes, pivot, output_matrix_folder) + print("Calculating depths using multiprocessing with pysam...") + run_pysam_count_coverage(pysam_coverage_args, processes) merge_depth_files(output_folder, output_suffix) if pivot: + output_matrix_folder = f"{output_folder}/matrix_outputs" + final_combined_matrices_folder = f"{output_folder}/final_matrix_outputs" + os.makedirs(final_combined_matrices_folder, exist_ok=True) + + print("\nRunning pivot...\n") prepare_matrix_files_multiprocess( output_matrix_folder, final_combined_matrices_folder, From e03bf79db17b578cd72fc13ef7ce2e6ca50890aa Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Thu, 28 Nov 2024 15:17:47 -0800 Subject: [PATCH 17/26] adjust for bulk --- marine.py | 16 +++++++++++----- src/utils.py | 28 ++++++++++++++++++++-------- 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/marine.py b/marine.py index f9225e2..4aac9f5 100755 --- a/marine.py +++ b/marine.py @@ -181,7 +181,7 @@ def split_bed_file(input_bed_file, output_folder, bam_filepaths, 
output_suffix=' handle.close() -def generate_depths(output_folder, bam_filepaths, paired_end=False): +def generate_depths(output_folder, bam_filepaths, paired_end=False, barcode_tag=None): coverage_start_time = time.perf_counter() @@ -207,7 +207,7 @@ def generate_depths(output_folder, bam_filepaths, paired_end=False): ) make_depth_command_script(paired_end, bam_filepaths, output_folder, - all_depth_commands=all_depth_commands, output_suffix='source_cells', run=True, processes=cores) + all_depth_commands=all_depth_commands, output_suffix='source_cells', run=True, processes=cores, barcode_tag=barcode_tag) print("Concatenating edit info files...") @@ -429,7 +429,7 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand reconfigured_bam_filepaths = glob('{}/split_bams/*/*.bam'.format(output_folder)) print("Running samtools depth on {} subset bam paths...".format(len(reconfigured_bam_filepaths))) - total_time, total_seconds_for_contig_df = generate_depths(output_folder, reconfigured_bam_filepaths, paired_end=paired_end) + total_time, total_seconds_for_contig_df = generate_depths(output_folder, reconfigured_bam_filepaths, paired_end=paired_end, barcode_tag=barcode_tag) total_seconds_for_contig_df.to_csv("{}/coverage_calculation_timing.tsv".format(logging_folder), sep='\t') @@ -556,7 +556,8 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand output_suffix=output_suffix, run=True, pivot=True, - processes=cores + processes=cores, + barcode_tag=barcode_tag ) if not keep_intermediate_files: @@ -638,7 +639,7 @@ def check_samtools(): paired_end = args.paired_end all_cells_coverage = args.all_cells_coverage skip_coverage = args.skip_coverage - + barcode_tag = args.barcode_tag min_base_quality = args.min_base_quality min_read_quality = args.min_read_quality @@ -650,6 +651,11 @@ def check_samtools(): num_per_sublist = args.num_per_sublist + # all_cells_coverage only applies for single cell case + if not barcode_tag: + if all_cells_coverage == True: + all_cells_coverage = False + """ # Convert all_cells_coverage into list of conversions for which to output all cell info if not all_cells_coverage is None: diff --git a/src/utils.py b/src/utils.py index 5e91f70..7994c5b 100644 --- a/src/utils.py +++ b/src/utils.py @@ -853,7 +853,7 @@ def run_pysam_count_coverage(args_list, processes): sys.exit(1) # Exit with an error code -def prepare_pysam_coverage_args(bam_filepaths, output_folder, output_suffix='', pivot=False): +def prepare_pysam_coverage_args(bam_filepaths, output_folder, output_suffix='', pivot=False, barcode_tag=None): """ Prepare arguments for pysam coverage calculation for each BAM file and its BED file. 
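 
     In single-cell mode (barcode_tag set) each BAM is matched to its own split
     BED and depths file; in bulk mode every BAM shares the single combined BED.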
""" @@ -870,31 +870,43 @@ def prepare_pysam_coverage_args(bam_filepaths, output_folder, output_suffix='', if len(output_suffix) > 0: output_suffix = f"_{output_suffix}" + print("prepare_pysam_coverage_args, barcode_tag is {}".format(barcode_tag)) + for bam_filepath in bam_filepaths: # Extract suffix from BAM filename bam_filename = os.path.basename(bam_filepath) bam_prefix, bam_suffix = bam_filename.split("_")[0], bam_filename.split("_")[1].split(".")[0] - # Path to the corresponding split BED file - bed_filepath = f"{output_folder}/combined{output_suffix}_split_by_suffix/combined{output_suffix}_{bam_prefix}_{bam_suffix}.bed" - output_filepath = f"{output_folder}/coverage/depths{output_suffix}_{bam_prefix}_{bam_suffix}.txt" - + if barcode_tag: + # Path to the corresponding split BED file + bed_filepath = f"{output_folder}/combined{output_suffix}_split_by_suffix/combined{output_suffix}_{bam_prefix}_{bam_suffix}.bed" + output_filepath = f"{output_folder}/coverage/depths{output_suffix}_{bam_prefix}_{bam_suffix}.txt" + + else: + # bulk case + bed_filepath = f"{output_folder}/combined{output_suffix}.bed" + output_filepath = f"{output_folder}/coverage/depths{output_suffix}.txt" + + if os.path.exists(bed_filepath): args_list.append((bam_filepath, bed_filepath, output_filepath, output_matrix_folder)) - + else: + print(f"Did not find {bed_filepath}") + + return args_list def make_depth_command_script(paired_end, bam_filepaths, output_folder, all_depth_commands=[], - output_suffix='', run=False, pivot=False, processes=4): + output_suffix='', run=False, pivot=False, processes=4, barcode_tag=None): """ Main function to generate and execute samtools depth commands, and optionally pivot and merge matrices. """ samtools_depth_start_time = time.perf_counter() # Prepare pysam coverage arguments - pysam_coverage_args = prepare_pysam_coverage_args(bam_filepaths, output_folder, output_suffix=output_suffix, pivot=pivot) + pysam_coverage_args = prepare_pysam_coverage_args(bam_filepaths, output_folder, output_suffix=output_suffix, pivot=pivot, barcode_tag=barcode_tag) print(f"\tPrepared coverage arguments for {len(pysam_coverage_args)} BAM files.") From 39f04bcde20902872a479a3a414aed74ab6a49f1 Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Thu, 28 Nov 2024 23:42:32 -0800 Subject: [PATCH 18/26] trying to fix bulk --- marine.py | 5 ++++- src/utils.py | 2 +- tests/integration_tests.ipynb | 19 ++++++++++++++----- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/marine.py b/marine.py index 4aac9f5..deb86e5 100755 --- a/marine.py +++ b/marine.py @@ -153,6 +153,8 @@ def split_bed_file(input_bed_file, output_folder, bam_filepaths, output_suffix=' if not os.path.exists(output_folder): os.makedirs(output_folder) + single_cell_approach = len(bam_filepaths) > 0 + suffix_pairs = [ (os.path.basename(bam).split("_")[0], os.path.basename(bam).split("_")[1].split(".")[0]) for bam in bam_filepaths @@ -198,7 +200,8 @@ def generate_depths(output_folder, bam_filepaths, paired_end=False, barcode_tag= all_depth_commands.append(combine_edit_sites_command) output_suffix = 'source_cells' - if len(bam_filepaths) > 1: + + if barcode_tag: split_bed_file( f"{output_folder}/combined_{output_suffix}.bed", f"{output_folder}/combined_{output_suffix}_split_by_suffix", diff --git a/src/utils.py b/src/utils.py index 7994c5b..d451e67 100644 --- a/src/utils.py +++ b/src/utils.py @@ -885,7 +885,7 @@ def prepare_pysam_coverage_args(bam_filepaths, output_folder, output_suffix='', else: # bulk case bed_filepath = 
f"{output_folder}/combined{output_suffix}.bed" - output_filepath = f"{output_folder}/coverage/depths{output_suffix}.txt" + output_filepath = f"{output_folder}/coverage/depths{output_suffix}_{bam_filename.split('_')[1]}.txt" if os.path.exists(bed_filepath): diff --git a/tests/integration_tests.ipynb b/tests/integration_tests.ipynb index 25464a8..53d1581 100644 --- a/tests/integration_tests.ipynb +++ b/tests/integration_tests.ipynb @@ -2889,7 +2889,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 12, "id": "f5c3ed3e-13dd-4399-924e-3d1ac17ce387", "metadata": {}, "outputs": [ @@ -2932,12 +2932,18 @@ "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", "Checking results for pair_test\n", "\tExpecting: {'contig': '18', 'position': 49491556, 'count': 2, 'coverage': 2, 'num_rows': 1, 'strand_conversion': 'C>T', 'strand': '-', 'feature_name': 'RPL17-C18orf32,RPL17', 'feature_strand': '-,-'}\n", + "Exception: coverage was site_id\n", + "no_barcode_18_49491556_G_A_- 3\n", + "Name: coverage, dtype: int64\n", "\n", - "\t >>> pair_test passed! <<<\n", + "\t ~~~ pair_test FAILED! ~~~\n", "\n", "\tExpecting: {'contig': '18', 'position': 49567494, 'count': 2, 'coverage': 2, 'num_rows': 1, 'strand_conversion': 'C>T', 'strand': '+', 'feature_name': 'LIPG', 'feature_strand': '+'}\n", + "Exception: coverage was site_id\n", + "no_barcode_18_49567494_C_T_+ 3\n", + "Name: coverage, dtype: int64\n", "\n", - "\t >>> pair_test passed! <<<\n", + "\t ~~~ pair_test FAILED! ~~~\n", "\n", "\n", "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", @@ -2971,8 +2977,11 @@ "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", "Checking results for same_pos_dif_reads_test\n", "\tExpecting: {'contig': 'chr17', 'position': 83199872, 'count': 9, 'coverage': 9, 'conversion': 'C>G', 'strand_conversion': 'C>G', 'strand': '+', 'feature_name': 'AC139099.2'}\n", + "Exception: coverage was site_id\n", + "no_barcode_chr17_83199872_C_G_+ 14\n", + "Name: coverage, dtype: int64\n", "\n", - "\t >>> same_pos_dif_reads_test passed! <<<\n", + "\t ~~~ same_pos_dif_reads_test FAILED! ~~~\n", "\n", "\n", "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", @@ -3015,7 +3024,7 @@ "\n", "\t >>> single-cell and bulk on same dataset comparison passed! 
<<<\n", "\n", - "There were 0 failures\n" + "There were 3 failures\n" ] } ], From 97e242d00dbaf5024f2e97c82e188a5583a8e193 Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Fri, 29 Nov 2024 09:00:18 -0800 Subject: [PATCH 19/26] adapt bulk mode --- marine.py | 44 ++++++++++++------- src/utils.py | 2 +- .../scripts/only_5_cells_bulk_mode_test.sh | 2 +- .../scripts/only_5_cells_test.sh | 2 +- 4 files changed, 30 insertions(+), 20 deletions(-) diff --git a/marine.py b/marine.py index deb86e5..9804f24 100755 --- a/marine.py +++ b/marine.py @@ -33,7 +33,7 @@ from utils import get_intervals, index_bam, write_rows_to_info_file, write_header_to_edit_info, \ write_read_to_bam_file, remove_file_if_exists, make_folder, concat_and_write_bams_wrapper, \ pretty_print, read_barcode_whitelist_file, get_contigs_that_need_bams_written, \ -make_depth_command_script, generate_and_run_bash_merge, get_sailor_sites, \ +make_depth_command_script_single_cell, generate_and_run_bash_merge, get_sailor_sites, \ concatenate_files, run_command, get_edits_with_coverage_df, zero_edit_found, delete_intermediate_files from core import run_edit_identifier, run_bam_reconfiguration, \ @@ -197,11 +197,16 @@ def generate_depths(output_folder, bam_filepaths, paired_end=False, barcode_tag= ).format(output_folder, output_folder) run_command(combine_edit_sites_command) + all_depth_commands.append(combine_edit_sites_command) output_suffix = 'source_cells' if barcode_tag: + coverage_subfolder = '{}/coverage'.format(output_folder) + make_folder(coverage_subfolder) + + # Single cell mode split_bed_file( f"{output_folder}/combined_{output_suffix}.bed", f"{output_folder}/combined_{output_suffix}_split_by_suffix", @@ -209,9 +214,14 @@ def generate_depths(output_folder, bam_filepaths, paired_end=False, barcode_tag= output_suffix=output_suffix ) - make_depth_command_script(paired_end, bam_filepaths, output_folder, - all_depth_commands=all_depth_commands, output_suffix='source_cells', run=True, processes=cores, barcode_tag=barcode_tag) - + make_depth_command_script_single_cell(paired_end, bam_filepaths, output_folder, + all_depth_commands=all_depth_commands, output_suffix='source_cells', run=True, processes=cores, barcode_tag=barcode_tag) + + else: + # Bulk mode, we will not split the bed and simply use samtools depth on the combined.bed + samtools_depth_command = f"samtools depth -b {output_folder}/combined_source_cells.bed {bam_filepath} > {output_folder}/depths_source_cells.txt" + run_command(samtools_depth_command) + print("Concatenating edit info files...") concatenate_files(output_folder, "edit_info/*edit_info.tsv", @@ -223,10 +233,12 @@ def generate_depths(output_folder, bam_filepaths, paired_end=False, barcode_tag= header_columns = ['barcode', 'contig', 'position', 'ref', 'alt', 'read_id', 'strand', 'coverage'] + generate_and_run_bash_merge(output_folder, '{}/final_edit_info_no_coverage.tsv'.format(output_folder), '{}/depths_source_cells.txt'.format(output_folder), - '{}/final_edit_info.tsv'.format(output_folder), header_columns) + '{}/final_edit_info.tsv'.format(output_folder), + header_columns) coverage_total_time = time.perf_counter() - coverage_start_time @@ -425,9 +437,6 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ pretty_print("Calculating coverage at edited sites, minimum read quality is {}...".format(min_read_quality), style='~') - coverage_subfolder = 
'{}/coverage'.format(output_folder) - make_folder(coverage_subfolder) - # We want to run the samtools depth command for each of the reconfigured bam files reconfigured_bam_filepaths = glob('{}/split_bams/*/*.bam'.format(output_folder)) @@ -553,15 +562,16 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand output_suffix=output_suffix ) - make_depth_command_script(paired_end, - reconfigured_bam_filepaths, - output_folder, - output_suffix=output_suffix, - run=True, - pivot=True, - processes=cores, - barcode_tag=barcode_tag - ) + make_depth_command_script_single_cell( + paired_end, + reconfigured_bam_filepaths, + output_folder, + output_suffix=output_suffix, + run=True, + pivot=True, + processes=cores, + barcode_tag=barcode_tag + ) if not keep_intermediate_files: pretty_print("Deleting intermediate files...", style="-") diff --git a/src/utils.py b/src/utils.py index d451e67..dadda75 100644 --- a/src/utils.py +++ b/src/utils.py @@ -898,7 +898,7 @@ def prepare_pysam_coverage_args(bam_filepaths, output_folder, output_suffix='', -def make_depth_command_script(paired_end, bam_filepaths, output_folder, all_depth_commands=[], +def make_depth_command_script_single_cell(paired_end, bam_filepaths, output_folder, all_depth_commands=[], output_suffix='', run=False, pivot=False, processes=4, barcode_tag=None): """ Main function to generate and execute samtools depth commands, and optionally pivot and merge matrices. diff --git a/tests/singlecell_tests/scripts/only_5_cells_bulk_mode_test.sh b/tests/singlecell_tests/scripts/only_5_cells_bulk_mode_test.sh index 2a60470..88796ec 100644 --- a/tests/singlecell_tests/scripts/only_5_cells_bulk_mode_test.sh +++ b/tests/singlecell_tests/scripts/only_5_cells_bulk_mode_test.sh @@ -14,4 +14,4 @@ $MARINE/tests/singlecell_tests/only_5_cells_bulk_mode_test \ --cores \ 4 \ --strandedness 2 --verbose \ ---contigs "9" --interval_length 32000000 \ No newline at end of file +--contigs "9" --interval_length 32000000 --keep_intermediate_files diff --git a/tests/singlecell_tests/scripts/only_5_cells_test.sh b/tests/singlecell_tests/scripts/only_5_cells_test.sh index cfd6ccc..5413f0c 100644 --- a/tests/singlecell_tests/scripts/only_5_cells_test.sh +++ b/tests/singlecell_tests/scripts/only_5_cells_test.sh @@ -15,4 +15,4 @@ $MARINE/tests/singlecell_tests/only_5_cells_test \ 4 \ --barcode_tag "CB" \ --strandedness 2 \ ---contigs "9" --interval_length 32000000 \ No newline at end of file +--keep_intermediate_files --contigs "9" --interval_length 32000000 From 1616579b4caf8f41b4b125826f437e2710587f4d Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Fri, 29 Nov 2024 09:04:22 -0800 Subject: [PATCH 20/26] paired end --- marine.py | 7 ++++++- tests/integration_tests.ipynb | 7 ++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/marine.py b/marine.py index 9804f24..4221238 100755 --- a/marine.py +++ b/marine.py @@ -218,8 +218,13 @@ def generate_depths(output_folder, bam_filepaths, paired_end=False, barcode_tag= all_depth_commands=all_depth_commands, output_suffix='source_cells', run=True, processes=cores, barcode_tag=barcode_tag) else: + if paired_end: + paired_end_flag = '-s ' + else: + paired_end_flag = 's' + # Bulk mode, we will not split the bed and simply use samtools depth on the combined.bed - samtools_depth_command = f"samtools depth -b {output_folder}/combined_source_cells.bed {bam_filepath} > {output_folder}/depths_source_cells.txt" + samtools_depth_command = f"samtools depth {paired_end_flag}-a -b 
{output_folder}/combined_source_cells.bed {bam_filepath} > {output_folder}/depths_source_cells.txt" run_command(samtools_depth_command) diff --git a/tests/integration_tests.ipynb b/tests/integration_tests.ipynb index 53d1581..f8fe151 100644 --- a/tests/integration_tests.ipynb +++ b/tests/integration_tests.ipynb @@ -2889,7 +2889,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 1, "id": "f5c3ed3e-13dd-4399-924e-3d1ac17ce387", "metadata": {}, "outputs": [ @@ -3021,10 +3021,11 @@ "\t >>> long_read_sc_test passed! <<<\n", "\n", "Checking that analyzing a single-cell dataset in 'bulk' mode (i.e. not specificying the 'CB' barcode) yields the exact same positions and base changes, but with counts and coverages aggregated rather than at a single-cell resolution\n", + "Exception: \n", "\n", - "\t >>> single-cell and bulk on same dataset comparison passed! <<<\n", + "\t ~~~ single cell vs bulk modes on sc dataset equivalency test FAILED! ~~~\n", "\n", - "There were 3 failures\n" + "There were 4 failures\n" ] } ], From 2aedc8fb32356a11569bcad6483ea31fd0b9cc73 Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Fri, 29 Nov 2024 13:25:04 -0800 Subject: [PATCH 21/26] adjustment --- marine.py | 4 ++-- src/utils.py | 9 +++++++-- tests/integration_tests.ipynb | 20 ++++++-------------- 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/marine.py b/marine.py index 4221238..0211128 100755 --- a/marine.py +++ b/marine.py @@ -221,7 +221,7 @@ def generate_depths(output_folder, bam_filepaths, paired_end=False, barcode_tag= if paired_end: paired_end_flag = '-s ' else: - paired_end_flag = 's' + paired_end_flag = '' # Bulk mode, we will not split the bed and simply use samtools depth on the combined.bed samtools_depth_command = f"samtools depth {paired_end_flag}-a -b {output_folder}/combined_source_cells.bed {bam_filepath} > {output_folder}/depths_source_cells.txt" @@ -243,7 +243,7 @@ def generate_depths(output_folder, bam_filepaths, paired_end=False, barcode_tag= '{}/final_edit_info_no_coverage.tsv'.format(output_folder), '{}/depths_source_cells.txt'.format(output_folder), '{}/final_edit_info.tsv'.format(output_folder), - header_columns) + header_columns, paired_end=paired_end) coverage_total_time = time.perf_counter() - coverage_start_time diff --git a/src/utils.py b/src/utils.py index dadda75..1534a13 100644 --- a/src/utils.py +++ b/src/utils.py @@ -652,15 +652,20 @@ def concat_and_write_bams_wrapper(params): concat_and_write_bams(contig, df_dict, header_string, split_bams_folder, barcode_tag=barcode_tag, number_of_expected_bams=number_of_expected_bams, verbose=verbose) -def generate_and_run_bash_merge(output_folder, file1_path, file2_path, output_file_path, header_columns): +def generate_and_run_bash_merge(output_folder, file1_path, file2_path, output_file_path, header_columns, paired_end=False): # Convert header list into a tab-separated string header = "\t".join(header_columns) + position_adjustment = '1' + if paired_end: + position_adjustment = '0' + print(f"position adjustment is {position_adjustment} (paired_end is {paired_end})") + # Generate the Bash command for processing bash_command = f"""#!/bin/bash # Step 1: Adjust the depths file by adding a new column that incorporates both contig and position # for join purposes, and sort by this column. Output in tab-separated format. 
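     # For example, with a position adjustment of 1, a hypothetical depths row
     # "18 49491555 3" becomes "18 49491556 3 18:49491556", and the fourth
     # (contig:position) column then serves as the join key in step 3.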
- awk -v OFS='\\t' '{{print $1, $2+1, $3, $1":"($2+1)}}' "{file2_path}" | sort -k4,4n | uniq > {output_folder}/depth_modified.tsv + awk -v OFS='\\t' '{{print $1, $2+{position_adjustment}, $3, $1":"($2+{position_adjustment})}}' "{file2_path}" | sort -k4,4n | uniq > {output_folder}/depth_modified.tsv # Step 2: Sort the first file numerically by the join column (the column incuding both contig and position information) sort -k3,3n "{file1_path}" | uniq > {output_folder}/final_edit_info_no_coverage_sorted.tsv diff --git a/tests/integration_tests.ipynb b/tests/integration_tests.ipynb index f8fe151..cd07c64 100644 --- a/tests/integration_tests.ipynb +++ b/tests/integration_tests.ipynb @@ -2889,7 +2889,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "id": "f5c3ed3e-13dd-4399-924e-3d1ac17ce387", "metadata": {}, "outputs": [ @@ -2932,18 +2932,12 @@ "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", "Checking results for pair_test\n", "\tExpecting: {'contig': '18', 'position': 49491556, 'count': 2, 'coverage': 2, 'num_rows': 1, 'strand_conversion': 'C>T', 'strand': '-', 'feature_name': 'RPL17-C18orf32,RPL17', 'feature_strand': '-,-'}\n", - "Exception: coverage was site_id\n", - "no_barcode_18_49491556_G_A_- 3\n", - "Name: coverage, dtype: int64\n", "\n", - "\t ~~~ pair_test FAILED! ~~~\n", + "\t >>> pair_test passed! <<<\n", "\n", "\tExpecting: {'contig': '18', 'position': 49567494, 'count': 2, 'coverage': 2, 'num_rows': 1, 'strand_conversion': 'C>T', 'strand': '+', 'feature_name': 'LIPG', 'feature_strand': '+'}\n", - "Exception: coverage was site_id\n", - "no_barcode_18_49567494_C_T_+ 3\n", - "Name: coverage, dtype: int64\n", "\n", - "\t ~~~ pair_test FAILED! ~~~\n", + "\t >>> pair_test passed! <<<\n", "\n", "\n", "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", @@ -2977,11 +2971,8 @@ "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", "Checking results for same_pos_dif_reads_test\n", "\tExpecting: {'contig': 'chr17', 'position': 83199872, 'count': 9, 'coverage': 9, 'conversion': 'C>G', 'strand_conversion': 'C>G', 'strand': '+', 'feature_name': 'AC139099.2'}\n", - "Exception: coverage was site_id\n", - "no_barcode_chr17_83199872_C_G_+ 14\n", - "Name: coverage, dtype: int64\n", "\n", - "\t ~~~ same_pos_dif_reads_test FAILED! ~~~\n", + "\t >>> same_pos_dif_reads_test passed! <<<\n", "\n", "\n", "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", @@ -3025,7 +3016,7 @@ "\n", "\t ~~~ single cell vs bulk modes on sc dataset equivalency test FAILED! ~~~\n", "\n", - "There were 4 failures\n" + "There were 1 failures\n" ] } ], @@ -3351,6 +3342,7 @@ "bulk_folder = 'only_5_cells_bulk_mode_test'\n", "sc_5_cells = pd.read_csv('singlecell_tests/{}/final_filtered_site_info.tsv'.format(sc_folder), sep='\\t').sort_values(['position', 'strand_conversion'])\n", "bulk_5_cells = pd.read_csv('singlecell_tests/{}/final_filtered_site_info.tsv'.format(bulk_folder), sep='\\t').sort_values(['position', 'strand_conversion'])\n", + "\n", "print(\"Checking that analyzing a single-cell dataset in 'bulk' mode (i.e. 
not specificying the 'CB' barcode) yields the exact same positions and base changes, but with counts and coverages aggregated rather than at a single-cell resolution\")\n", "grouped_sc = pd.DataFrame(sc_5_cells.groupby(['contig', 'position', 'strand_conversion']).agg({'count': sum, 'strand_conversion': 'unique'}))\n", "grouped_sc.index.names = ['contig', 'position', 'c']\n", From ba8b0c21bb9c517cb0fecb54d323a684df40900b Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Fri, 29 Nov 2024 13:51:09 -0800 Subject: [PATCH 22/26] check for sc --- marine.py | 2 +- src/utils.py | 10 +++++----- tests/integration_tests.ipynb | 4 +++- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/marine.py b/marine.py index 0211128..ebdce53 100755 --- a/marine.py +++ b/marine.py @@ -243,7 +243,7 @@ def generate_depths(output_folder, bam_filepaths, paired_end=False, barcode_tag= '{}/final_edit_info_no_coverage.tsv'.format(output_folder), '{}/depths_source_cells.txt'.format(output_folder), '{}/final_edit_info.tsv'.format(output_folder), - header_columns, paired_end=paired_end) + header_columns, barcode_tag=barcode_tag) coverage_total_time = time.perf_counter() - coverage_start_time diff --git a/src/utils.py b/src/utils.py index 1534a13..7347c14 100644 --- a/src/utils.py +++ b/src/utils.py @@ -652,14 +652,14 @@ def concat_and_write_bams_wrapper(params): concat_and_write_bams(contig, df_dict, header_string, split_bams_folder, barcode_tag=barcode_tag, number_of_expected_bams=number_of_expected_bams, verbose=verbose) -def generate_and_run_bash_merge(output_folder, file1_path, file2_path, output_file_path, header_columns, paired_end=False): +def generate_and_run_bash_merge(output_folder, file1_path, file2_path, output_file_path, header_columns, barcode_tag=None): # Convert header list into a tab-separated string header = "\t".join(header_columns) - position_adjustment = '1' - if paired_end: - position_adjustment = '0' - print(f"position adjustment is {position_adjustment} (paired_end is {paired_end})") + position_adjustment = '1' # for samtools view + if not barcode_tag: + position_adjustment = '0' # for samtools depth + print(f"position adjustment is {position_adjustment} (barcode_tag is {barcode_tag})") # Generate the Bash command for processing bash_command = f"""#!/bin/bash diff --git a/tests/integration_tests.ipynb b/tests/integration_tests.ipynb index cd07c64..162ff9f 100644 --- a/tests/integration_tests.ipynb +++ b/tests/integration_tests.ipynb @@ -2889,7 +2889,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "id": "f5c3ed3e-13dd-4399-924e-3d1ac17ce387", "metadata": {}, "outputs": [ @@ -3012,6 +3012,7 @@ "\t >>> long_read_sc_test passed! <<<\n", "\n", "Checking that analyzing a single-cell dataset in 'bulk' mode (i.e. not specificying the 'CB' barcode) yields the exact same positions and base changes, but with counts and coverages aggregated rather than at a single-cell resolution\n", + "grouped_sc_rows: 62, bulk_rows: 26\n", "Exception: \n", "\n", "\t ~~~ single cell vs bulk modes on sc dataset equivalency test FAILED! 
~~~\n", @@ -3359,6 +3360,7 @@ " bulk_rows.append((r['contig'], r['position'], r['strand_conversion']))\n", "\n", "try:\n", + " print(\"grouped_sc_rows: {}, bulk_rows: {}\".format(len(grouped_sc_rows), len(bulk_rows)))\n", " assert(grouped_sc_rows == bulk_rows)\n", " for bulk_item, grouped_sc_item in zip(bulk_rows, grouped_sc_rows):\n", " assert(bulk_item == grouped_sc_item)\n", From 00e193b3136b97d3b5b1da07caa0f9c708bae970 Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Fri, 29 Nov 2024 14:39:33 -0800 Subject: [PATCH 23/26] all cells only for specific conversion --- marine.py | 42 +++++++++++++++++++++++++---------- tests/integration_tests.ipynb | 9 ++++---- 2 files changed, 34 insertions(+), 17 deletions(-) diff --git a/marine.py b/marine.py index ebdce53..3e85b43 100755 --- a/marine.py +++ b/marine.py @@ -289,7 +289,7 @@ def generate_bedgraphs(final_site_level_information_df, conversion_search, outpu sites_for_conversion_bedgraph_cols.to_csv('{}/{}_{}.bedgraph'.format(bedgraph_folder, output_folder.split('/')[-1], conversion), sep='\t', index=False, header=False) -def generate_combined_sites_bed_for_all_cells(output_folder): +def generate_combined_sites_bed_for_all_cells(output_folder, strand_conversion="A>G"): # go from: # # 9_GATCCCTCAGTAACGG-1 3000447 3000448 @@ -305,29 +305,47 @@ def generate_combined_sites_bed_for_all_cells(output_folder): # 2_CATCCCTCAGTAACGA-1 3000451 3000452 # Input and output files - input_file = f"{output_folder}/combined_source_cells.bed" + input_file = f"{output_folder}/final_filtered_site_info.tsv" output_file = f"{output_folder}/combined_all_cells.bed" # Load the input data - df = pd.read_csv(input_file, sep="\t", header=None, names=["Contig", "Start", "End"]) + df = pd.read_csv(input_file, sep="\t") + print(f"Found {len(df)} total edits...") - df['Chromosome'] = [str(i.split('_')[0]) for i in df['Contig']] - unique_chromosomes = df['Chromosome'].unique() - print("Unique chromosomes found: {}".format(unique_chromosomes)) + df = df[df['strand_conversion'] == strand_conversion] + print(f"Running all positions for {len(df)} {strand_conversion} edits") + + df['chromosome'] = df['contig'].astype(str) + df['contig'] = df['contig'].astype(str) + '_' + df['barcode'].astype(str) + unique_chromosomes = df['chromosome'].unique() + print("Unique chromosomes found: {}".format(unique_chromosomes)) + + total_positions = 0 for chromosome in unique_chromosomes: - df_for_chromosome = df[df['Chromosome'] == str(chromosome)] + # Permute only within chromosome + + df_for_chromosome = df[df['chromosome'] == str(chromosome)] # Create unique lists: - unique_contigs = df_for_chromosome["Contig"].unique() # Unique contig pairs, within this chromosome - unique_positions = df_for_chromosome[["Start", "End"]].drop_duplicates().values.tolist() # Unique location pairs + unique_contigs = df_for_chromosome["contig"].unique() # Unique contig pairs, within this chromosome + + print("\n{} unique_contigs for chromosome {}".format(len(unique_contigs), chromosome)) + unique_positions = set(df_for_chromosome["position"].tolist()) # Unique location pairs + + print(f'unique_positions: {len(unique_positions)}, unique_contigs: {len(unique_contigs)}') + + positions_for_chromosome = len(unique_positions) * len(unique_contigs) + total_positions += positions_for_chromosome + print('\ttotal_positions: {}\n'.format(positions_for_chromosome)) # Open the output file and write combinations line by line with open(output_file, "a") as f: for contig in unique_contigs: - for start, end in unique_positions: - 
f.write(f"{contig}\t{start}\t{end}\n") - + for position in unique_positions: + f.write(f"{contig}\t{position-1}\t{position}\n") + print("Overall positions: {}\n\n".format(total_positions)) + print(f"Output written to {output_file}") diff --git a/tests/integration_tests.ipynb b/tests/integration_tests.ipynb index 162ff9f..6c9d709 100644 --- a/tests/integration_tests.ipynb +++ b/tests/integration_tests.ipynb @@ -2889,7 +2889,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "f5c3ed3e-13dd-4399-924e-3d1ac17ce387", "metadata": {}, "outputs": [ @@ -3012,12 +3012,11 @@ "\t >>> long_read_sc_test passed! <<<\n", "\n", "Checking that analyzing a single-cell dataset in 'bulk' mode (i.e. not specificying the 'CB' barcode) yields the exact same positions and base changes, but with counts and coverages aggregated rather than at a single-cell resolution\n", - "grouped_sc_rows: 62, bulk_rows: 26\n", - "Exception: \n", + "grouped_sc_rows: 62, bulk_rows: 62\n", "\n", - "\t ~~~ single cell vs bulk modes on sc dataset equivalency test FAILED! ~~~\n", + "\t >>> single-cell and bulk on same dataset comparison passed! <<<\n", "\n", - "There were 1 failures\n" + "There were 0 failures\n" ] } ], From 5a108114f4045d314f272d8eaf468bfa4a60cf9f Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Sat, 30 Nov 2024 18:13:53 -0800 Subject: [PATCH 24/26] adjustments for error catching --- marine.py | 25 +++++++++++----- src/utils.py | 53 +++++++++++++++++++--------------- tests/integration_tests_run.sh | 2 +- 3 files changed, 49 insertions(+), 31 deletions(-) diff --git a/marine.py b/marine.py index 3e85b43..a17b1a8 100755 --- a/marine.py +++ b/marine.py @@ -149,6 +149,15 @@ def split_bed_file(input_bed_file, output_folder, bam_filepaths, output_suffix=' """ Split a BED file into multiple files based on suffixes in the first column. Each line is assigned to the appropriate file based on the suffix. 
+ + e.g.: + + 10_AAACGAAAGTCACACT-1 6143263 6143264 + 10_AAACGAAAGTCACACT-1 11912575 11912576 + 10_AAACGAAAGTCACACT-1 12209751 12209752 + 10_AAACGAAAGTCACACT-1 13320235 13320236 + 10_AAACGAAAGTCACACT-1 27036085 27036086 + """ if not os.path.exists(output_folder): os.makedirs(output_folder) @@ -171,9 +180,10 @@ def split_bed_file(input_bed_file, output_folder, bam_filepaths, output_suffix=' for line in infile: # Parse the first column to determine the suffix columns = line.split() + chrom = columns[0] # Assuming the first column is the chromosome for prefix, suffix in suffix_pairs: - if chrom.startswith(prefix) and chrom.endswith(suffix): + if chrom.startswith(f"{prefix}_") and chrom.endswith(suffix): file_handles[prefix + suffix].write(line) break @@ -195,8 +205,9 @@ def generate_depths(output_folder, bam_filepaths, paired_end=False, barcode_tag= "awk 'NR > 1 {{print $2, $4-1, $4}}' OFS='\t' \"$file\"; " "done | sort -k1,1 -k2,2n -u > {}/combined_source_cells.bed;" ).format(output_folder, output_folder) - run_command(combine_edit_sites_command) + if not os.path.exists(f'{output_folder}/combined_source_cells.bed'): + run_command(combine_edit_sites_command) all_depth_commands.append(combine_edit_sites_command) @@ -447,11 +458,11 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand print("Deleting overall_label_to_list_of_contents...") del overall_label_to_list_of_contents - - with open('{}/manifest.txt'.format(logging_folder), 'a+') as f: - f.write(f'total_reads_processed\t{overall_total_reads_processed}\n') - for k, v in overall_counts_summary_df.items(): - f.write(f'{k}\t{v}\n') + if not coverage_only: + with open('{}/manifest.txt'.format(logging_folder), 'a+') as f: + f.write(f'total_reads_processed\t{overall_total_reads_processed}\n') + for k, v in overall_counts_summary_df.items(): + f.write(f'{k}\t{v}\n') f.write(f'edits per read (EPR)\t{overall_counts_summary_df.get("total_edits")/overall_total_reads_processed}\n') diff --git a/src/utils.py b/src/utils.py index 7347c14..d005bde 100644 --- a/src/utils.py +++ b/src/utils.py @@ -665,10 +665,10 @@ def generate_and_run_bash_merge(output_folder, file1_path, file2_path, output_fi bash_command = f"""#!/bin/bash # Step 1: Adjust the depths file by adding a new column that incorporates both contig and position # for join purposes, and sort by this column. Output in tab-separated format. 
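     # Note: join needs both inputs ordered under the same collation; version
     # sort (-V) keeps mixed contig:position keys such as
     # "10_AAACGAAAGTCACACT-1:6143264" consistently ordered in both files,
     # which a plain numeric sort (-n) cannot guarantee for such strings.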
- awk -v OFS='\\t' '{{print $1, $2+{position_adjustment}, $3, $1":"($2+{position_adjustment})}}' "{file2_path}" | sort -k4,4n | uniq > {output_folder}/depth_modified.tsv + awk -v OFS='\\t' '{{print $1, $2+{position_adjustment}, $3, $1":"($2+{position_adjustment})}}' "{file2_path}" | sort -k4,4V | uniq > {output_folder}/depth_modified.tsv # Step 2: Sort the first file numerically by the join column (the column incuding both contig and position information) - sort -k3,3n "{file1_path}" | uniq > {output_folder}/final_edit_info_no_coverage_sorted.tsv + sort -k3,3V "{file1_path}" | uniq > {output_folder}/final_edit_info_no_coverage_sorted.tsv # Step 3: Join the files on the specified columns, output all columns, and select necessary columns with tab separation join -1 3 -2 4 -t $'\\t' {output_folder}/final_edit_info_no_coverage_sorted.tsv {output_folder}/depth_modified.tsv | awk -v OFS='\\t' '{{print $2, $3, $4, $5, $6, $7, $8, $11}}' > "{output_file_path}" @@ -822,27 +822,34 @@ def calculate_coverage(bam_filepath, bed_filepath, output_filepath, output_matri chrom, start, end = fields[0], int(fields[1]), int(fields[2]) regions.append((chrom, start, end)) - print(f"\t{len(regions)} regions") - - with pysam.AlignmentFile(bam_filepath, "rb") as bam, open(depths_file, "w") as out: - for chrom, start, end in regions: - # Get coverage, applying base quality filter - coverage = bam.count_coverage( - chrom, start, end, quality_threshold=0 # Mimics -Q 13 in samtools depth - ) - - # Calculate total coverage for each position and include zero-coverage positions - for pos in range(start, end): - total_coverage = sum(base[pos - start] for base in coverage) - out.write(f"{chrom}\t{pos}\t{total_coverage}\n") - - - if output_matrix_folder: - matrix_output_file = os.path.join(output_matrix_folder, f"{os.path.basename(depths_file).split('.')[0]}_matrix.tsv") - - if not os.path.exists(matrix_output_file) or os.path.getsize(matrix_output_file) == 0: - # Pivot the depths output file into a matrix - pivot_depths_output(depths_file, matrix_output_file) + if len(regions) > 0: + print(f"\t{len(regions)} regions") + + with pysam.AlignmentFile(bam_filepath, "rb") as bam, open(depths_file, "w") as out: + try: + for chrom, start, end in regions: + # Get coverage, applying base quality filter + coverage = bam.count_coverage( + chrom, start, end, quality_threshold=0 # Mimics -Q 13 in samtools depth + ) + + # Calculate total coverage for each position and include zero-coverage positions + for pos in range(start, end): + total_coverage = sum(base[pos - start] for base in coverage) + out.write(f"{chrom}\t{pos}\t{total_coverage}\n") + except Exception as e: + # Return the exception and the arguments for debugging + raise RuntimeError( + f"Error processing bam {bam_filepath} using bed {bed_filepath} at {chrom}:{start}-{end}, " + f"outputting to {depths_file}: {e}" + ) + + if output_matrix_folder: + matrix_output_file = os.path.join(output_matrix_folder, f"{os.path.basename(depths_file).split('.')[0]}_matrix.tsv") + + if not os.path.exists(matrix_output_file) or os.path.getsize(matrix_output_file) == 0: + # Pivot the depths output file into a matrix + pivot_depths_output(depths_file, matrix_output_file) def run_pysam_count_coverage(args_list, processes): diff --git a/tests/integration_tests_run.sh b/tests/integration_tests_run.sh index f7ab6be..3d10906 100755 --- a/tests/integration_tests_run.sh +++ b/tests/integration_tests_run.sh @@ -34,7 +34,7 @@ echo "SC tests scripts" ls -lh $MARINE/tests/$tests_folder/scripts/ -for t in 
"only_5_cells_test" "only_5_cells_bulk_mode_test" "long_read_sc_test" "edge_case_test" "edge_case_dist_filter_test" +for t in "only_5_cells_test" "only_5_cells_bulk_mode_test" "only_5_cells_all_cells_coverage_test" "long_read_sc_test" "edge_case_test" "edge_case_dist_filter_test" do echo $t From 1683bb611324ec5c60105be0b386cdb0f8a1f14b Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Sat, 30 Nov 2024 18:29:50 -0800 Subject: [PATCH 25/26] fix --- marine.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/marine.py b/marine.py index a17b1a8..0891709 100755 --- a/marine.py +++ b/marine.py @@ -391,7 +391,8 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand f.write('min_read_quality\t{}\n'.format(min_read_quality)) f.write('min_dist_from_end\t{}\n'.format(min_dist_from_end)) f.write('skip_coverage\t{}\n'.format(skip_coverage)) - + + overall_total_reads_processed = 0 if not (coverage_only or filtering_only): if barcode_whitelist_file: barcode_whitelist = read_barcode_whitelist_file(barcode_whitelist_file) @@ -464,7 +465,7 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand for k, v in overall_counts_summary_df.items(): f.write(f'{k}\t{v}\n') - f.write(f'edits per read (EPR)\t{overall_counts_summary_df.get("total_edits")/overall_total_reads_processed}\n') + f.write(f'edits per read (EPR)\t{overall_counts_summary_df.get("total_edits")/overall_total_reads_processed}\n') if not filtering_only and not skip_coverage: # Coverage calculation From 9c663605a378125bb1fed1d67dd230bad461b388 Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Sun, 1 Dec 2024 21:59:59 -0800 Subject: [PATCH 26/26] efficient for all cells --- marine.py | 220 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 134 insertions(+), 86 deletions(-) diff --git a/marine.py b/marine.py index 0891709..a605b51 100755 --- a/marine.py +++ b/marine.py @@ -300,64 +300,115 @@ def generate_bedgraphs(final_site_level_information_df, conversion_search, outpu sites_for_conversion_bedgraph_cols.to_csv('{}/{}_{}.bedgraph'.format(bedgraph_folder, output_folder.split('/')[-1], conversion), sep='\t', index=False, header=False) -def generate_combined_sites_bed_for_all_cells(output_folder, strand_conversion="A>G"): - # go from: - # - # 9_GATCCCTCAGTAACGG-1 3000447 3000448 - # 9_CATCCCTCAGTAACGA-1 3000468 3000469 - # 2_CATCCCTCAGTAACGA-1 3000451 3000452 - # - # to: - # - # 9_GATCCCTCAGTAACGG-1 3000447 3000448 - # 9_GATCCCTCAGTAACGG-1 3000468 3000469 - # 9_CATCCCTCAGTAACGA-1 3000447 3000448 - # 9_CATCCCTCAGTAACGA-1 3000468 3000469 - # 2_CATCCCTCAGTAACGA-1 3000451 3000452 - - # Input and output files - input_file = f"{output_folder}/final_filtered_site_info.tsv" - output_file = f"{output_folder}/combined_all_cells.bed" +def prepare_combinations_for_split(df, bam_filepaths, output_folder, output_suffix): + """ + Prepares the chromosome-suffix combinations for multiprocessing. + For each edited position in a given barcode, we want to look at the coverage at that + position for that chromosome across all the other barcodes. + + Args: + df (pd.DataFrame): Filtered DataFrame containing edit data. + bam_filepaths (list): List of BAM filepaths to extract suffix pairs. + output_folder (str): Path to the output folder for split BED files. + output_suffix (str): Suffix for output files. + + Returns: + list: List of tuples for processing. 
+ """ + # Extract prefix and suffix from BAM filenames + suffix_pairs = [ + (os.path.basename(bam).split("_")[0], os.path.basename(bam).split("_")[1].split(".")[0]) + for bam in bam_filepaths + ] + print(f"suffix_pairs is {suffix_pairs}") - # Load the input data - df = pd.read_csv(input_file, sep="\t") - print(f"Found {len(df)} total edits...") + # Unique chromosomes in the dataset + unique_chromosomes = df['contig'].unique() + + # Prepare combinations of chromosomes and suffix pairs + combinations = [] + for chrom in unique_chromosomes: + print(f"\tChecking {chrom}...") + + chrom = str(chrom) + df['contig'] = df['contig'].astype(str) + df_for_chrom = df[df['contig'] == chrom] + unique_positions = df_for_chrom.position.unique() + + for prefix, suffix in suffix_pairs: + + if prefix == chrom: + print(f"\t\tGenerating for ({prefix},{suffix})") + df_for_prefix_suffix = df_for_chrom[df_for_chrom['barcode'].str.endswith(suffix)] + unique_barcodes = df_for_prefix_suffix.barcode.unique() + + combinations.append((chrom, prefix, suffix, unique_positions, unique_barcodes, output_folder, output_suffix)) + + print(f"\t\t\t{prefix}_{suffix}: Unique positions: {len(unique_positions)}, Unique barcodes: {len(unique_barcodes)}") + + return combinations + +def process_combination_for_split(args): + """ + Processes a single combination of chromosome, prefix, suffix, positions, and barcodes + to write split BED files. + Args: + args (tuple): Contains chromosome, prefix, suffix, positions, barcodes, + output folder, and output suffix. + """ + chrom, prefix, suffix, unique_positions, unique_barcodes, output_folder, output_suffix = args + # Output file path + output_file = os.path.join(output_folder, f"combined_{output_suffix}_{prefix}_{suffix}.bed") + + # Write combinations directly to the file + with open(output_file, "w") as f: + for position in unique_positions: + for barcode in unique_barcodes: + contig = f"{chrom}_{barcode}" # Construct contig using chromosome and barcode + f.write(f"{contig}\t{position-1}\t{position}\n") + + print(f"\t\t\t>>> Processed {chrom}, {prefix}_{suffix} -> {output_file}") + + +def generate_and_split_bed_files_for_all_edits(output_folder, bam_filepaths, strand_conversion, processes=4, output_suffix="all_cells"): + """ + Generates combined BED files for all edit sites and splits them into suffix-specific files. + + Args: + output_folder (str): Path to the output folder. + bam_filepaths (list): List of BAM filepaths for suffix extraction. + strand_conversion (str): Strand conversion type (e.g., 'A>G'). + processes (int): Number of multiprocessing workers. + output_suffix (str): Suffix for output files. + """ + input_file = f"{output_folder}/final_filtered_site_info.tsv" + df = pd.read_csv(input_file, sep="\t") + + # Filter by strand conversion df = df[df['strand_conversion'] == strand_conversion] print(f"Running all positions for {len(df)} {strand_conversion} edits") + + # Prepare combinations for multiprocessing + split_bed_folder = f"{output_folder}/combined_{output_suffix}_split_by_suffix" + os.makedirs(split_bed_folder, exist_ok=True) - df['chromosome'] = df['contig'].astype(str) - df['contig'] = df['contig'].astype(str) + '_' + df['barcode'].astype(str) + # Cleanup existing .bed files in the output folder + existing_bed_files = glob(os.path.join(split_bed_folder, "*.bed")) + if existing_bed_files: + print(f"Found {len(existing_bed_files)} existing .bed files. Removing...") + for file in existing_bed_files: + os.remove(file) + print("Existing .bed files removed. 
Starting fresh.") - unique_chromosomes = df['chromosome'].unique() - print("Unique chromosomes found: {}".format(unique_chromosomes)) + combinations = prepare_combinations_for_split(df, bam_filepaths, f"{output_folder}/combined_{output_suffix}_split_by_suffix", output_suffix) - total_positions = 0 - for chromosome in unique_chromosomes: - # Permute only within chromosome - - df_for_chromosome = df[df['chromosome'] == str(chromosome)] - # Create unique lists: - unique_contigs = df_for_chromosome["contig"].unique() # Unique contig pairs, within this chromosome + # Run the processing with multiprocessing + with Pool(processes=processes) as pool: + pool.map(process_combination_for_split, combinations) - print("\n{} unique_contigs for chromosome {}".format(len(unique_contigs), chromosome)) - - unique_positions = set(df_for_chromosome["position"].tolist()) # Unique location pairs - - print(f'unique_positions: {len(unique_positions)}, unique_contigs: {len(unique_contigs)}') - - positions_for_chromosome = len(unique_positions) * len(unique_contigs) - total_positions += positions_for_chromosome - print('\ttotal_positions: {}\n'.format(positions_for_chromosome)) - # Open the output file and write combinations line by line - with open(output_file, "a") as f: - for contig in unique_contigs: - for position in unique_positions: - f.write(f"{contig}\t{position-1}\t{position}\n") - print("Overall positions: {}\n\n".format(total_positions)) - - print(f"Output written to {output_file}") + print(f"All split BED files generated in {output_folder}/combined_{output_suffix}_split_by_suffix") @@ -392,8 +443,19 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand f.write('min_dist_from_end\t{}\n'.format(min_dist_from_end)) f.write('skip_coverage\t{}\n'.format(skip_coverage)) - overall_total_reads_processed = 0 - if not (coverage_only or filtering_only): + + # Check if filtering step finished + final_filtered_sites_path = '{}/final_filtered_site_info.tsv'.format(output_folder) + final_path_already_exists = False + final_annotated_path_already_exists = False + + if os.path.exists(final_filtered_sites_path): + print("{} exists... 
skipping edit finding.".format(final_filtered_sites_path)) + final_path_already_exists = True + + # Edit finding + if not (coverage_only or filtering_only) and not final_path_already_exists: + overall_total_reads_processed = 0 if barcode_whitelist_file: barcode_whitelist = read_barcode_whitelist_file(barcode_whitelist_file) else: @@ -459,7 +521,7 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand print("Deleting overall_label_to_list_of_contents...") del overall_label_to_list_of_contents - if not coverage_only: + with open('{}/manifest.txt'.format(logging_folder), 'a+') as f: f.write(f'total_reads_processed\t{overall_total_reads_processed}\n') for k, v in overall_counts_summary_df.items(): @@ -467,14 +529,14 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand f.write(f'edits per read (EPR)\t{overall_counts_summary_df.get("total_edits")/overall_total_reads_processed}\n') - if not filtering_only and not skip_coverage: + reconfigured_bam_filepaths = glob('{}/split_bams/*/*.bam'.format(output_folder)) + + if not final_path_already_exists and not skip_coverage: # Coverage calculation # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ pretty_print("Calculating coverage at edited sites, minimum read quality is {}...".format(min_read_quality), style='~') # We want to run the samtools depth command for each of the reconfigured bam files - reconfigured_bam_filepaths = glob('{}/split_bams/*/*.bam'.format(output_folder)) - print("Running samtools depth on {} subset bam paths...".format(len(reconfigured_bam_filepaths))) total_time, total_seconds_for_contig_df = generate_depths(output_folder, reconfigured_bam_filepaths, paired_end=paired_end, barcode_tag=barcode_tag) @@ -482,19 +544,6 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand pretty_print("Total time to calculate coverage: {} minutes".format(round(total_time/60, 3))) - # Check if filtering step finished - final_filtered_sites_path = '{}/final_filtered_site_info.tsv'.format(output_folder) - final_path_already_exists = False - final_annotated_path_already_exists = False - - if os.path.exists(final_filtered_sites_path): - print("{} exists...".format(final_filtered_sites_path)) - final_path_already_exists = True - - # Filtering steps - if not final_path_already_exists: - print("Filtering..") - all_edit_info_unique_position_with_coverage_df = get_edits_with_coverage_df(output_folder, barcode_tag=barcode_tag) @@ -543,7 +592,8 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand if len(final_site_level_information_df) == 0: output_zero_edit_files = zero_edit_found(final_site_level_information_df, output_folder, sailor_list, bedgraphs_list, keep_intermediate_files, start_time, logging_folder) return 'Done!' 
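 
     # From here on, each remaining section reads final_filtered_site_info.tsv,
     # so a re-run can resume at annotation, plotting, or all-cells coverage
     # without repeating edit finding (final_path_already_exists gates each).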
- + + # Annotation option if final_path_already_exists and annotation_bedfile_path: final_site_level_information_df = pd.read_csv('{}/final_filtered_site_info.tsv'.format(output_folder), sep='\t') @@ -553,12 +603,11 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand sep='\t', index=False) final_annotated_path_already_exists = True - + # Make plot of edit distributions if final_path_already_exists: final_site_level_information_df = pd.read_csv('{}/final_filtered_site_info.tsv'.format(output_folder), sep='\t') - # Make plot of edit distributions plot_folder = '{}/plots'.format(output_folder) make_folder(plot_folder) @@ -579,23 +628,22 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand print(f"Current memory usage {current/1e6}MB; Peak: {peak/1e6}MB") print(f'Time elapsed: {time.time()-start_time:.2f}s') - if all_cells_coverage: + if final_path_already_exists and all_cells_coverage: + output_suffix = "all_cells" print("Calculating coverage at all edit sites in all cells...") - # Make a permuted version of the combined.bed file, such that every combination of contig - # and start-end is generated (this might be a very large file depending on number of edits and cells) - generate_combined_sites_bed_for_all_cells(output_folder) - - # We want to run the samtools depth command for each of the reconfigured bam files - bam_filepaths = glob('{}/split_bams/*/*.bam'.format(output_folder)) - output_suffix = "all_cells" - if len(bam_filepaths) > 1: - split_bed_file( - f"{output_folder}/combined_{output_suffix}.bed", - f"{output_folder}/combined_{output_suffix}_split_by_suffix", - bam_filepaths, - output_suffix=output_suffix - ) + # Define the strand conversion type (e.g., 'A>G') + strand_conversion = "A>G" + + # Get the list of BAM file paths + bam_filepaths = glob(f"{output_folder}/split_bams/*/*.bam") + + # Generate and split BED files using multiprocessing + generate_and_split_bed_files_for_all_edits(output_folder, + bam_filepaths, + strand_conversion="A>G", + processes=cores, + output_suffix=output_suffix) make_depth_command_script_single_cell( paired_end,